# Compute code statistics history for NWB repositories

**This notebook requires:**

 * CLOC command-line tool must be installed
 * GitPython (pip install GitPython)
 * matplotlib, numpy, pandas, ruamel.yaml
 
**This notebook computes:**

 * CLOC stats for the last commit on each day for all NWB code repositories
 * Plots the results for all repos
 
**NOTE:** Computing these statistics is done by checking out all repositories and then iterating over all commits in a repo, checking out the repo for the last commit on each day, and then computing CLOC. As such, computing these statistics is time-consuming. The results can be cached to YAML for further processing and to save time when rerunning and editing the notebook. 

**NOTE:** Results in the ``output_dir`` may be erased and/or modified any time the script is rerun. If results need to be preserved, then copy any relevant files before rerunning the notebook. 

In [None]:
from nwb_project_analytics.codestats import GitCodeStats
from nwb_project_analytics.gitstats import NWBGitInfo, GitRepos

In [None]:
import os 
import shutil
import subprocess
import git
import time
import ruamel.yaml as yaml
import numpy as np
import matplotlib as mpl
from datetime import datetime
from matplotlib import pyplot as plt
from matplotlib import cm as cm
import pandas as pd
%matplotlib inline

## 1. Define script settings

In this section we can update the main settings for the analyses in this notebook. Settings (e.g., color choices) specific to a plot appear with the corresponding plotting sections.

In [None]:
# Path to the CLOC executable. Prefer whichever ``cloc`` is found on the PATH so that the
# notebook is portable across machines; otherwise fall back to the Homebrew location.
cloc_path = shutil.which("cloc") or "/opt/homebrew/bin/cloc"
data_dir = os.path.join(os.getcwd(), 'data')    # passed as cache_dir when computing the code statistics
plot_dir = os.path.join(os.getcwd(), 'plots')   # figures are written here when save_figs is True
load_cached_results = True  # load the cloc results from yaml
cache_results = True  # save the cloc results to yaml
show_hackathons = False # Remove the hackathons from the plots (set to True to show them)
save_figs = True   # Save the plots to file
start_date=None    # use the default start date NWBGitInfo.NWB2_START_DATE
end_date=None      # use the default end date datetime.today() 
# Daily time axis covering the analysis period
date_range = pd.date_range(
            start=NWBGitInfo.NWB2_START_DATE if start_date is None else start_date,
            end=datetime.today() if end_date is None else end_date,
            freq="D")

# Make sure the output directory for the figures exists; otherwise the
# savefig calls in the plotting cells fail on a fresh checkout.
if save_figs:
    os.makedirs(plot_dir, exist_ok=True)

# Select the repos and their order for the summary plot with the lines of code
summary_plot_repos = [
     'PyNWB', 'HDMF', 'MatNWB',
     'NWB_Schema_Language', 'NWB_Schema', 
     'HDMF_Common_Schema', 'HDMF_DocUtils', 'HDMF_Zarr',
     'NDX_Catalog', 'NDX_Template', 'NDX_Staged_Extensions', 'NDX_Extension_Smithy',
     'NWBWidgets', 'NWBInspector',
     'NeuroConv']

## 2. Compute the code statistics for all NWB repos

Compute the `GitCodeStats` with the statistics for all NWB repos. Based on those results also compile the summary of LOC statistics across repos by categories: `blank`, `comment`, `code`, `nFiles`, `size`. The summary statistics align and expand results from all repos as `pandas.DataFrame` tables so that we can conveniently plot them together. The summary statistics are defined with a continuous date range and expand the results from all repos to align with the common time axis. For dates where no new CLOC stats are recorded for a repo, the statistics from the previous time are carried forward to fill in the gaps.

In [None]:
# Obtain the code statistics object for all NWB repos together with the
# summary tables that the plotting cells below read from.
git_code_stats, summary_stats = GitCodeStats.from_nwb(
    # where to find the tool and the cache
    cloc_path=cloc_path,
    cache_dir=data_dir,
    # caching behaviour
    read_cache=load_cached_results,   # reuse cached results when they are available
    write_cache=cache_results,        # store newly computed results in cache_dir
    # analysis period
    start_date=start_date,            # None selects the default NWBGitInfo.NWB2_START_DATE
    end_date=end_date,                # None selects the default datetime.today()
)

## 3. Plot summary of the lines of code across all NWB repos
### 3.1. Plot version 1: Using default colors for repos

In [None]:
# Sample one colour per repository from the Paired colormap
evenly_spaced_interval = np.linspace(0, 1, len(summary_plot_repos))
colors = [cm.Paired(position) for position in evenly_spaced_interval]
# Put the even-indexed colours first and the odd-indexed ones after them so that
# neighbouring areas end up with more dissimilar colours
colors = colors[0::2] + colors[1::2]

# Stacked area chart of the size of every selected repo over time
ax = summary_stats['sizes'][summary_plot_repos].plot.area(
    stacked=True,
    linewidth=0,
    color=colors,
    fontsize=24,
    figsize=(18,10))
# Show the y tick labels as integers with thousands separators
thousands_formatter = mpl.ticker.FuncFormatter(
    lambda value, _pos: format(int(value), ','))
ax.yaxis.set_major_formatter(thousands_formatter)
ax.legend(loc=2, prop={'size': 20})
ax.set_ylabel('Lines of Code', fontsize=24)
ax.set_xlabel('Date', fontsize=24)
ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
ax.figure.tight_layout()
# The figure is written to file before the title is added, so the saved PDF has no title
if save_figs:
    ax.figure.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all.pdf'))
ax.set_title('NWB Code Repository Sizes', fontsize=20)
plt.show()

### 3.2. Plot grouped summary of the lines of code across all NWB repos

For the paper we want to group the tools by category to make the plot easier to read.

In [None]:
# Order the repos so that related repos are stacked next to each other
summary_plot_repos_accum = [
     # schema repos
     'NWB_Schema', 'HDMF_Common_Schema', 'NWB_Schema_Language',
     # core APIs
     'PyNWB',
     'HDMF',
     'MatNWB',
     # tools
     'HDMF_DocUtils', 'NWBWidgets', 'NWBInspector',
     'HDMF_Zarr', 'NeuroConv',
     # NDX repos
     'NDX_Catalog', 'NDX_Template', 'NDX_Staged_Extensions', 'NDX_Extension_Smithy']
# One RGBA colour per repo (same order as above); repos listed together share a
# base colour and are told apart by their alpha value
colors = [
    (0.7, 0.0, 0.0, 1.0), (0.7, 0.0, 0.0, 0.6), (0.7, 0.0, 0.0, 0.4),
    (0.0, 0.5, 0.6, 1.0),
    (0.0, 0.75, 0.85, 1.0),
    (0.4, 1.0, 1.0 ,1.0),
    (0.8, 0.4, 0.0, 1.0), (0.8, 0.4, 0.0, 0.7), (0.8, 0.4, 0.0, 0.5),
    (0.8, 0.8, 0.2, 1.0), (0.8, 0.8, 0.2, 0.5),
    (0.0, 0.0, 0.7, 1.0), (0.0, 0.0, 0.7, 0.85), (0.0, 0.0, 0.7, 0.7), (0.0, 0.0, 0.7, 0.55),
]

# Stacked area chart of the repo sizes in the grouped order
ax = summary_stats['sizes'][summary_plot_repos_accum].plot.area(
    stacked=True,
    linewidth=0,
    color=colors,
    fontsize=24,
    figsize=(18,10))
# Show the y tick labels as integers with thousands separators
thousands_formatter = mpl.ticker.FuncFormatter(
    lambda value, _pos: format(int(value), ','))
ax.yaxis.set_major_formatter(thousands_formatter)
# Drop the legend created by pandas; an opaque one is added after the layout step
ax.get_legend().remove()
ax.set_ylabel('Lines of Code', fontsize=24)
ax.set_xlabel('Date', fontsize=24)
ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
ax.figure.tight_layout()
ax.legend(loc=2, prop={'size': 20,}, facecolor=(1.0, 1.0, 1.0, 1.0), framealpha=1.0)
# The figure is written to file before the title is added, so the saved PDF has no title
if save_figs:
    ax.figure.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all_grouped.pdf'))
ax.set_title('NWB Code Repository Sizes', fontsize=20)
plt.show()

In [None]:
# Repos that are added together to form each column of the grouped table
sizes = summary_stats['sizes']
group_members = {
    'Format Schema': ['NWB_Schema', 'HDMF_Common_Schema', 'NWB_Schema_Language'],
    'HDMF': ['HDMF'],
    'PyNWB': ['PyNWB'],
    'MatNWB': ['MatNWB'],
    'NWB Tools': ['HDMF_DocUtils', 'NWBWidgets', 'NWBInspector', 'HDMF_Zarr', 'NeuroConv'],
    'NDX Catalog': ['NDX_Catalog', 'NDX_Template', 'NDX_Staged_Extensions', 'NDX_Extension_Smithy'],
}
# Add up the member columns from left to right, starting from the first member
repo_sizes_grouped_df = pd.DataFrame.from_dict({
    group: sum((sizes[repo] for repo in members[1:]), sizes[members[0]])
    for group, members in group_members.items()
})

# One colour per group, given as 0-255 RGB and converted to opaque RGBA in the 0-1 range
rgb_255 = [(78, 92, 150), (81, 133, 189), (155, 187, 89), (115, 147, 49), (191, 80, 77), (207, 130, 58)]
colors = [(red/255.0, green/255.0, blue/255.0, 1.0) for red, green, blue in rgb_255]

# Stacked area chart of the grouped sizes
ax = repo_sizes_grouped_df.plot.area(
    stacked=True,
    linewidth=0,
    color=colors,
    fontsize=24,
    figsize=(18,10))
# Show the y tick labels as integers with thousands separators
thousands_formatter = mpl.ticker.FuncFormatter(
    lambda value, _pos: format(int(value), ','))
ax.yaxis.set_major_formatter(thousands_formatter)
# Drop the legend created by pandas; an opaque one is added after the layout step
ax.get_legend().remove()
ax.set_ylabel('Lines of Code', fontsize=24)
ax.set_xlabel('Date', fontsize=24)
ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
ax.figure.tight_layout()
ax.legend(loc=2, prop={'size': 24,}, facecolor=(1.0, 1.0, 1.0, 1.0), framealpha=1.0)
# The figure is written to file before the title is added, so the saved PDF has no title
if save_figs:
    ax.figure.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_grouped.pdf'))

ax.set_title('NWB Code Repository Sizes', fontsize=20)
plt.show()

## 4. Plot per-repo total lines of code statistics broken down by: code, blank, comment

In [None]:
# One stacked area chart per repo showing its code, blank, and comment line counts
for repo_name in summary_stats['codes'].keys():
    loc_breakdown = {
        'code': summary_stats['codes'][repo_name],
        'blank': summary_stats['blanks'][repo_name],
        'comment': summary_stats['comments'][repo_name],
    }
    repo_loc_df = pd.DataFrame.from_dict(loc_breakdown)
    # Use the daily date range from the settings cell as the time axis
    repo_loc_df.index = date_range
    repo_ax = repo_loc_df.plot.area(
        stacked=True,
        linewidth=0,
        fontsize=16,
        figsize=(18,10))
    repo_ax.legend(loc=2, prop={'size': 16})
    repo_ax.set_ylabel('Lines of Code (CLOC)', fontsize=16)
    repo_ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
    repo_ax.set_title("Lines of Code: %s" % repo_name, fontsize=20)
    repo_ax.figure.tight_layout()
    if save_figs:
        repo_ax.figure.savefig(os.path.join(plot_dir, '%s_loc.pdf' % repo_name))
    plt.show()

## 4. Per-repo total lines of code statistics broken down by language type
### 4.1. Compute the per-repo language statistics

In [None]:
# Iterate through all repos 
ignore_lang = ['SUM', 'header']
languages_used_all = git_code_stats.get_languages_used(ignore_lang)
per_repo_lang_stats = git_code_stats.compute_language_stats(ignore_lang)

### 4.2. Plot the per-repo total lines of code statistics broken down by language type

In [None]:
# Assign every language its own colour from the jet colormap so that a language
# is drawn in the same colour in every plot
evenly_spaced_interval = np.linspace(0, 1, len(languages_used_all))
language_colors = {
    languages_used_all[lang_index]: cm.jet(position)
    for lang_index, position in enumerate(evenly_spaced_interval)
}
# One stacked area chart per repo with the lines of code split by language
for repo_name, lang_stats_df in per_repo_lang_stats.items():
    repo_colors = [language_colors[lang] for lang in lang_stats_df.columns]
    lang_ax = lang_stats_df.plot.area(
        stacked=True,
        linewidth=0,
        color=repo_colors,
        fontsize=16,
        figsize=(18,10),
    )
    lang_ax.legend(loc=2, prop={'size': 16})
    lang_ax.set_ylabel('Lines of Code (CLOC)', fontsize=16)
    lang_ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
    lang_ax.set_title("Lines of Code: %s" % repo_name, fontsize=20)
    lang_ax.figure.tight_layout()
    if save_figs:
        lang_ax.figure.savefig(os.path.join(plot_dir, '%s_language_loc.pdf' % repo_name))
    plt.show()