# Compute code statistics history for NWB repositories

**This notebook requires:**

 * CLOC command-line tool must be installed
 * GitPython (pip install GitPython)
 * matplotlib, numpy, pandas, yaml
 
**This notebook computes:**

 * CLOC stats for the last commit on each day for all NWB code repositories
 * Plots the results for all repos
 
**NOTE:** Computing these statistics is done by checking out all repositories and then iterating over all commits in a repo, checking out the repo for the last commit on each day, and then computing CLOC. As such, computing these statistics is time consuming. The results can be cached to YAML for further processing and to save time when rerunning and editing the notebook.

**NOTE:** Results in the ``output_dir`` may be erased and/or modified any time the script is rerun. If results need to be preserved, then copy any relevant files before rerunning the notebook.

In [None]:
from nwb_project_analytics.codestats import GitCodeStats
from nwb_project_analytics.gitstats import NWBGitInfo, GitRepos

In [None]:
import os 
import shutil
import subprocess
import git
import time
import ruamel.yaml as yaml
import numpy as np
import matplotlib as mpl
from datetime import datetime
from matplotlib import pyplot as plt
from matplotlib import cm as cm
from collections import OrderedDict
import pandas as pd
%matplotlib inline

## 1. Define script settings

In this section we can update the main settings for the analyses in this notebook. Settings (e.g., color choices) specific to a plot appear with the corresponding plotting sections.

In [None]:
# Path to the CLOC executable. Prefer whatever `cloc` is found on PATH so the notebook
# is not tied to one machine; fall back to the Homebrew install location otherwise.
cloc_path = shutil.which("cloc") or "/opt/homebrew/bin/cloc"
data_dir = os.path.join(os.getcwd(), '../data')   # passed as cache_dir to GitCodeStats.from_nwb
plot_dir = os.path.join(os.getcwd(), 'plots')     # destination for saved figures
load_cached_results = True  # load the cloc results from yaml
cache_results = True  # save the cloc results to yaml
# NOTE(review): show_hackathons is not referenced by any cell visible in this notebook
show_hackathons = False  # Remove the hackathons from the plots
save_figs = True   # Save the plots to file
start_date = None    # use the default start date NWBGitInfo.NWB2_START_DATE
end_date = None      # use the default end date datetime.today()
# Daily time axis spanning the analysis period; used to index the per-repo plots
date_range = pd.date_range(
    start=NWBGitInfo.NWB2_START_DATE if start_date is None else start_date,
    end=datetime.today() if end_date is None else end_date,
    freq="D")
# savefig does not create missing directories, so make sure the plot folder exists
if save_figs:
    os.makedirs(plot_dir, exist_ok=True)

## 2. Compute the code statistics for all NWB repos

Compute the `GitCodeStats` with the statistics for all NWB repos. Based on those results also compile the summary of LOC statistics across repos by categories: `blank`, `comment`, `code`, `nFiles`, `size`. The summary statistics align and expand results from all repos as pandas.DataFrame tables so that we can conveniently plot them together. The summary statistics are defined with a continuous date range and expand the results from all repos to align with the common time axis. For dates where no new CLOC stats are recorded for a repo, the statistics from the previous time are carried forward to fill in the gaps.

In [None]:
# Collect the arguments for GitCodeStats.from_nwb in one place so they are easy to inspect
from_nwb_kwargs = dict(
    cache_dir=data_dir,
    cloc_path=cloc_path,
    start_date=start_date,            # None selects the default start date NWBGitInfo.NWB2_START_DATE
    end_date=end_date,                # None selects the default end date datetime.today()
    read_cache=load_cached_results,   # reuse cached results when they are available
    write_cache=cache_results,        # store newly computed results in cache_dir
)
# The call returns four objects that the remaining cells of the notebook rely on
(git_code_stats,
 summary_stats,
 per_repo_lang_stats,
 languages_used_all) = GitCodeStats.from_nwb(**from_nwb_kwargs)

In [None]:
# Show the repository names that are available as keys in the summary statistics
repository_keys = summary_stats['codes'].keys().values
print("Repository Keys:")
print(repository_keys)

## 3. Plot summary of the lines of code across all NWB repos


In [None]:
# Grouping of the repos into display categories. The order of the categories and of the
# repos within each category is preserved and sets the column order used by the plots.
summary_plot_repos_grouped = OrderedDict([
    ('NWB APIs', ['PyNWB', 'MatNWB', 'AqNWB']),
    ('Data Modeling', ['HDMF', 'HDMF_Zarr', 'HDMF_Schema_Language', 'NWB_Schema_Language']),
    ('Extension Tools', ['NDX_Catalog', 'NDX_Template', 'NDX_Extension_Smithy',
                         'NDX_Staged_Extensions', 'HDMF_DocUtils']),
    ('Format Schema', ['NWB_Schema', 'HDMF_Common_Schema']),
    ('Data Conversion', ['NeuroConv', 'NWBInspector', 'NWB_GUIDE']),
    ('Cloud', ['LINDI', 'NWB_Benchmarks', 'NWBWidgets']),
    ('Online Resources', ['NWB_Overview', 'NWB_Project_Analytics', 'Hackathons']),
])

In [None]:
# Flatten the grouped repos into a single list, keeping the category order
summary_plot_repos = []
for repos_in_category in summary_plot_repos_grouped.values():
    summary_plot_repos.extend(repos_in_category)

In [None]:
# RGBA base color for each repo category (keys match summary_plot_repos_grouped)
base_colors = dict([
    ('NWB APIs', (0.121, 0.466, 0.705, 1.0)),          # Blue
    ('Data Modeling', (1.000, 0.843, 0.000, 1.0)),     # Gold/Yellow
    ('Data Conversion', (0.200, 0.627, 0.172, 1.0)),   # Green
    ('Extension Tools', (1.000, 0.498, 0.054, 1.0)),   # Orange
    ('Format Schema', (0.839, 0.153, 0.157, 1.0)),     # Red
    ('Cloud', (0.580, 0.404, 0.741, 1.0)),             # Purple
    ('Online Resources', (0.549, 0.337, 0.294, 1.0)),  # Brown
])

### 3.1. Version 1: Using default colors for repos

In [None]:
# Sample the tab20 colormap at evenly spaced positions, one position per repo
# (cm.Paired is an alternative colormap that can be swapped in here)
evenly_spaced_interval = np.linspace(0, 1, len(summary_plot_repos))
colors = list(map(cm.tab20, evenly_spaced_interval))
# Put the even-indexed colors first and the odd-indexed colors after them so that
# neighbouring areas in the stacked plot get more dissimilar colors
colors = colors[0::2] + colors[1::2]

In [None]:
# Stacked area chart of the repo sizes over time with one colored band per repo
ax = summary_stats['sizes'][summary_plot_repos].plot.area(
    figsize=(18, 10), stacked=True, linewidth=0, fontsize=24, color=colors)
# Render the y-axis tick labels as integers with thousands separators
thousands_formatter = mpl.ticker.FuncFormatter(lambda value, pos: format(int(value), ','))
ax.yaxis.set_major_formatter(thousands_formatter)
ax.legend(loc=2, prop={'size': 16})
ax.set_ylabel('Lines of Code', fontsize=24)
ax.set_xlabel('Date', fontsize=24)
ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
ax.figure.tight_layout()
# The files are written before the title is set, so the saved figures carry no title
if save_figs:
    ax.figure.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all.pdf'))
    ax.figure.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all.png'), dpi=300)
ax.set_title('NWB Code Repository Sizes', fontsize=20)
plt.show()

### 3.2 Group by color type but keep repos separate

In [None]:
## Generate colors to visually group all repos by color and distinguish repos within
## each category based on their alpha value
# Function to generate colors with varying alpha values
def generate_colors(base_color, num_colors):
    """
    Create shades of one color that differ only in their alpha value.

    :param base_color: RGBA tuple; only the RGB part is used, its alpha is ignored
    :param num_colors: Number of shades to create

    :returns: List of ``num_colors`` RGBA tuples. The first shade has alpha 1.0 and
              the alpha decreases in equal steps, never dropping below 0.3.
    """
    red, green, blue, _ = base_color
    # Spread the alpha values over a range of 0.7; a single color needs no real step
    if num_colors > 1:
        alpha_step = 0.7 / (num_colors - 1)
    else:
        alpha_step = 0.7
    shades = []
    for index in range(num_colors):
        alpha = 1.0 - index * alpha_step
        # Guard against rounding pushing the last alpha slightly below the minimum
        if not alpha > 0.3:
            alpha = 0.3
        shades.append((red, green, blue, alpha))
    return shades

# Build one flat list of colors: for every category (in grouping order) take the
# category's base color and create one alpha shade per repo in that category
colors = [
    shade
    for category_name, category_repos in summary_plot_repos_grouped.items()
    for shade in generate_colors(base_colors[category_name], len(category_repos))
]

In [None]:
# Stacked area chart of the repo sizes; repos of the same category share a base color
ax = summary_stats['sizes'][summary_plot_repos].plot.area(
    figsize=(18, 10), stacked=True, linewidth=0, fontsize=24, color=colors)
# Render the y-axis tick labels as integers with thousands separators
thousands_formatter = mpl.ticker.FuncFormatter(lambda value, pos: format(int(value), ','))
ax.yaxis.set_major_formatter(thousands_formatter)
# Drop the legend created by pandas; an opaque one is added after the layout is fixed
ax.get_legend().remove()
ax.set_ylabel('Lines of Code', fontsize=24)
ax.set_xlabel('Date', fontsize=24)
ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
ax.figure.tight_layout()
ax.legend(loc=2, prop={'size': 16,}, facecolor=(1.0, 1.0, 1.0, 1.0), framealpha=1.0)
# The files are written before the title is set, so the saved figures carry no title
if save_figs:
    ax.figure.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all_grouped.pdf'))
    ax.figure.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all_grouped.png'), dpi=300)
ax.set_title('NWB Code Repository Sizes', fontsize=20)
plt.show()

### 3.3 Combine repos into broad categories

In [None]:
# Create DataFrame with the total lines of code for each category (instead of for each repo)
repo_sizes_grouped = OrderedDict()
for category, repos in summary_plot_repos_grouped.items():
    # Sum the repo columns into a NEW Series. Accumulating with `+=` on the Series
    # returned by summary_stats['sizes'][repo] can update that column in place (on pandas
    # without Copy-on-Write), which would overwrite the first repo of each category with
    # the category total and make this cell non-idempotent.
    # skipna=False keeps the NaN propagation of a plain `+` between the columns.
    repo_sizes_grouped[category] = summary_stats['sizes'][repos].sum(axis=1, skipna=False)
repo_sizes_grouped_df = pd.DataFrame.from_dict(repo_sizes_grouped)

In [None]:
# Look up the base color of each category by column name (alpha set to 0.8). Taking
# base_colors.values() positionally would assign the wrong colors, because base_colors
# lists its categories in a different order than the columns of repo_sizes_grouped_df.
colors = [tuple(base_colors[category_name][:3]) + (0.8,)
          for category_name in repo_sizes_grouped_df.columns]

# Stacked area chart with one band per category
ax = repo_sizes_grouped_df.plot.area(
    figsize=(18,10), 
    stacked=True, 
    linewidth=0,
    fontsize=24,
    color=colors)
# Render the y-axis tick labels as integers with thousands separators
ax.get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# Drop the legend created by pandas; an opaque one is added after the layout is fixed
ax.get_legend().remove()
plt.ylabel('Lines of Code', fontsize=24)
plt.xlabel('Date', fontsize=24)
plt.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
plt.tight_layout()
plt.legend(loc=2, prop={'size': 24,}, facecolor=(1.0, 1.0, 1.0, 1.0), framealpha=1.0)
# The files are written before the title is set, so the saved figures carry no title
if save_figs:
    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_grouped.pdf'))
    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_grouped.png'), dpi=300)

plt.title('NWB Code Repository Sizes', fontsize=20)
plt.show()

## 4. Plot per-repo stats
### 4.1 Plot total lines of code statistics broken down by: code, blank, comment

In [None]:
# One stacked area chart per repo showing its code, blank, and comment line counts
for repo_name in summary_stats['codes'].keys():
    loc_by_kind = pd.DataFrame.from_dict({
        'code': summary_stats['codes'][repo_name],
        'blank': summary_stats['blanks'][repo_name],
        'comment': summary_stats['comments'][repo_name],
    })
    # Use the notebook's daily date range as the time axis
    # NOTE(review): this assumes the stats have exactly len(date_range) rows - confirm
    loc_by_kind.index = date_range
    loc_axes = loc_by_kind.plot.area(figsize=(18, 10), stacked=True, linewidth=0, fontsize=16)
    loc_axes.legend(loc=2, prop={'size': 16})
    loc_axes.set_ylabel('Lines of Code (CLOC)', fontsize=16)
    loc_axes.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
    loc_axes.set_title("Lines of Code: %s" % repo_name, fontsize=20)
    loc_axes.figure.tight_layout()
    if save_figs:
        loc_axes.figure.savefig(os.path.join(plot_dir, '%s_loc.pdf' % repo_name))
    plt.show()

### 4.2 Per-repo total lines of code statistics broken down by language type

In [None]:
# Assign every language a fixed color from the jet colormap (cm.tab20 is an alternative)
# so that a language is drawn in the same color in every repo's plot
evenly_spaced_interval = np.linspace(0, 1, len(languages_used_all))
language_colors = {}
for lang_index, colormap_position in enumerate(evenly_spaced_interval):
    language_colors[languages_used_all[lang_index]] = cm.jet(colormap_position)

# One stacked area chart per repo showing its lines of code per language
for repo_name, lang_stats in per_repo_lang_stats.items():
    lang_axes = lang_stats.plot.area(
        figsize=(18, 10),
        stacked=True,
        linewidth=0,
        fontsize=16,
        color=[language_colors[lang] for lang in lang_stats.columns])
    lang_axes.legend(loc=2, prop={'size': 16})
    lang_axes.set_ylabel('Lines of Code (CLOC)', fontsize=16)
    lang_axes.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
    lang_axes.set_title("Lines of Code: %s" % repo_name, fontsize=20)
    lang_axes.figure.tight_layout()
    if save_figs:
        lang_axes.figure.savefig(os.path.join(plot_dir, '%s_language_loc.pdf' % repo_name))
    plt.show()