# Compute code statistics history for NWB repositories

**This notebook requires:**

 * CLOC command-line tool must be installed
 * GitPython (pip install GitPython)
 * matplotlib, numpy, pandas, yaml
 
**This notebook computes:**

 * CLOC stats for the last commit on each day for all NWB code repositories
 * Plots the results for all repos
 
**NOTE:** These statistics are computed by cloning all repositories and then, for each repo, iterating over all commits, checking out the last commit on each day, and running CLOC on it. As such, computing these statistics is time consuming. The results can be cached to YAML for further processing and to save time when rerunning and editing the notebook. 

**NOTE:** Results in the ``output_dir`` may be erased and/or modified any time the script is rerun. If results need to be preserved, then copy any relevant files before rerunning the notebook. 

In [None]:
import os 
import shutil
import subprocess
import git
import time
import yaml
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import cm as cm
import pandas as pd
%matplotlib inline

## 1. Define script settings

In this section we can update the main settings for the analyses in this notebook. Settings (e.g., color choices) specific to a plot appear with the corresponding plotting sections.

In [None]:
# Path of the CLOC executable. The CLOC_PATH environment variable takes precedence. Otherwise the
# original hard-coded location is used and, only if that path does not exist, a ``cloc`` found
# on the PATH is used instead.
cloc_path = os.environ.get('CLOC_PATH') or "/Users/oruebel/Devel/Libraries/cloc/cloc"
if not os.path.exists(cloc_path):
    cloc_path = shutil.which('cloc') or cloc_path
# Name of each repo (used as dict key, clone directory name, and plot label) and the git URL to clone
git_paths = {'PyNWB': "https://github.com/NeurodataWithoutBorders/pynwb.git", 
             'MatNWB': "https://github.com/NeurodataWithoutBorders/matnwb.git",
             'NWBWidgets': "https://github.com/NeurodataWithoutBorders/nwb-jupyter-widgets.git",
             'NWBInspector': "https://github.com/NeurodataWithoutBorders/nwbinspector.git",
             'Hackathons': "https://github.com/NeurodataWithoutBorders/nwb_hackathons.git",
             'NWB_Schema': "https://github.com/NeurodataWithoutBorders/nwb-schema.git",
             'NWB_Schema_Language': "https://github.com/NeurodataWithoutBorders/nwb-schema-language.git",
             'HDMF': 'https://github.com/hdmf-dev/hdmf.git',
             'HDMF_Common_Schema': 'https://github.com/hdmf-dev/hdmf-common-schema.git',
             'HDMF_DocUtils': 'https://github.com/hdmf-dev/hdmf-docutils.git',
             # 'HDMF Schema Language' : https://github.com/hdmf-dev/hdmf-schema-language
             'NDX_Template': 'https://github.com/nwb-extensions/ndx-template.git',
             'NDX_Staged_Extensions': 'https://github.com/nwb-extensions/staged-extensions.git',
             #'NDX Webservices': 'https://github.com/nwb-extensions/nwb-extensions-webservices.git',
             'NDX_Catalog': 'https://github.com/nwb-extensions/nwb-extensions.github.io.git',
             'NDX_Extension_Smithy': 'https://github.com/nwb-extensions/nwb-extensions-smithy',
             'NWB_1.x_Matlab': 'https://github.com/NeurodataWithoutBorders/api-matlab.git',
             'NWB_1.x_Python': 'https://github.com/NeurodataWithoutBorders/api-python.git'
            }
# Directory for all results. NOTE: this directory is deleted and recreated whenever the stats are recomputed
output_dir = os.path.join(os.getcwd(), 'temp_cloc_stats')
# Directory where the git repos are cloned to
source_dir = os.path.join(output_dir, 'src')
# The cache files live in output_dir so that changing output_dir moves the caches along with it
cache_file_cloc = os.path.join(output_dir, 'cloc_stats.yaml')
cache_file_commits = os.path.join(output_dir, 'commit_stats.yaml')
load_cached_results = True  # load the cloc results from yaml
cache_results = True  # save the cloc results to yaml
show_NWB1 = False  # If True, start the common date range at the first NWB_1.x_Matlab entry instead of PyNWB
show_hackathons = False # Remove the hackathons from the plots
save_figs = True   # Save the plots to file
# Set all values before this date to 0 for HDMF. 2019-03-13 coincides with the removal of HDMF from PyNWB with PR #850
# and the release of HDMF 1.0. For the plotting 2019-03-13 is therefore a good date to start considering HDMF
# stats to avoid duplication of code in statistics, even though the HDMF repo existed on GitHub already since
# 2019-01-23T23:48:27Z, which could be alternatively considered as the start date. Older dates will include
# code history carried over from PyNWB to HDMF. Set to None to consider the full history of HDMF but as mentioned,
# this will lead to some duplicate counting of code before 2019-03-13
hdmf_start_date = '2019-03-13'  
# Date when to declare the NWB 1.0 APIs as deprecated. The 3rd Hackathon was held on July 31 to August 1, 2017 at
# Janelia Farm, in Ashburn, Virginia, which marks the date when NWB 2.0 was officially accepted as the 
# follow-up to NWB 1.0. NWB 1.0 as a project ended about 1 year before that.
nwb1_depration_date = '2016-08-01'
# NDX_Extension_Smithy is a fork with changes. We therefore should count only the sizes after the fork date
# which based on https://api.github.com/repos/nwb-extensions/nwb-extensions-smithy is "2019-04-25T20:56:02Z",
extension_smithy_start_date = '2019-04-25'
# Select the repos and their order for the summary plot with the lines of code
summary_plot_repos = [
     'PyNWB', 'HDMF', 'MatNWB',
     'NWB_Schema_Language', 'NWB_Schema', 
     'HDMF_Common_Schema', 'HDMF_DocUtils',
     'NDX_Catalog', 'NDX_Template', 'NDX_Staged_Extensions', 'NDX_Extension_Smithy',
     'NWBWidgets', 'NWBInspector']

## 2. Functions used to interact with git and compute CLOC stats

In [None]:
def clean_outdirs(output_dir, source_dir):
    """
    Delete the output directory and all its contents and create new, empty directories.

    :param output_dir: Directory for all results. If it already exists, it is removed
                       together with everything in it before being recreated.
    :param source_dir: Directory for the git sources. Missing parent directories
                       are created as needed.

    :returns: A tuple of two strings with the output_dir and source_dir for git sources

    :raises FileExistsError: If source_dir still exists after output_dir has been recreated
                             (e.g., because it is located outside of output_dir)
    """
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs (rather than mkdir) so that missing parents and nested source dirs work
    os.makedirs(output_dir)
    os.makedirs(source_dir)
    return output_dir, source_dir

def clone_repos(repos, source_dir):
    """
    Create a local clone for every repository listed in ``repos``.

    :param repos: Dict mapping the name of each repo to the git source path to clone
    :param source_dir: Directory that receives the clones. Each repo is cloned into
                  the subdirectory of source_dir named after its key in ``repos``.
    :returns: Dict with the same keys as ``repos`` where each value is the object
              returned by ``git.Repo.clone_from`` for the corresponding repository.
    """
    cloned = {}
    for repo_name in repos:
        print("Cloning: %s" % repo_name)
        target_dir = os.path.join(source_dir, repo_name)
        cloned[repo_name] = git.Repo.clone_from(repos[repo_name], target_dir)
    return cloned

def run_cloc(cloc_path, src_dir, out_file):
    """
    Run CLOC on the given src_dir, save the report to out_file, and return the parsed
    results.

    :param cloc_path: Path (or name on the PATH) of the CLOC executable. A leading ``~`` is expanded.
    :param src_dir: Directory with the sources to be analyzed
    :param out_file: Path of the YAML report file that CLOC is asked to write

    :returns: Content of the YAML report as parsed by ``yaml.load``

    :raises subprocess.CalledProcessError: If CLOC exits with a non-zero status
    """
    # Pass the arguments as a list and without a shell so that paths that contain spaces or
    # shell metacharacters reach CLOC unchanged. Since no shell is involved we expand ``~`` ourselves.
    command = [os.path.expanduser(cloc_path), "--yaml", "--report-file=%s" % out_file, src_dir]
    # check=True reports a failed CLOC run right away instead of failing later on a missing report
    subprocess.run(command, check=True)
    with open(out_file) as f:
        # NOTE(review): the report is written by CLOC itself; yaml.safe_load is presumably
        # sufficient here and would be the safer choice - confirm before switching
        res = yaml.load(f, Loader=yaml.FullLoader)
    return res

def git_repo_stats(repo, cloc_path, output_dir):
    """
    Collect metadata for all commits of a repo and compute CLOC statistics for
    the first commit of each day that is encountered while walking the commits
    in the order returned by ``repo.iter_commits()``.

    NOTE: Every commit for which CLOC is run is checked out in the working tree of the repo.

    :param repo: The git repository to process
    :type repo: git.repo.base.Repo
    :param cloc_path: Path of the CLOC executable, passed on to run_cloc
    :param output_dir: Directory for the temporary CLOC report file, which is
                       removed again after each CLOC run

    :returns: Tuple ``(commit_stats, cloc_stats)``. ``commit_stats`` is a list with one dict per
              commit (keys: time, hexsha, author, committer, summary). ``cloc_stats`` is a list with
              one dict per processed day (keys: hexsha, date, time, cloc). Both lists follow
              the commit order of ``repo.iter_commits()``, which is expected to be from
              most current [0] to oldest [-1].
    """
    # Read the full history first and record the YAML-friendly metadata of every commit
    commits = list(repo.iter_commits())
    commit_stats = [
        {'time': time.asctime(time.gmtime(c.committed_date)),
         'hexsha': c.hexsha,
         'author': c.author.name,
         'committer': c.committer.name,
         'summary': c.summary}
        for c in commits]
    # Walk the commits in the same order and run CLOC whenever the day changes
    cloc_stats = []
    for info, c in zip(commit_stats, commits):
        day = time.strftime("%d %b %Y", time.gmtime(c.committed_date))
        # Skip commits that fall on the same day as the entry recorded last
        if cloc_stats and day == cloc_stats[-1]['date']:
            continue
        day_entry = {'hexsha': info['hexsha'], 'date': day, 'time': info['time']}
        repo.git.checkout(info['hexsha'])
        report_file = os.path.join(output_dir, "%s.yaml" % os.path.basename(repo.working_dir))
        day_entry['cloc'] = run_cloc(cloc_path=cloc_path,
                                     src_dir=repo.working_dir,
                                     out_file=report_file)
        # The report has been parsed already, so the file itself is no longer needed
        os.remove(report_file)
        cloc_stats.append(day_entry)
    return commit_stats, cloc_stats

## 3. Compute the code statistics for all repos

In [None]:
# The cache is used only if requested and if both cache files are available
use_cached_results = (load_cached_results
                      and os.path.exists(cache_file_cloc)
                      and os.path.exists(cache_file_commits))
if not use_cached_results:
    # Compute the results from scratch: start from a clean output directory
    clean_outdirs(output_dir=output_dir, source_dir=source_dir)
    # Clone all repos
    print("Cloning all repos...")
    git_repos = clone_repos(repos=git_paths, source_dir=source_dir)
    # Compute the commit and CLOC statistics repo by repo
    commit_stats = {}
    cloc_stats = {}
    for repo_name, git_repo in git_repos.items():
        print("Compute CLOC stats: %s" % repo_name)
        commit_stats[repo_name], cloc_stats[repo_name] = git_repo_stats(
            git_repo,
            cloc_path=cloc_path,
            output_dir=output_dir)
    # Write both result dicts to their YAML cache files if requested
    if cache_results:
        for cache_path, stats in ((cache_file_cloc, cloc_stats), (cache_file_commits, commit_stats)):
            print("Caching results: %s" % cache_path)
            with open(cache_path, 'w') as outfile:
                yaml.dump(stats, outfile)
else:
    # Read the CLOC stats first and then the commit stats from their YAML cache files
    loaded_stats = []
    for cache_path in (cache_file_cloc, cache_file_commits):
        print("Loading cached results: %s" % cache_path)
        with open(cache_path) as f:
            loaded_stats.append(yaml.load(f, Loader=yaml.FullLoader))
    cloc_stats, commit_stats = loaded_stats

## 4. Summary of the lines of code across all NWB repos
### 4.1. Compile summary of LOC across repos by categories: `blank`, `comment`, `code`, `nFiles`, `size`

The goal is to align and expand results from all repos so that we can plot them together. Here we create a continuous date range and expand the results from all repos to align with our common time axis. For dates where no new CLOC stats are recorded for a repo, the statistics from the previous time are carried forward to fill in the gaps.

In [None]:
# The common time axis has one entry per day. It starts at the date of the last (i.e., oldest) CLOC
# entry of a reference repo and ends today. The reference repo is NWB_1.x_Matlab if NWB 1 is
# included in the plots and PyNWB otherwise.
date_range_start_repo = 'NWB_1.x_Matlab' if show_NWB1 else 'PyNWB'
date_range = pd.date_range(
    start=cloc_stats[date_range_start_repo][-1]['date'],
    end=time.strftime("%d %b %Y", time.localtime()),
    freq="D")

# Align and expand our results
def _carry_forward(dates, values, target_dates):
    """
    Expand the values recorded at ``dates`` to one value for each entry in ``target_dates``.

    For target dates without a recorded value the most recent value is carried forward.
    Target dates before the first recorded date get the value 0.

    :param dates: Dates (ordered from oldest to newest) for which values were recorded
    :param values: Sequence with one value per entry in ``dates``
    :param target_dates: Dates (ordered from oldest to newest) to compute values for

    :returns: List with one value per entry in ``target_dates``
    """
    num_entries = len(dates)
    # Nothing recorded (or nothing requested): there is no value to carry forward
    if num_entries == 0 or len(target_dates) == 0:
        return [0] * len(target_dates)
    curr_index = 0
    curr_val = 0
    # If the first recorded date is before the start of target_dates, then we need to
    # search for the appropriate start value and index, since a valid state exists
    # prior to the range we are looking at
    if target_dates[0] > dates[0]:
        # Default to the last entry. This covers both the case where all recorded dates are
        # before the range and the case where the last recorded date is exactly the first
        # target date (no recorded date is then greater than the first target date).
        curr_index = num_entries - 1
        if not target_dates[0] > dates[-1]:
            for di, d in enumerate(dates):
                if d > target_dates[0]:
                    curr_index = di - 1
                    break
        curr_val = values[curr_index]
    expanded = []
    for d in target_dates:
        # If we found a matching date, then update the current value and move on to the
        # next recorded date. Else we carry forward the previous value.
        if d == dates[curr_index]:
            curr_val = values[curr_index]
            if curr_index < (num_entries - 1):
                curr_index += 1
        expanded.append(curr_val)
    return expanded

repo_sizes_aligned = {}
repo_blanks_aligned = {}
repo_codes_aligned = {}
repo_comments_aligned = {}
repo_nfiles_aligned = {}
# Iterate through all repos and organize the size stats for the given date_range
for repo_name, repo_cloc in cloc_stats.items():
    # Dates and CLOC SUM records of the current repo, reordered from oldest to newest
    repo_dates = pd.DatetimeIndex([cloc_entry['date'] for cloc_entry in repo_cloc])[::-1]
    repo_sums = [cloc_entry['cloc']['SUM'] for cloc_entry in repo_cloc][::-1]
    # The size of the repo is the sum of all SUM counters except for the number of files
    repo_sizes = [np.sum([count for key, count in cloc_sum.items() if key != 'nFiles'])
                  for cloc_sum in repo_sums]
    # Expand the data so we carry forward values for dates where the repo has not changed
    repo_sizes_aligned[repo_name] = np.asarray(
        _carry_forward(repo_dates, repo_sizes, date_range))
    repo_blanks_aligned[repo_name] = np.asarray(
        _carry_forward(repo_dates, [cloc_sum['blank'] for cloc_sum in repo_sums], date_range))
    repo_codes_aligned[repo_name] = np.asarray(
        _carry_forward(repo_dates, [cloc_sum['code'] for cloc_sum in repo_sums], date_range))
    repo_comments_aligned[repo_name] = np.asarray(
        _carry_forward(repo_dates, [cloc_sum['comment'] for cloc_sum in repo_sums], date_range))
    repo_nfiles_aligned[repo_name] = np.asarray(
        _carry_forward(repo_dates, [cloc_sum['nFiles'] for cloc_sum in repo_sums], date_range))
        
# Convert results to Pandas
repo_sizes_algined_df = pd.DataFrame.from_dict(repo_sizes_aligned)
repo_sizes_algined_df.index = date_range

# NOTE: All clean-up steps below assign via a single .loc[rows, column] call. Chained indexing
# (df[column][rows] = 0) may assign to a temporary copy and leave the DataFrame unchanged.
# Repos that are not part of the results are skipped.

# Clean up results to mark start date of HDMF
if hdmf_start_date is not None and 'HDMF' in repo_sizes_algined_df.columns:
    # Set all LOC values up to and including the given date to 0
    repo_sizes_algined_df.loc[:hdmf_start_date, 'HDMF'] = 0
# Clean up results to mark start date for the extension smithy
if extension_smithy_start_date is not None and 'NDX_Extension_Smithy' in repo_sizes_algined_df.columns:
    # Set all LOC values up to and including the given date to 0
    repo_sizes_algined_df.loc[:extension_smithy_start_date, 'NDX_Extension_Smithy'] = 0
# Clean up results to remove NWB 1.0 software from the graph after 1.0 was deprecated
if nwb1_depration_date is not None:
    for nwb1_repo in ('NWB_1.x_Matlab', 'NWB_1.x_Python'):
        if nwb1_repo in repo_sizes_algined_df.columns:
            # Set all LOC values from the given date onwards to 0
            repo_sizes_algined_df.loc[nwb1_depration_date:, nwb1_repo] = 0

### 4.2. Plot summary of the lines of code across all NWB repos

In [None]:
# Sample one color per repo of the summary plot at evenly spaced positions of the Paired colormap
evenly_spaced_interval = np.linspace(0, 1, len(summary_plot_repos))
paired_colors = [cm.Paired(x) for x in evenly_spaced_interval]
# Mix up the colors so that neighbouring areas have more dissimilar colors:
# all colors at even positions first, followed by all colors at odd positions
colors = paired_colors[0::2] + paired_colors[1::2]
# Stacked area plot of the sizes of the selected repos over time
summary_ax = repo_sizes_algined_df[summary_plot_repos].plot.area(
    figsize=(18,10),
    stacked=True,
    linewidth=0,
    fontsize=16,
    color=colors)
summary_ax.legend(loc=2, prop={'size': 16})
summary_ax.set_ylabel('Lines of Code (CLOC)', fontsize=16)
summary_ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
summary_ax.set_title('NWB Code Repository Sizes', fontsize=20)
summary_fig = summary_ax.get_figure()
summary_fig.tight_layout()
if save_figs:
    summary_fig.savefig(os.path.join(output_dir, 'nwb_repo_sizes_all.pdf'))
plt.show()

### 4.3. Plot per-repo total lines of code statistics broken down by: code, blank, comment

In [None]:
# Create one stacked area plot per repo with the aligned code, blank, and comment line counts
for repo_name in git_paths:
    loc_breakdown_df = pd.DataFrame(
        {'code': repo_codes_aligned[repo_name],
         'blank': repo_blanks_aligned[repo_name],
         'comment': repo_comments_aligned[repo_name]},
        index=date_range)
    loc_ax = loc_breakdown_df.plot.area(
        figsize=(18,10),
        stacked=True,
        linewidth=0,
        fontsize=16)
    loc_ax.legend(loc=2, prop={'size': 16})
    loc_ax.set_ylabel('Lines of Code (CLOC)', fontsize=16)
    loc_ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
    loc_ax.set_title("Lines of Code: %s" % repo_name, fontsize=20)
    loc_fig = loc_ax.get_figure()
    loc_fig.tight_layout()
    if save_figs:
        loc_fig.savefig(os.path.join(output_dir, '%s_loc.pdf' % repo_name))
    plt.show()

## 5. Per-repo total lines of code statistics broken down by language type
### 5.1. Compute the per-repo language statistics

In [None]:
# Keys in the CLOC results that are not languages
ignore_lang = ['SUM', 'header']
# All languages that appear in any CLOC result of any repo
languages_used_all = np.unique([lang for v in cloc_stats.values() for cl in v for lang in cl['cloc'].keys() if lang not in ignore_lang])
per_repo_lang_stats = {}
# Iterate through all repos
for k, v in cloc_stats.items():
    # languages used in the current repo
    languages_used = np.unique([lang for cl in v for lang in cl['cloc'].keys() if lang not in ignore_lang])
    # The CLOC entries are stored from newest to oldest. Reorder them from oldest to newest so
    # that the same index addresses matching items in cloc_entries and curr_dates
    cloc_entries = v[::-1]
    curr_dates = pd.DatetimeIndex([cloc_entry['date'] for cloc_entry in cloc_entries])  # dates available in the repo
    # linear range of dates from the first CLOC entry of this repo until today
    date_range_used = pd.date_range(start=cloc_entries[0]['date'], end=time.strftime("%d %b %Y", time.localtime()), freq="D")
    curr_index = 0  # start index in the CLOC data available for the repo
    curr_values = {lang: 0 for lang in languages_used}  # current values to be used
    curr_stats = {lang: [] for lang in languages_used}
    # iterate through all date values and set the repo counts
    for d in date_range_used:
        # If we found a matching date, then update the results
        # Else we'll carry-forward the previous value since the
        # repo has not changed
        if d == curr_dates[curr_index]:
            # Languages that are missing in the current CLOC entry are no longer present
            # in the repo, so start from 0 rather than keeping their previous counts
            curr_values = {lang: 0 for lang in languages_used}
            # Update the current values to report until we find curr_dates[curr_index+1]
            for lang, val in cloc_entries[curr_index]['cloc'].items():
                if lang in curr_values:  # e.g., SUM is being ignored
                    # total number of lines: blank + comment + code
                    curr_values[lang] = val['blank'] + val['comment'] + val['code']
            # Move to the next date in the repo
            if curr_index < (len(curr_dates) -1):
                curr_index += 1
        # Copy the current values into our curr_stats dict
        for cl, cv in curr_values.items():
            curr_stats[cl].append(cv)
    # Now that we have our stats lets convert them to pandas 
    per_repo_lang_stats[k] = pd.DataFrame.from_dict(curr_stats)
    # The values were appended from oldest to newest date, i.e., in the order of date_range_used
    per_repo_lang_stats[k].index = date_range_used

### 5.2. Plot the per-repo total lines of code statistics broken down by language type

In [None]:
# Assign each language a fixed color, sampled at evenly spaced positions of the jet colormap,
# so that a language is shown in the same color in all plots
evenly_spaced_interval = np.linspace(0, 1, len(languages_used_all))
language_colors = {lang: cm.jet(x)
                   for lang, x in zip(languages_used_all, evenly_spaced_interval)}
# Create one stacked area plot per repo with the LOC stats of each language
for repo_name, lang_stats_df in per_repo_lang_stats.items():
    lang_ax = lang_stats_df.plot.area(
        figsize=(18,10),
        stacked=True,
        linewidth=0,
        fontsize=16,
        color=[language_colors[lang] for lang in lang_stats_df.columns])
    lang_ax.legend(loc=2, prop={'size': 16})
    lang_ax.set_ylabel('Lines of Code (CLOC)', fontsize=16)
    lang_ax.grid(color='black', linestyle='--', linewidth=0.7, axis='both')
    lang_ax.set_title("Lines of Code: %s" % repo_name, fontsize=20)
    lang_fig = lang_ax.get_figure()
    lang_fig.tight_layout()
    if save_figs:
        lang_fig.savefig(os.path.join(output_dir, '%s_language_loc.pdf' % repo_name))
    plt.show()