In [None]:
from nwb_project_analytics.gitstats import NWBGitInfo

In [None]:
import os
from github import Github, Label
import pandas as pd
from tqdm.notebook import tqdm
from datetime import datetime
from collections import defaultdict
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Configure inputs

In [None]:
# Directory where CSV files and figures are written
output_dir = os.path.join(os.getcwd(), 'plots/')
# Create the directory up front; otherwise the first to_csv/savefig call fails on a fresh checkout
os.makedirs(output_dir, exist_ok=True)
# Whether figures are written to output_dir in addition to being shown
save_figs = True
# Core developer logins; issues raised by them are excluded from the response-time analysis
DEV_USERNAMES = NWBGitInfo.CORE_DEVELOPERS
# Which repos to use.
# Set to NWBGitInfo.CORE_API_REPOS to use only main API NWB repos.
# Set to NWBGitInfo.GIT_REPOS to use all main NWB 2 repos.
# For a quick test run a slice such as NWBGitInfo.CORE_API_REPOS[0:1] was used in the original notebook.
REPOS = NWBGitInfo.CORE_API_REPOS
# Set datetime to filter issues older than START. E.g., set to datetime(2021, 5, 1)
# (alternative: NWBGitInfo.NWB1_DEPRECATION_DATE)
START = datetime(2022, 1, 1)
# String versions of the analysed time range, used in plot titles and file names
start_str = START.strftime("%Y-%m-%d")
end_str = datetime.now().strftime("%Y-%m-%d")
# Response time (in days) above which an issue is reported as a "long" issue
long_issue_threshold = 10

We need an API key from GitHub to access the API. See https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token for how to generate an access token.

**WARNING:** Never check in a token to the repo
**WARNING:** Only use tokens with read only access (never write access) to avoid accidental changes to the repo

In [None]:
# Read the GitHub access token from a local file (this file must never be committed)
with open('ghk.txt', 'r') as f:
    # strip() removes all surrounding whitespace, e.g. a Windows '\r\n' line ending or
    # trailing spaces, which would otherwise become part of the token
    API_KEY = f.read().strip()
# GitHub API client used for all requests below
g = Github(API_KEY)

# Retrieve all issues

# Compile dataframe for issues

In [None]:
# Progress bar over the selected repositories
repo_progress = tqdm(REPOS.values(), position=0, desc='repos')
# Map each repo object to the dataframe returned by its get_issues_as_dataframe method
issues_dfs = dict(
    (repo, repo.get_issues_as_dataframe(since=START, github_obj=g, tqdm=tqdm))
    for repo in repo_progress
)

In [None]:
# save the results to file
for repo, issues in issues_dfs.items():
    issues.to_csv(os.path.join(output_dir, "issue_responses_%s.csv") % repo.repo)

## Plot issue response times

Issue response time in days is computed via the function ``GitRepo.compute_issue_time_of_first_response``. It is defined by the first event in the issue timeline where someone other than the creator of the issue added a comment or label to the issue, or where the issue was closed.

In [None]:
# Plot stacked histogram plot for issues of all core repos
response_time_data = []    # per repo: response times (days) of issues created after START
response_time_labels = []  # per repo: legend label with issue count and median
num_long_issues = []       # per repo: number of issues above long_issue_threshold
for repo, issues in issues_dfs.items():
    # issue response times of the issues created after START
    response_times_for_new_issues = issues[issues['created_at'] > START]['days_to_response']
    response_time_data.append(response_times_for_new_issues)
    # Any NaN value compares as False and is therefore not counted as a long issue
    num_long_issues.append(np.sum(np.array(response_times_for_new_issues) > long_issue_threshold))
    # np.nanmedian emits a warning for empty/all-NaN input, so only call it when there is data
    if response_times_for_new_issues.notna().any():
        median_time = np.nanmedian(response_times_for_new_issues)
    else:
        median_time = np.nan
    total_issues = len(response_times_for_new_issues)
    label = "%s \n  * #issues=%i\n  * median=%f days" % (repo.repo, total_issues, median_time)
    response_time_labels.append(label)

# Use about one bin per day up to the longest response time. Series.max() skips NaN values;
# repos without any valid response time are ignored and at least one bin is always used,
# because a NaN or zero bin count makes plt.hist fail.
max_response_times = [rtd.max() for rtd in response_time_data if rtd.notna().any()]
if max_response_times:
    num_bins = max(1, int(np.ceil(max(max_response_times))))
else:
    num_bins = 1

plt.figure(figsize=(8, 5))
plt.hist(response_time_data,
         bins=num_bins,
         label=response_time_labels,
         density=False,
         histtype='bar',
         stacked=True)
plt.xlabel("Initial response time in days")
plt.ylabel("#Issues")
plt.legend()
# Only show the range up to the threshold; longer issues are summarized in the printout below
plt.xlim((0, long_issue_threshold))
plt.title("Issue response time: %s - %s" % (start_str, end_str))

if save_figs:
    plt.savefig(os.path.join(output_dir, 'NWB_ALL_issue_response_times_%s_%s.pdf' % (start_str, end_str)))
    plt.savefig(os.path.join(output_dir, 'NWB_ALL_issue_response_times_%s_%s.png' % (start_str, end_str)))
plt.show()

print("#Issue with response time > %i days" % long_issue_threshold)
for i, n in enumerate(num_long_issues):
    # The first line of the legend label holds the repo name
    print("%s: %i" % (response_time_labels[i].split("\n")[0], n))
total_num_issues = np.sum([len(rtd) for rtd in response_time_data])
total_long_issues = np.sum(num_long_issues)
print("Total issues: %i" % total_num_issues)
print("Total long issues: %i" % total_long_issues)
# Guard against a ZeroDivisionError when no issue was created after START
if total_num_issues > 0:
    print("Percent long issues: %f" % (float(total_long_issues) / float(total_num_issues) * 100))
else:
    print("Percent long issues: n/a (no issues found)")

## Show issues with long estimated response times that were created after the START date

In [None]:
# For every repository, list the issues created after START whose response
# time exceeds long_issue_threshold
for repo, issues in issues_dfs.items():
    created_after_start = issues['created_at'] > START
    slow_response = issues['days_to_response'] > long_issue_threshold
    long_issues = issues[created_after_start & slow_response]
    print(repo.repo, len(long_issues))
    display(long_issues)

## Issue summary plots for the individual repositories

In [None]:
for repo, issues in issues_dfs.items():
    # Number of issues per (year, month) for each of the timestamp columns
    for k in ['updated_at', 'created_at', 'closed_at']:
        counts = issues.groupby([issues[k].dt.year, issues[k].dt.month])[k].count()
        if len(counts) == 0:
            # Nothing to draw (e.g. a column without any valid timestamp); skip the plot
            # instead of asking pandas for a bar plot of an empty series
            print("%s: no data for %s" % (repo.repo, k))
            continue
        counts.plot(kind="bar", figsize=(8, 4))
        # Title and axis labels so the figures of different repos/columns can be told apart
        plt.title("%s: issues by %s" % (repo.repo, k))
        plt.xlabel("(year, month)")
        plt.ylabel("#Issues")
        plt.show()

    # issue response times of the issues created after START
    response_times_for_new_issues = issues[issues['created_at'] > START]['days_to_response']
    valid_response_times = response_times_for_new_issues.dropna()
    if len(valid_response_times) == 0:
        # Without data the maximum is NaN and cannot be converted to a bin count
        print("%s: no response times to plot" % repo.repo)
        continue
    # One bin per (started) day; at least one bin so that sub-day maxima do not yield bins=0
    num_bins = max(1, int(np.ceil(valid_response_times.max())))
    plt.hist(valid_response_times, bins=num_bins)
    plt.title("%s: issue response time" % repo.repo)
    plt.xlabel("Initial response time in days")
    plt.ylabel("#Issues")
    plt.show()

## Median Response time analysis

In [None]:
for repo, idf in issues_dfs.items():
    # Keep only issues that are neither enhancements nor help-wanted requests, that were
    # not opened by a core developer, and that were created on or after START
    not_enhancement = ~idf.is_enhancement
    not_help_wanted = ~idf.is_help_wanted
    not_from_core_dev = ~idf.user_login.isin(DEV_USERNAMES)
    created_since_start = (idf.created_at >= START)
    query = not_enhancement & not_help_wanted & not_from_core_dev & created_since_start
    res = idf[query]
    print(repo)
    display(res)
    if len(res) == 0:
        # nothing to plot for this repository
        continue
    # Histogram of the response times (in days) of the selected issues
    res.hist(column='days_to_response')
    plt.title(repo.repo)
    plt.xlabel("Days to response")
    plt.ylabel("Count")
    plt.show()

# Unresponded issues

In [None]:
# Show, per repository, the issues whose response_time entry is missing
for repo, idf in issues_dfs.items():
    query = idf['response_time'].isna()
    res = idf[query]
    print(repo)
    display(res)

# Issues by label

In [None]:
fontsize = 16
# NOTE: save_figs is deliberately not reassigned here, so that the value chosen in the
# configuration cell is respected (it was previously forced back to True in this cell).
curr_date = datetime.now().strftime("%Y-%m-%d")

# Names of the standard issue labels, computed once instead of on every call/iteration
standard_label_names = [standard_label.label for standard_label in NWBGitInfo.STANDARD_ISSUE_LABELS.values()]
# Label names that are not treated as custom labels
known_label_names = set(standard_label_names) | {'help wanted: good first issue'}


def contains_nonstandard_label(labels):
    """
    Check whether an issue carries at least one custom label.

    A label counts as custom if its name is neither one of the known label names
    nor starts with the prefix 'topic: '.

    :param labels: Iterable of label objects with a ``name`` attribute
    :returns: True if at least one custom label is found, False otherwise
    """
    return any(label.name not in known_label_names and not label.name.startswith('topic: ')
               for label in labels)


# For each label category: list with the number of matching issues per repository
num_issues = {label_name: [] for label_name in standard_label_names}
num_issues['No labels'] = []
num_issues['Custom labels'] = []
for repo, idf in issues_dfs.items():
    # Set of label names per issue; extracted once and reused for all standard labels
    issue_label_names = idf.labels.apply(lambda issue_labels: {label.name for label in issue_labels})
    # Compute counts for standard issues
    for label_name in standard_label_names:
        # Binary vector indicating which issues have the given label
        rows = issue_label_names.apply(lambda names: label_name in names)
        num_issues[label_name].append(np.sum(rows))
    # Compute count of issues with no label
    rows = idf.labels.apply(lambda x: len(x) == 0)
    num_issues['No labels'].append(np.sum(rows))
    # Compute count of issues with non-standard labels
    rows = idf.labels.apply(contains_nonstandard_label)
    num_issues['Custom labels'].append(np.sum(rows))

# One row per repository (labelled with its total number of issues), one column per label category
num_issues_df = pd.DataFrame.from_dict(num_issues)
num_issues_df.index = ["%s (%i)" % (repo.repo, len(idf)) for repo, idf in issues_dfs.items()]
# Transpose so that each bar is a label category, stacked by repository
num_issues_df.transpose().plot.barh(stacked=True, figsize=(14,10), fontsize=fontsize, rot=0)
plt.xlabel("Number of Issues (incl. PRs)", fontsize=fontsize)
plt.ylabel("Issue label", fontsize=fontsize)
plt.legend(fontsize=fontsize, loc='lower right')
plt.title("Number of issues per standard label (%s)" % curr_date, fontsize=fontsize)
plt.tight_layout()
if save_figs:
    plt.savefig(os.path.join(output_dir, 'nwb_issues_by_label_%s.pdf' % curr_date))
    plt.savefig(os.path.join(output_dir, 'nwb_issues_by_label_%s.png' % curr_date))
plt.show()