In [None]:
import os 
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))
from gitstats import NWBGitInfo

In [None]:
from github import Github, Label
import pandas as pd
from tqdm.notebook import tqdm
from datetime import datetime
from collections import defaultdict
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Configure inputs

In [None]:
# User names of the core developers. Issues raised by core devs are excluded from the analysis.
DEV_USERNAMES = NWBGitInfo.CORE_DEVELOPERS
# Repositories to analyze. Set to NWBGitInfo.CORE_API_REPOS to use only the main API NWB repos.
# Set to NWBGitInfo.GIT_REPOS to use all main NWB 2 repos.
REPOS = NWBGitInfo.CORE_API_REPOS
# Datetime used to filter out issues older than START. E.g., set to datetime(2021, 5, 1)
START = NWBGitInfo.NWB1_DEPRECATION_DATE

We need an API key from GitHub to access the API. See https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token for how to generate an access token.

**WARNING:** Never check in a token to the repo.

**WARNING:** Only use tokens with read-only access (never write access) to avoid accidental changes to the repo.

In [None]:
# Read the GitHub access token from the local file 'ghk.txt'.
# strip() removes all surrounding whitespace (leading/trailing spaces, tabs, and newlines);
# stripping only "\n" on the right would leave stray spaces or tabs attached to the token.
with open('ghk.txt', 'r') as f:
    API_KEY = f.read().strip()
# GitHub client created from the token; passed to the issue queries below via `github_obj`
g = Github(API_KEY)

# Retrieve all issues

# Compile dataframe for issues

In [None]:
# Progress bar over the configured repositories (outer bar at position 0)
repo_progress = tqdm(REPOS.values(), position=0, desc='repos')
# Map each repo object to the dataframe of its issues, as returned by get_issues_as_dataframe
issues_dfs = dict(
    (repo, repo.get_issues_as_dataframe(date_threshold=START, github_obj=g, tqdm=tqdm))
    for repo in repo_progress
)

In [None]:
# Write one CSV file per repository, named issue_responses_<repo>.csv (relative path)
for git_repo, issue_table in issues_dfs.items():
    csv_name = "issue_responses_%s.csv" % git_repo.repo
    issue_table.to_csv(csv_name)

## Median Response time analysis

In [None]:
# For each repository, show the externally raised issues and a histogram of their response times
for repo, idf in issues_dfs.items():
    # Exclude enhancements and help wanted issues. Also exclude issues by core developers
    # and issues that were originally created before START
    query = ~idf.is_enhancement & ~idf.is_help_wanted & ~idf.user_login.isin(DEV_USERNAMES) & (idf.created_at >= START)
    res = idf[query]
    print(repo)
    display(res)
    if len(res) > 0:
        try:
            res.hist(column='time_to_response')
            plt.show()
        except Exception as err:
            # Plotting stays best-effort so one repository cannot stop the loop, but the
            # failure is reported instead of being silently discarded. Catching Exception
            # (rather than a bare except) also keeps kernel interrupts working.
            # NOTE(review): the "Unresponded issues" cell reads the column `response_time`;
            # confirm that `time_to_response` is the intended column name here.
            print("Could not plot the time_to_response histogram for %s: %r" % (repo, err))

# Unresponded issues

In [None]:
# For each repository, display the issues whose response_time value is missing
for git_repo, issue_table in issues_dfs.items():
    unanswered = issue_table[issue_table.response_time.isna()]
    print(git_repo)
    display(unanswered)

# Issues by label

In [None]:
# Plot settings
fontsize = 16
save_figs = True
curr_date = datetime.now().strftime("%Y-%m-%d")
output_dir = os.getcwd()

# Names of the standard issue labels, in the iteration order of NWBGitInfo.STANDARD_ISSUE_LABELS
standard_label_names = [standard_label.label for standard_label in NWBGitInfo.STANDARD_ISSUE_LABELS.values()]
# Label names that are never counted as custom: the standard labels plus 'help wanted: good first issue'.
# Built once here instead of once per issue row.
known_label_names = set(standard_label_names + ['help wanted: good first issue'])


def contains_nonstandard_label(labels):
    """
    Check whether a collection of labels contains at least one non-standard (custom) label.

    A label counts as non-standard if its name is not one of the known label names and
    does not start with 'topic: '.

    :param labels: Iterable of label objects that expose a ``name`` attribute
    :returns: True if at least one label is non-standard; False otherwise (also for no labels)
    """
    return any(label.name not in known_label_names and not label.name.startswith('topic: ')
               for label in labels)


# One list per category; each list receives one count per repository
num_issues = {label_name: [] for label_name in standard_label_names}
num_issues['No labels'] = []
num_issues['Custom labels'] = []
for repo, idf in issues_dfs.items():
    # Compute counts for standard issues
    for label_name in standard_label_names:
        # Compute a binary vector indicating which issues have the given label
        rows = idf.labels.apply(lambda labels: label_name in [label.name for label in labels])
        num_issues[label_name].append(np.sum(rows))
    # Compute count of issues with no label
    rows = idf.labels.apply(lambda labels: len(labels) == 0)
    num_issues['No labels'].append(np.sum(rows))
    # Compute count of issues with non-standard labels
    rows = idf.labels.apply(contains_nonstandard_label)
    num_issues['Custom labels'].append(np.sum(rows))

# Rows are repositories (labelled with their total issue count), columns are label categories
num_issues_df = pd.DataFrame.from_dict(num_issues)
num_issues_df.index = ["%s (%i)" % (repo.repo, len(idf)) for repo, idf in issues_dfs.items()]

# Transpose so that each bar is a label category, stacked by repository
ax = num_issues_df.transpose().plot.barh(stacked=True, figsize=(14,10), fontsize=fontsize, rot=0)
ax.set_xlabel("Number of Issues (incl. PRs)", fontsize=fontsize)
ax.set_ylabel("Issue label", fontsize=fontsize)
ax.legend(fontsize=fontsize, loc='lower right')
ax.set_title("Number of issues per standard label (%s)" % curr_date, fontsize=fontsize)
fig = ax.get_figure()
fig.tight_layout()
if save_figs:
    fig.savefig(os.path.join(output_dir, 'nwb_issues_by_label_%s.pdf' % curr_date))
    fig.savefig(os.path.join(output_dir, 'nwb_issues_by_label_%s.png' % curr_date))
plt.show()