# Setup

In [1]:
# Used to read .env files, for access tokens and other secrets or configurations
%load_ext dotenv
%dotenv
import os

In [2]:
from github import Github
import pandas
from matplotlib import pyplot
import numpy
import seaborn

from datetime import datetime, timedelta

In [3]:
# The github token is stored in a .env file next to this notebook in the form "GITHUB_TOKEN = <your token here>"
github = Github(os.getenv("GITHUB_TOKEN"))

# Team Constants

These are things that describe how the team work and are static

## Team Members

In [4]:
sap = g.get_organization('SAP')
def is_company_member(named_user):
    return ('SAP' in (named_user.company or '').upper()) or named_user.login.upper().endswith('SAP') or sap.has_in_members(named_user)

NameError: name 'g' is not defined

In [None]:
cwa = g.get_organization('corona-warn-app')
team = list(filter(is_company_member, cwa.get_members()))
team

## Sprint Management

In [None]:
# In the following, we will assume that sprints are totally regular, starting with a fixed date.
# The helper functions can of course be adapted to represent, e.g., changes in sprint duration or holiday breaks

start_date_first_sprint = datetime(2020, 4, 27) # It's a monday, development actually started 02.05.2020
sprint_length = timedelta(weeks=2)

def date_to_sprint(date_to_convert):
    return max(0, (date_to_convert - start_date_first_sprint) // sprint_length + 1) # Sprint "0" is everything before the first

def sprint_to_start_date(sprint_to_convert):
    return start_date_first_sprint + (sprint_to_convert - 1) * sprint_length

## Repos

In [None]:
server_repo = g.get_repo('corona-warn-app/cwa-app-android')
# Let's assume that all developers with less than 5 commits were not active developers
server_contributors = map(lambda stats: stats.author, filter(lambda stats: stats.total >= 5, server_repo.get_stats_contributors()))
server_team = list(filter(is_company_member, server_contributors))
server_team

# Example: Issue Arrival Analysis

## Data Collection and Preprocessing

In [None]:
app_repo = github.get_repo('corona-warn-app/cwa-app-android')

In [None]:
repo = app_repo

In [None]:
issues_raw = repo.get_issues(state='all')
issues_dataframe = pandas.DataFrame(issues_raw, columns=['issue'])
issues_dataframe

In [None]:
issues_opened_per_sprint = issues_dataframe.groupby(by=(lambda index: date_to_sprint(issues_dataframe.loc[index].issue.created_at)))
issues_opened_per_sprint

In [None]:
closed_issues = issues_dataframe[issues_dataframe.apply(lambda x: x.issue.closed_at is not None, axis=1)]
issues_closed_per_sprint = closed_issues.groupby(by=(lambda index: date_to_sprint(issues_dataframe.loc[index].issue.closed_at)))
issues_closed_per_sprint

In [None]:
issues = pandas.DataFrame(index=range(0, date_to_sprint(datetime.today())+1)) # The range ensures that there are rows for sprints where no issues were opened or closed
issues.index.name = 'sprint'
issues['opened'] = issues_opened_per_sprint.count().astype('Int64')
issues['closed'] = issues_closed_per_sprint.count().astype('Int64')
issues = issues.fillna(0) # Replace NaN values from outer join with zero
issues['open_at_start'] = (issues.opened - issues.closed).cumsum().shift(fill_value=0) # Issues open at start of each sprint = sum of issues opened - closed beforea
issues

## Analyze

In [None]:
start_sprint = date_to_sprint(datetime(2022, 1, 1)) # Using date here to be able to explore sprint length
end_sprint = date_to_sprint(datetime(2022, 10, 1))

In [None]:
issues_focus = issues[(issues.index >= start_sprint) & (issues.index <= end_sprint)]
issues_focus

In [None]:
figure, axes = pyplot.subplots()
pyplot.bar(issues_focus.index-0.2, issues_focus['opened'], width=0.4)
pyplot.bar(issues_focus.index+0.2, issues_focus['closed'], width=0.4)
axes.set_xlabel("Sprint")
axes.set_ylabel("No. Issues")
axes.set_xticks(issues_focus.index)
axes.legend(['opened', 'closed'])
print() # silence output from command before

In [None]:
figure, axes = pyplot.subplots()
pyplot.bar(issues_focus.index-0.2, issues_focus['opened'], width=0.4)
pyplot.bar(issues_focus.index+0.2, -issues_focus['closed'], bottom=issues_focus['opened'], width=0.4)
axes.set_xlabel("Sprint")
axes.set_ylabel("No. Issues")
axes.set_xticks(issues_focus.index)
axes.legend(['opened', 'closed'])
print() # silence output from command before

In [None]:
figure, axes = pyplot.subplots()
pyplot.bar(issues_focus.index, issues_focus['open_at_start'])
axes.set_xlabel("Sprint")
axes.set_ylabel("No. Issues")
axes.set_xticks(issues_focus.index)
axes.legend(['open_at_start'])
print() # silence output from command before

In [None]:
for sprint in range(55, 64+1):
    open = issues_dataframe[issues_dataframe.apply(lambda x: date_to_sprint(x.issue.created_at) < sprint and ((x.issue.closed_at is None) or (date_to_sprint(x.issue.closed_at) >= sprint)), axis=1)]
    open = open.apply(lambda x: [x.issue.number, x.issue.title], axis=1)
    print('Sprint: '+str(sprint)+': ')
    print(open.values)

In [None]:
figure, axes = pyplot.subplots()
pyplot.bar(issues_focus.index, issues_focus['opened'])
pyplot.bar(issues_focus.index, issues_focus['closed'], bottom=issues_focus['opened'])
axes.set_xticks(issues_focus.index)
axes.legend(['opened', 'closed'])
print() # silence output from command before

# Experiment Zone

In [None]:
list(map(lambda x: [x.event, x.label], server_repo.get_issue(1).get_events()))

In [None]:
events = list(server_repo.get_issue(1).get_timeline())
events

In [None]:
list(map(lambda x: [x.event, x.source, x.node_id], events))

In [None]:
figure, axes = pyplot.subplots()
c = ['a', 'b', 'a']
colors = numpy.linspace(0, 1, len(numpy.unique(c)))
colordict = dict(zip(c, colors))  
color_col = list(map(lambda x: colordict[x], c))
axes.scatter([1, 2, 3], [1, 2, 3], c=color_col)

In [None]:
events = []
for issue in issues_raw:
    if issue.created_at >= datetime(2022, 1, 1):
        events.extend(map(lambda event: (issue.number, event), issue.get_timeline()))
events[0:100]

In [None]:
create_events = map(lambda issue: ({'issue': issue.number, 'time': issue.created_at, 'event': 'created'}),filter(lambda issue: issue.created_at >= datetime(2022, 1, 1), issues_raw))
create_events

In [None]:
events_dataframe = pandas.DataFrame.from_records([[tup[0], tup[1].created_at, tup[1].event] for tup in events], columns=['issue', 'time', 'event'])
events_dataframe = events_dataframe.append(pandas.DataFrame.from_records(create_events, columns=['issue', 'time', 'event']))
events_dataframe

In [None]:
numpy.unique(events_dataframe.event)

In [None]:
event_whitelist = ['created', 'committed', 'review_requested', 'labeled', 'commented', 'cross-referenced', 'closed', 'ready_for_review', 'reviewed', 'milestoned', 'merged', 'assigned', 'referenced', 'mentioned']
events_filtered = events_dataframe[events_dataframe.apply(lambda x: x.event in event_whitelist, axis=1)]
palette = dict(zip(event_whitelist, seaborn.color_palette(n_colors=len(event_whitelist))))

In [None]:
pyplot.figure(figsize=(20, 10))
seaborn.scatterplot(data=events_filtered, x='time', y='issue', hue='event', palette=palette)

In [None]:
events_focus = events_filtered[events_filtered.apply(lambda x: x.time >= datetime(2022, 4, 15) and x.time <= datetime(2022, 5, 15),  axis=1)]
pyplot.figure(figsize=(20, 10))
seaborn.scatterplot(data=events_focus, x='time', y='issue', hue='event', palette=palette)