In [23]:
import os
import funcy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

sns.set()

from dev import DROPBOX_DIR, FIG_DIR



DATETIME_FMT = '%Y-%m-%d %H:%M:%S'
DATE_FMT = '%Y-%m-%d'

In [2]:
# Load the developer metric/rank snapshots and drop the 'Unnamed: 0' column
# (presumably the index that was saved along with the CSV).
rank_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'developer_metrics_over_time.csv'))
del rank_df['Unnamed: 0']

# Convert the snapshot timestamps from strings to datetime.date objects.
# pd.to_datetime is used instead of the `pd.datetime` alias, which was
# deprecated in pandas 1.0 and removed in later releases.
rank_df['rank_time'] = pd.to_datetime(rank_df['rank_time'], format=DATETIME_FMT).dt.date

# Missing engagement lengths / days involved are replaced with 0.
zero_fill_cols = ['len_commiter_engagement', 'len_author_engagement', 'days_involved']
rank_df[zero_fill_cols] = rank_df[zero_fill_cols].fillna(0)

In [3]:
# Distinct snapshot dates of the time series, in ascending order
date_range = sorted(rank_df['rank_time'].unique())

In [4]:
# The within-variable correlation tables over time are built from the
# snapshots whose year is later than 2016.
rank_df_2017 = rank_df.loc[rank_df['rank_time'].map(lambda snapshot: snapshot.year) > 2016]

In [5]:
def plot_corr_heatmap(corr_df, varname):
    """Draw a heatmap of the lower triangle of a correlation table.

    Parameters
    ----------
    corr_df : square table of correlations (e.g. the result of ``DataFrame.corr()``).
    varname : label interpolated into the plot title.

    Returns
    -------
    The object returned by ``sns.heatmap``; callers use its ``.figure``
    attribute to save the plot.
    """
    _, ax = plt.subplots(figsize=(11, 9))
    # Mask the upper triangle (diagonal included) so only the lower triangle
    # is drawn. The builtin ``bool`` is used because the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    mask = np.zeros_like(corr_df, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Draw on the axes created above rather than on whatever axes happen to
    # be current, keeping square cells via square=True.
    pltmap = sns.heatmap(
        corr_df, cmap="YlGnBu", mask=mask, square=True, linewidths=.5, ax=ax)
    ax.set_title('Developer {} Correlation over Time'.format(varname))
    return pltmap
    

In [6]:
def compute_variable_corr(df, varname, pdf_obj, excel_obj):
    """Correlate one metric with itself across snapshot dates and export it.

    ``df`` is pivoted to one row per ``developer_id`` and one column per
    ``rank_time``; the column-by-column correlation of that pivot is written
    to ``excel_obj`` as a sheet and its heatmap is appended to ``pdf_obj``.

    Parameters
    ----------
    df : DataFrame with ``developer_id``, ``rank_time`` and ``varname`` columns.
    varname : name of the metric column to correlate.
    pdf_obj : object with a ``savefig(figure)`` method (e.g. ``PdfPages``).
    excel_obj : target passed to ``DataFrame.to_excel`` (e.g. ``pd.ExcelWriter``).

    Returns
    -------
    None. When every correlation is NaN nothing is written.
    """
    print('working on ', varname)
    # Pivot the frame that was passed in; the previous version ignored `df`
    # and always read the notebook-level `rank_df_2017`.
    rank_pivot = pd.pivot_table(df, values=varname, columns='rank_time', index='developer_id')
    rank_corr = rank_pivot.corr()
    corr_is_empty = pd.isna(rank_corr).all().all()
    print(corr_is_empty)
    if corr_is_empty:
        # Every entry is NaN, so there is nothing worth exporting.
        print('Unable to compute corrs for variable ', varname)
        return None
    # Sheet name is cut to 31 characters (Excel's sheet-name length limit).
    rank_corr.to_excel(excel_obj, sheet_name=varname[:31])
    pltmap = plot_corr_heatmap(rank_corr, varname)
    pdf_obj.savefig(pltmap.figure)
    # Close exactly the figure that was just saved.
    plt.close(pltmap.figure)

In [8]:
# Raw (non-rank) metric columns used when correlating variables with each other
raw_corr_vars = [
    'average_net_change',
    'average_pct_additions',
    'avg_file_age',
    'avg_num_distinct_authors_per_file',
    'avg_pct_deletions',

    'code_complexity_max',
    'days_involved',

    'len_author_engagement',
    'len_commiter_engagement',
    'mean_of_code_complexity_median',
    'more_than_50pct_additions',
    'net_code',
    'num_author_commits',
    'num_commiter_commits',
    'num_commits',
    'num_deletions',
    'num_new_files',
    'num_people_who_authored_commits_they_commited',
    'num_people_who_committer_their_commits',
    'num_renames',
    'num_total_diffs',
    'num_unique_files_changed',
    'number_different_files_changed',
    'pct_total_additions',
    'pct_total_deleted',
    'pct_total_lines',
    'total_additions',
    'total_deletions',
    'total_files_more_than_50pct_deletions',
    'total_files_solo_authored',
    'total_files_with_last_change',
    'total_files_with_max_complexity',
]
 

In [9]:
# Every metric column (raw values and their rank_* counterparts) that gets a
# within-variable correlation over time.
rank_vars = [
    'average_net_change',
    'average_pct_additions',
    'avg_file_age',
    'avg_num_distinct_authors_per_file',
    'avg_pct_deletions',

    'code_complexity_max',
    'days_involved',

    'len_author_engagement',
    'len_commiter_engagement',
    'mean_of_code_complexity_median',
    'more_than_50pct_additions',
    'net_code',
    'num_author_commits',
    'num_commiter_commits',
    'num_commits',
    'num_deletions',
    'num_new_files',
    'num_people_who_authored_commits_they_commited',
    'num_people_who_committer_their_commits',
    'num_renames',
    'num_total_diffs',
    'num_unique_files_changed',
    'number_different_files_changed',
    'pct_total_additions',
    'pct_total_deleted',
    'pct_total_lines',

    'rank',
    'rank_average_net_change',
    'rank_average_pct_additions',
    'rank_avg_file_age',
    'rank_avg_num_distinct_authors_per_file',
    'rank_avg_pct_deletions',
    'rank_code_complexity_max',
    'rank_days_involved',
    'rank_mean_of_code_complexity_median',
    'rank_more_than_50pct_additions',
    'rank_net_code',
    'rank_num_author_commits',
    'rank_num_commiter_commits',
    'rank_num_commits',
    'rank_num_deletions',
    'rank_num_new_files',
    'rank_num_people_who_authored_commits_they_commited',
    'rank_num_people_who_committer_their_commits',
    'rank_num_renames',
    'rank_num_total_diffs',
    'rank_num_unique_files_changed',
    'rank_number_different_files_changed',
    'rank_pct_total_additions',
    'rank_pct_total_deleted',
    'rank_pct_total_lines',

    'rank_total_additions',
    'rank_total_deletions',
    'rank_total_files_more_than_50pct_deletions',
    'rank_total_files_solo_authored',
    'rank_total_files_with_last_change',
    'rank_total_files_with_max_complexity',

    'total_additions',
    'total_deletions',
    'total_files_more_than_50pct_deletions',
    'total_files_solo_authored',
    'total_files_with_last_change',
    'total_files_with_max_complexity',
]

In [12]:
# rank_* columns used when correlating the rank metrics with each other
rank_corr_vars = [
    'rank',
    'rank_average_net_change',
    'rank_average_pct_additions',
    'rank_avg_file_age',
    'rank_avg_num_distinct_authors_per_file',
    'rank_avg_pct_deletions',
    'rank_code_complexity_max',
    'rank_days_involved',
    'rank_mean_of_code_complexity_median',
    'rank_more_than_50pct_additions',
    'rank_net_code',
    'rank_num_author_commits',
    'rank_num_commiter_commits',
    'rank_num_commits',
    'rank_num_deletions',
    'rank_num_new_files',
    'rank_num_people_who_authored_commits_they_commited',
    'rank_num_people_who_committer_their_commits',
    'rank_num_renames',
    'rank_num_total_diffs',
    'rank_num_unique_files_changed',
    'rank_number_different_files_changed',
    'rank_pct_total_additions',
    'rank_pct_total_deleted',
    'rank_pct_total_lines',
]

In [10]:
# Compute the correlation table and heatmap of each variable with itself
# across time. One Excel writer and one PDF object are shared by all
# variables so that every variable is added as a new sheet / page instead of
# overwriting the previous one.
# Both are opened as context managers, which flushes and closes the files on
# exit; this replaces the explicit writer.save() call, which was deprecated
# in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter(
        os.path.join(FIG_DIR, 'developer_within_metric_correlations_over_time.xlsx')) as writer, \
        PdfPages(os.path.join(FIG_DIR, 'developer_within_metric_heatmaps_over_time.pdf')) as pdf:
    for varname in rank_vars:
        compute_variable_corr(rank_df_2017, varname, pdf, writer)

working on  average_net_change
False
working on  average_pct_additions
False
working on  avg_file_age
False
working on  avg_num_distinct_authors_per_file
False
working on  avg_pct_deletions
False
working on  code_complexity_max
False
working on  days_involved
False
working on  len_author_engagement
False
working on  len_commiter_engagement
False
working on  mean_of_code_complexity_median
False
working on  more_than_50pct_additions
False
working on  net_code
False
working on  num_author_commits
False
working on  num_commiter_commits
False
working on  num_commits
False
working on  num_deletions
False
working on  num_new_files
False
working on  num_people_who_authored_commits_they_commited
False
working on  num_people_who_committer_their_commits
False
working on  num_renames
False
working on  num_total_diffs
False
working on  num_unique_files_changed
False
working on  number_different_files_changed
False
working on  pct_total_additions
False
working on  pct_total_deleted
False
working on 

In [13]:
# Correlate the metrics with each other (raw metrics, rank metrics, and both
# together) on the post-2016 rows, writing one Excel sheet and one PDF page
# per group.
# The writer is opened as a context manager so the workbook is flushed and
# closed on exit; the explicit writer.save() it replaces was deprecated in
# pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter(
        os.path.join(FIG_DIR, 'developer_between_metric_correlations.xlsx')) as writer, \
        PdfPages(os.path.join(FIG_DIR, 'developer_between_metric_heatmaps.pdf')) as pdf_obj:
    for group_vars, str_name in [
            (raw_corr_vars, 'raw metrics'),
            (rank_corr_vars, 'rank metrics'),
            (rank_corr_vars + raw_corr_vars, 'all metrics')]:
        rank_corr = rank_df_2017[group_vars].corr()
        rank_corr.to_excel(writer, sheet_name=str_name)
        # Generate the heatmap for this group and append it to the PDF.
        pltmap = plot_corr_heatmap(rank_corr, str_name)
        pdf_obj.savefig(pltmap.figure)
        # Close exactly the figure that was just saved.
        plt.close(pltmap.figure)

In [14]:
# The last thing to do is plot the skewness of the variables (i.e., plot the raw values of each metric in a histogram)

In [15]:
def graph_histogram(df, varname):
    """Plot the density of ``varname`` at the latest ``rank_time`` in ``df``.

    Only rows whose ``rank_time`` equals the maximum ``rank_time`` and whose
    ``varname`` value is not null are plotted.

    Returns the object produced by ``sns.distplot``, titled
    '<varname> Density'.
    """
    at_latest_time = df.rank_time == df.rank_time.max()
    has_value = df[varname].notnull()
    axes = sns.distplot(df.loc[has_value & at_latest_time, varname])
    axes.set_title('{} Density'.format(varname))
    return axes
    

In [28]:
def compute_hist(df, varname, pdf_obj, graph_histogram_fnc=graph_histogram):
    """Plot one variable with ``graph_histogram_fnc`` and save it to ``pdf_obj``.

    ``graph_histogram_fnc(df, varname)`` must return an object exposing a
    ``.figure`` attribute; that figure is passed to ``pdf_obj.savefig`` and
    the current pyplot figure is closed afterwards.
    """
    pdf_obj.savefig(graph_histogram_fnc(df, varname).figure)
    plt.close()

In [17]:
# One density plot per raw metric, all saved into a single PDF
with PdfPages(os.path.join(FIG_DIR, 'developer_metric_density_plots.pdf')) as pdf:
    for varname in raw_corr_vars:
        compute_hist(rank_df, varname, pdf)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [43]:
# Load the monthly delta stats (their histograms are computed further down)
# and discard the 'Unnamed: 0' column that comes with the CSV.
monthly_df = pd.read_csv(
    os.path.join(DROPBOX_DIR, 'developer_metrics_monthly.csv')
).drop(columns=['Unnamed: 0'])

In [140]:
# Per rank_start value: the smallest and largest rank and the number of
# distinct developers.
# The reducers are given by name ('min' / 'max'): passing the Python builtins
# to agg is deprecated in recent pandas releases. No explicit sort is needed
# beforehand because groupby orders the group keys by default and these
# reducers do not depend on row order.
monthly_rank_starts = monthly_df.groupby('rank_start').agg(
    {'rank': ['min', 'max'], 'developer_id': 'nunique'})

In [44]:
# Convert the month boundaries from strings to datetime.date objects.
# pd.to_datetime is used instead of the `pd.datetime` alias, which was
# deprecated in pandas 1.0 and removed in later releases.
for date_col in ('rank_start', 'rank_end'):
    monthly_df[date_col] = pd.to_datetime(monthly_df[date_col], format=DATE_FMT).dt.date

In [150]:
# Inspect the column names available on rank_df
rank_df.columns

Index(['developer_id', 'people_who_committer_their_commits',
       'people_who_authored_commits_they_commited',
       'num_people_who_committer_their_commits',
       'num_people_who_authored_commits_they_commited',
       'first_author_engagement', 'last_author_engagement',
       'num_author_commits', 'num_author_diffs', 'len_author_engagement',
       'first_commiter_engagement', 'last_commiter_engagement',
       'num_commiter_commits', 'num_commiter_diffs', 'len_commiter_engagement',
       'total_additions', 'total_deletions', 'average_net_change',
       'mean_of_code_complexity_median', 'code_complexity_max',
       'num_unique_files_changed', 'num_total_diffs', 'num_commits',
       'num_renames', 'num_deletions', 'num_new_files', 'net_code',
       'repo_total_additions', 'repo_total_deletions', 'repo_total_renames',
       'repo_total_file_deletions', 'repo_total_file_creations',
       'repo_total_net_code', 'pct_total_additions', 'pct_total_deleted',
       'pct_total_li

In [154]:
# Final overall rank per developer: rows are ordered by developer_id and
# rank_time, both descending, so the first 'rank' seen in each developer's
# group is the one at that developer's most recent rank_time.
final_ranks = (
    rank_df
    .sort_values(['developer_id', 'rank_time'], ascending=False)
    .groupby('developer_id')
    .agg({'rank': funcy.first})
    .rename(columns={'rank': 'final_rank'})
)

In [29]:
def graph_monthly_histogram(df, varname):
    """Plot the density of the non-null ``varname`` values in ``df``.

    Returns the object produced by ``sns.distplot``, titled
    'Monthly <varname> Density'.
    """
    observed = df.loc[df[varname].notnull(), varname]
    axes = sns.distplot(observed)
    axes.set_title('Monthly {} Density'.format(varname))
    return axes

In [30]:
# Monthly delta stats whose distributions are plotted, one PDF page each
raw_monthly_vars = [
    'total_additions',
    'total_deletions',
    'average_net_change',
    'code_complexity_max',
    'num_unique_files_changed',
    'num_renames',
    'num_deletions',
    'num_new_files',
    'num_total_diffs',
    'num_commits',
]

with PdfPages(os.path.join(FIG_DIR, 'monthly_developer_metric_density_plots.pdf')) as pdf:
    for varname in raw_monthly_vars:
        compute_hist(monthly_df, varname, pdf, graph_monthly_histogram)


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [155]:
# Question: do developers that end up with a high rank already have a high
# rank in their first month of contribution?

# Per developer: the earliest rank_start and the first 'rank' value once the
# rows are ordered by (developer_id, rank_start, rank_end).
first_month = (
    monthly_df
    .sort_values(['developer_id', 'rank_start', 'rank_end'])
    .groupby('developer_id')
    .agg({'rank_start': 'min', 'rank': funcy.first})
    .rename(columns={'rank_start': 'first_month_rank_start', 'rank': 'first_month_rank'})
)
first_month['first_month_flag'] = 1

# Per developer: the latest rank_start and the last 'rank' value under the
# same ordering.
last_month = (
    monthly_df
    .sort_values(['developer_id', 'rank_start', 'rank_end'])
    .groupby('developer_id')
    .agg({'rank_start': 'max', 'rank': lambda ranks: ranks.values[-1]})
    .rename(columns={'rank_start': 'last_month_rank_start', 'rank': 'last_month_rank'})
)
last_month['last_month_flag'] = 1

# Join first-month, last-month and final-rank summaries on the developer index.
first_month2 = (
    first_month
    .merge(last_month, left_index=True, right_index=True)
    .merge(final_ranks, left_index=True, right_index=True)
    .sort_values('developer_id')
)

In [156]:
# Left-join the per-developer first/last/final summary onto every monthly row
months2 = pd.merge(
    monthly_df, first_month2, how='left', left_on='developer_id', right_index=True)

In [157]:
# The merge copied flag value 1 onto every row of a developer; reset the flag
# to 0 on rows whose rank_start is not that developer's first / last month.
months2.loc[months2['rank_start'].ne(months2['first_month_rank_start']), 'first_month_flag'] = 0
months2.loc[months2['rank_start'].ne(months2['last_month_rank_start']), 'last_month_flag'] = 0

In [158]:
# Attach to every row the number of non-null rank_start entries its developer
# has in the data set (num_months_in_data).
months3 = months2.merge(
    months2.groupby('developer_id')['rank_start'].count().to_frame('num_months_in_data'),
    how='left',
    left_on='developer_id',
    right_index=True,
)

In [160]:
# Collapse the monthly rows to one row per (developer_id, last_month_rank)
# pair: after sorting on the keys below, drop_duplicates keeps the last row of
# each pair, and developer_id becomes the index.
month_rank_only = (
    months3
    .sort_values([
        'developer_id',
        'first_month_flag',
        'last_month_rank',
        'last_month_flag',
        'num_months_in_data',
        'final_rank',
    ])
    .drop_duplicates(['developer_id', 'last_month_rank'], keep='last')
    .set_index('developer_id')
)

# Pairwise correlation between first-month, last-month and final ranks
month_rank_only[['first_month_rank', 'last_month_rank', 'final_rank']].corr()

Unnamed: 0,first_month_rank,last_month_rank,final_rank
first_month_rank,1.0,0.750398,0.454059
last_month_rank,0.750398,1.0,0.379795
final_rank,0.454059,0.379795,1.0


In [163]:
    # Last-month minus first-month rank for developers with more than 3 months of data.
    # NOTE(review): this cell's uniform leading indent looks accidental - consider dedenting.
    mask = month_rank_only['num_months_in_data'] > 3
    diff = month_rank_only.loc[mask, 'last_month_rank'] - month_rank_only.loc[mask, 'first_month_rank']

In [166]:
# Mean of the last-minus-first month rank differences currently held in `diff`
np.mean(diff)

1.0294117647058822

In [193]:
# For each threshold 0..5, look at developers with more than that many months
# of data and record: how many there are, and the median difference between
# last and first month rank, final and first month rank, final and last month rank.
avg_counts = []

for month_range in range(6):
    mask = month_rank_only['num_months_in_data'] > month_range
    diff = np.median(
        month_rank_only.loc[mask, 'last_month_rank'] - month_rank_only.loc[mask, 'first_month_rank'])
    diff_f = np.median(
        month_rank_only.loc[mask, 'final_rank'] - month_rank_only.loc[mask, 'first_month_rank'])
    diff_l = np.median(
        month_rank_only.loc[mask, 'final_rank'] - month_rank_only.loc[mask, 'last_month_rank'])
    count = len(month_rank_only[mask])
    avg_counts.append((month_range, count, diff, diff_f, diff_l))

In [194]:
# Tabulate the per-threshold tuples collected in avg_counts, indexed by threshold
first_last_ranks_diffs = pd.DataFrame(
    data=avg_counts,
    columns=[
        'month_min',
        'count_devs',
        'med_diff_last_first_ranks',
        'med_diff_final_first',
        'med_diff_final_last',
    ],
).set_index('month_min')

In [199]:
# Display the median rank differences per months-of-data threshold
first_last_ranks_diffs

Unnamed: 0_level_0,count_devs,med_diff_last_first_ranks,med_diff_final_first,med_diff_final_last
month_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,85,0.0,40.0,37.0
1,33,1.0,12.0,12.0
2,24,1.25,10.0,9.5
3,17,1.0,6.5,6.0
4,8,1.75,2.5,1.5
5,5,0.0,0.0,1.0


In [201]:
# For each cutoff 3..13, look at developers whose final_rank is at most the
# cutoff and record: how many there are, their mean num_months_in_data, and
# the median difference between last and first month rank, final and first
# month rank, final and last month rank.
avg_counts2 = []

for min_rank in range(3, 14):
    mask = month_rank_only['final_rank'] <= min_rank
    diff = np.median(
        month_rank_only.loc[mask, 'last_month_rank'] - month_rank_only.loc[mask, 'first_month_rank'])
    diff_f = np.median(
        month_rank_only.loc[mask, 'final_rank'] - month_rank_only.loc[mask, 'first_month_rank'])
    diff_l = np.median(
        month_rank_only.loc[mask, 'final_rank'] - month_rank_only.loc[mask, 'last_month_rank'])
    avg_months = np.mean(month_rank_only.loc[mask, 'num_months_in_data'])
    count = len(month_rank_only[mask])
    avg_counts2.append((min_rank, count, avg_months, diff, diff_f, diff_l))

In [202]:
# Tabulate the per-cutoff tuples collected in avg_counts2, indexed by cutoff
first_last_ranks_top_devs = pd.DataFrame(
    data=avg_counts2,
    columns=[
        'top_x_devs',
        'count_devs',
        'avg_months_data',
        'med_diff_last_first_ranks',
        'med_diff_final_first',
        'med_diff_final_last',
    ],
).set_index('top_x_devs')

In [203]:
# Display the rank-difference summary per final-rank cutoff
first_last_ranks_top_devs

Unnamed: 0_level_0,count_devs,avg_months_data,med_diff_last_first_ranks,med_diff_final_first,med_diff_final_last
top_x_devs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,3,19.666667,1.5,-2.0,-5.0
4,4,16.0,2.25,-1.0,-2.5
5,5,15.0,1.5,0.0,0.0
6,6,13.166667,1.25,1.0,0.5
7,6,13.166667,1.25,1.0,0.5
8,8,10.875,1.25,2.25,1.5
9,9,9.888889,1.0,2.5,2.0
10,10,9.2,0.5,2.25,2.25
11,10,9.2,0.5,2.25,2.25
12,11,8.636364,0.0,2.5,2.5
