In [3]:
import os
import funcy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

sns.set()

from dev import DROPBOX_DIR

FIG_DIR = '/Users/Lraymond/Python/gitkit_research/data/rank_figs'

DATETIME_FMT = '%Y-%m-%d %H:%M:%S'

In [4]:
rank_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'developer_metrics_over_time.csv'))
del rank_df['Unnamed: 0']

In [5]:
# convert time to timevalue
rank_df.rank_time = rank_df.rank_time.apply(lambda x: pd.datetime.strptime(x, DATETIME_FMT).date())

In [6]:
rank_df[
    ['len_commiter_engagement', 'len_author_engagement', 'days_involved']] = rank_df[
        ['len_commiter_engagement', 'len_author_engagement', 'days_involved']].fillna(0)

In [7]:
# get date range of time series
date_range = sorted(rank_df.rank_time.unique())

In [8]:
# first, generate within variable correlation tables over time
rank_df_2017 = rank_df.loc[rank_df.rank_time.apply(lambda x: x.year) > 2016]

In [9]:
def plot_corr_heatmap(corr_df, varname):
    f, ax = plt.subplots(figsize=(11, 9))
    # generate upper triangular matrix
    mask = np.zeros_like(corr_df, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True
    # Draw the heatmap with the mask and correct aspect ratio
    pltmap = sns.heatmap(corr_df, cmap="YlGnBu", mask=mask, square=True, linewidths=.5)
    # set title on the axis
    ax.set_title('Developer {} Correlation over Time'.format(varname))
    return pltmap
    

In [10]:
def compute_variable_corr(df, varname, pdf_obj, excel_obj):
    print('working on ', varname)
    rank_pivot = pd.pivot_table(rank_df_2017, values=varname, columns='rank_time', index='developer_id')
    rank_corr = rank_pivot.corr()
    print(pd.isna(rank_corr).all().all())
    if pd.isna(rank_corr).all().all():
        # this means corr matrix total empty
        print('Unable to compute corrs for variable ', varname)
        return None
    # save corr to excel
    rank_corr.to_excel(excel_obj, sheet_name=varname[:31])
    # then generate corr heatmap and save also
    # Set up the matplotlib figure
    pltmap = plot_corr_heatmap(rank_corr, varname)
    pdf_obj.savefig(pltmap.figure)
    plt.close()

In [83]:
# compute correlation and heatmap for each variable with itself across time
# need to pass excel writer object and pdj object otherwise figs/graphs just overwritten
writer = pd.ExcelWriter(os.path.join(FIG_DIR, 'developer_within_metric_correlations_over_time.xlsx'))

with PdfPages(os.path.join(FIG_DIR, 'developer_within_metric_heatmaps_over_time.pdf')) as pdf:
    list(map(lambda x: compute_variable_corr(rank_df_2017, x, pdf, writer), rank_vars))
# must save to flush to file
writer.save()

working on  average_net_change
False
working on  average_pct_additions
False
working on  avg_file_age
False
working on  avg_num_distinct_authors_per_file
False
working on  avg_pct_deletions
False
working on  code_complexity_max
False
working on  days_involved
False
working on  len_author_engagement
False
working on  len_commiter_engagement
False
working on  mean_of_code_complexity_median
False
working on  more_than_50pct_additions
False
working on  net_code
False
working on  num_author_commits
False
working on  num_commiter_commits
False
working on  num_commits
False
working on  num_deletions
False
working on  num_new_files
False
working on  num_people_who_authored_commits_they_commited
False
working on  num_people_who_committer_their_commits
False
working on  num_renames
False
working on  num_total_diffs
False
working on  num_unique_files_changed
False
working on  number_different_files_changed
False
working on  pct_total_additions
False
working on  pct_total_deleted
False
working on 

In [11]:
# now, correlate each variable with other variables
raw_corr_vars = [
    'average_net_change',
 'average_pct_additions',
 'avg_file_age',
 'avg_num_distinct_authors_per_file',
 'avg_pct_deletions',

 'code_complexity_max',
 'days_involved',
 
 'len_author_engagement',
 'len_commiter_engagement',
 'mean_of_code_complexity_median',
 'more_than_50pct_additions',
 'net_code',
 'num_author_commits',
 'num_commiter_commits',
 'num_commits',
 'num_deletions',
 'num_new_files',
 'num_people_who_authored_commits_they_commited',
 'num_people_who_committer_their_commits',
 'num_renames',
 'num_total_diffs',
 'num_unique_files_changed',
 'number_different_files_changed',
 'pct_total_additions',
 'pct_total_deleted',
 'pct_total_lines', 
 'total_additions',
 'total_deletions',
 'total_files_more_than_50pct_deletions',
 'total_files_solo_authored',
 'total_files_with_last_change',
 'total_files_with_max_complexity'
]
 

In [12]:
rank_vars = [
    'average_net_change',
 'average_pct_additions',
 'avg_file_age',
 'avg_num_distinct_authors_per_file',
 'avg_pct_deletions',

 'code_complexity_max',
 'days_involved',
 
 'len_author_engagement',
 'len_commiter_engagement',
 'mean_of_code_complexity_median',
 'more_than_50pct_additions',
 'net_code',
 'num_author_commits',
 'num_commiter_commits',
 'num_commits',
 'num_deletions',
 'num_new_files',
 'num_people_who_authored_commits_they_commited',
 'num_people_who_committer_their_commits',
 'num_renames',
 'num_total_diffs',
 'num_unique_files_changed',
 'number_different_files_changed',
 'pct_total_additions',
 'pct_total_deleted',
 'pct_total_lines',
 
 'rank',
 'rank_average_net_change',
 'rank_average_pct_additions',
 'rank_avg_file_age',
 'rank_avg_num_distinct_authors_per_file',
 'rank_avg_pct_deletions',
 'rank_code_complexity_max',
 'rank_days_involved',
 'rank_mean_of_code_complexity_median',
 'rank_more_than_50pct_additions',
 'rank_net_code',
 'rank_num_author_commits',
 'rank_num_commiter_commits',
 'rank_num_commits',
 'rank_num_deletions',
 'rank_num_new_files',
 'rank_num_people_who_authored_commits_they_commited',
 'rank_num_people_who_committer_their_commits',
 'rank_num_renames',
 'rank_num_total_diffs',
 'rank_num_unique_files_changed',
 'rank_number_different_files_changed',
 'rank_pct_total_additions',
 'rank_pct_total_deleted',
 'rank_pct_total_lines',
 
 'rank_total_additions',
 'rank_total_deletions',
 'rank_total_files_more_than_50pct_deletions',
 'rank_total_files_solo_authored',
 'rank_total_files_with_last_change',
 'rank_total_files_with_max_complexity',
    
 'total_additions',
 'total_deletions',
 'total_files_more_than_50pct_deletions',
 'total_files_solo_authored',
 'total_files_with_last_change',
 'total_files_with_max_complexity'
]

In [13]:
rank_corr_vars = [
 'rank',
 'rank_average_net_change',
 'rank_average_pct_additions',
 'rank_avg_file_age',
 'rank_avg_num_distinct_authors_per_file',
 'rank_avg_pct_deletions',
 'rank_code_complexity_max',
 'rank_days_involved',
 'rank_mean_of_code_complexity_median',
 'rank_more_than_50pct_additions',
 'rank_net_code',
 'rank_num_author_commits',
 'rank_num_commiter_commits',
 'rank_num_commits',
 'rank_num_deletions',
 'rank_num_new_files',
 'rank_num_people_who_authored_commits_they_commited',
 'rank_num_people_who_committer_their_commits',
 'rank_num_renames',
 'rank_num_total_diffs',
 'rank_num_unique_files_changed',
 'rank_number_different_files_changed',
 'rank_pct_total_additions',
 'rank_pct_total_deleted',
 'rank_pct_total_lines']

In [86]:
writer = pd.ExcelWriter(os.path.join(FIG_DIR, 'developer_between_metric_correlations.xlsx'))

with PdfPages(os.path.join(FIG_DIR, 'developer_between_metric_heatmaps.pdf')) as pdf_obj:
    for group_vars, str_name in [(raw_corr_vars, 'raw metrics'), (rank_corr_vars, 'rank metrics'), (list(rank_corr_vars+raw_corr_vars), 'all metrics')]:
        rank_corr = rank_df_2017[group_vars].corr()
        rank_corr.to_excel(writer, sheet_name=str_name)
        # then generate corr heatmap and save also
        # Set up the matplotlib figure
        pltmap = plot_corr_heatmap(rank_corr, str_name)
        pdf_obj.savefig(pltmap.figure)
        plt.close()

# must save to flush to file
writer.save()

In [None]:
# last thing to do is plot skewness of variables (ie, plot raw values in a histogram for each of the metrics)

In [19]:
rank_df.rank_time.min()

datetime.date(2014, 6, 30)

In [20]:
rank_df.groupby('rank_time').agg({'developer_id': 'nunique'})

Unnamed: 0_level_0,developer_id
rank_time,Unnamed: 1_level_1
2014-06-30,1
2014-07-31,1
2014-08-31,2
2014-09-30,2
2014-10-31,2
2014-11-30,2
2014-12-31,2
2015-01-31,2
2015-02-28,2
2015-03-31,2


In [15]:
def graph_histogram(df, varname):
    mask = ((~pd.isnull(df[varname])) & (df.rank_time==df.rank_time.max()))
    distplot = sns.distplot(df.loc[mask, varname])
    distplot.set_title('{} Density'.format(varname))
    return distplot
    

In [16]:
def compute_hist(df, varname, pdf_obj):
    pltmap = graph_histogram(df, varname)
    pdf_obj.savefig(pltmap.figure)
    plt.close()

In [22]:
with PdfPages(os.path.join(FIG_DIR, 'developer_metric_density_plots.pdf')) as pdf:
    list(map(lambda x: compute_hist(rank_df, x, pdf), raw_corr_vars))


In [None]:
# Now need to automate correlation between ranks over time between 
# same measure, different times
# different measure, same times

# write code that runs automation process of generating stats for different points in time, output rankings
# then compute correlation within rankings over time (both raw numbers and ranks), then between overall rankings across time


# then look at distribution values of the histogram