In [1]:
import ast
import datetime
import os

import funcy
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from dev import (LOCAL_DB, DATA_DIR, DROPBOX_DIR, DEV_COLLAB_FNAME, DEV_CHANGES_FNAME, 
                         DEV_CONTR_BY_FILE_FNAME, DEV_RANKING_FNAME, RAW_FNAME)
%matplotlib inline

DATETIME_FMT = '%Y-%m-%d %H:%M:%S'

In [2]:
# read in the developer collab file: how many people (and who) each developer worked with, plus first and last engagement
# then add in total number of files edited, total files only they worked on, total authored and committed

# get average age of files they committed
# total lines of code added/deleted
# average code complexity/percentile relative to that file
# compute all of these as percentiles as well, and then rank developers by them

In [7]:
# Load the raw per-change data set.
# The stray second positional argument (`DEV_Co`, an undefined name that pandas would
# have treated as the separator) is removed so the cell runs on a fresh kernel.
df3 = pd.read_csv(os.path.join(DROPBOX_DIR, RAW_FNAME))
# 'Unnamed: 0' is the unnamed index column written by an earlier to_csv; drop it if present.
df3 = df3.drop(columns=['Unnamed: 0'], errors='ignore')

In [14]:
# Convert the timestamp strings to datetime64 values.
# pd.to_datetime is vectorised and replaces `pd.datetime`, which was removed in pandas 2.0.
# Re-running this cell is harmless because already-parsed values pass through unchanged.
df3['timestamp'] = pd.to_datetime(df3['timestamp'], format=DATETIME_FMT)


In [29]:
def get_file_dev_stats(df, as_of_date):
    """Build per-(file, author) contribution stats from changes made before a cutoff.

    This creates what was previously known as the FILE_STATS_FNAME file.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per change. The columns read here are ``canonical_name``,
        ``author_id``, ``commiter_id``, ``additions``, ``deletions``,
        ``net_change``, ``code_complexity_max`` and a datetime ``timestamp``.
    as_of_date : str, datetime.datetime or pandas.Timestamp
        Exclusive cutoff; only rows with ``timestamp < as_of_date`` are used.
        A ``'%Y-%m-%d'`` string (the original calling convention) still works.

    Returns
    -------
    pandas.DataFrame
        One row per (canonical_name, author_id) pair holding the author's own
        totals, the file-level totals, the 0/1 flags ``solo_authored``,
        ``authored_max_complexity``, ``more_than_50pct_additions``,
        ``more_than_50pct_deletions`` and ``last_change``, the percentage
        columns ``pct_additions`` / ``pct_deletions``, and ``file_age``
        (whole days between the file's last change and the current time).
    """
    # pd.to_datetime accepts strings as well as datetime objects, so both the
    # documented (datetime) and the actually used (string) call styles work.
    cutoff = pd.to_datetime(as_of_date)
    # subset dataframe into everything prior to the cutoff date
    history = df[df.timestamp < cutoff]

    # File-level stats: number of unique authors and committers, line totals,
    # maximum complexity and time of the most recent change per file.
    # (No pre-sort is needed: these aggregations do not depend on row order.)
    file_dev = history.groupby('canonical_name').agg({
        'author_id': 'nunique', 'commiter_id': 'nunique',
        'additions': 'sum', 'deletions': 'sum', 'net_change': 'sum',
        'code_complexity_max': 'max', 'timestamp': 'max',
    }).rename(columns={
        'author_id': 'distinct_authors', 'commiter_id': 'num_unique_commiters',
        'additions': 'total_additions', 'deletions': 'total_deletions',
        'net_change': 'total_code',
        'code_complexity_max': 'file_level_code_complexity_max',
        'timestamp': 'time_last_change',
    })

    # then, for each author on the file, count their contribution to the file
    file_dev_lines = history.groupby(['canonical_name', 'author_id']).agg({
        'additions': 'sum', 'deletions': 'sum', 'net_change': 'sum',
        'code_complexity_max': 'max', 'timestamp': 'max',
    }).rename(columns={'timestamp': 'time_author_last_change'})

    # Merge file level stats with file-author level stats; the single-level
    # index of file_dev is matched on the shared 'canonical_name' level.
    file_stats2 = file_dev_lines.merge(right=file_dev, left_index=True, right_index=True, how='outer')
    print('file level stats shape ', file_dev.shape)
    print('file author level stats shape ', file_dev_lines.shape)
    print('merged file shape ', file_stats2.shape)

    # flag files solo authored by the developer
    file_stats2['solo_authored'] = (file_stats2.distinct_authors == 1).astype(int)

    # flag files where the author wrote the file's maximum (non-zero) complexity
    wrote_max_complexity = (
        (file_stats2.code_complexity_max == file_stats2.file_level_code_complexity_max)
        & (file_stats2.file_level_code_complexity_max > 0))
    file_stats2['authored_max_complexity'] = wrote_max_complexity.astype(int)

    # percent of additions/deletions each person contributed; 0/0 gives NaN, filled with 0
    file_stats2['pct_additions'] = file_stats2['additions'] * 100.0 / file_stats2['total_additions']
    file_stats2['pct_deletions'] = file_stats2['deletions'] * 100.0 / file_stats2['total_deletions']
    file_stats2[['pct_additions', 'pct_deletions']] = file_stats2[['pct_additions', 'pct_deletions']].fillna(0)

    # flag for more than 50% additions/deletions
    file_stats2['more_than_50pct_additions'] = (file_stats2.pct_additions > 50).astype(int)
    file_stats2['more_than_50pct_deletions'] = (file_stats2.pct_deletions > 50).astype(int)

    # flag whether this author made the most recent change to the file
    file_stats2['last_change'] = (
        file_stats2.time_author_last_change == file_stats2.time_last_change).astype(int)
    # Age of the file in days since its last change. This is measured against the
    # current time (not the cutoff), so the value depends on when the notebook is run.
    file_stats2['file_age'] = (pd.Timestamp.now() - file_stats2['time_last_change']).dt.days
    return file_stats2


In [30]:
# file/author level contribution stats for every change made before 2018-07-01
dev_file = get_file_dev_stats(df=df3, as_of_date='2018-07-01')

file level stats shape  (930, 7)
file author level stats shape  (1942, 5)
merged file shape  (1942, 12)


In [None]:
# developer collaboration file, read back from the shared Dropbox folder
dev_collab = pd.read_csv(filepath_or_buffer=os.path.join(DROPBOX_DIR, DEV_COLLAB_FNAME))

In [None]:
# developer changes file, read back from the shared Dropbox folder
dev_changes = pd.read_csv(filepath_or_buffer=os.path.join(DROPBOX_DIR, DEV_CHANGES_FNAME))

In [None]:
# Roll the (file, author) stats up to one row per author:
#   totals   - solo authored files, files with max complexity, files with more than 50pct
#              additions / deletions, files where they made the last change
#   averages - pct additions, pct deletions, file age, distinct authors per file
#
# get_file_dev_stats returns canonical_name / author_id as index levels, while the
# aggregation below addresses 'canonical_name' as a column. Expose the index levels as
# columns first (skipped if a caller has already reset the index).
dev_file_flat = dev_file if 'canonical_name' in dev_file.columns else dev_file.reset_index()

# String aggregation names are used instead of numpy callables; pandas resolves both to
# the same groupby reducers, but the callable form is deprecated in recent releases.
# No pre-sort is needed because none of these aggregations depend on row order.
dev_file2 = dev_file_flat.groupby('author_id').agg({
    'canonical_name': 'nunique', 'distinct_authors': 'mean', 'pct_additions': 'mean',
    'pct_deletions': 'mean',
    'file_age': 'mean',
    'solo_authored': 'sum', 'authored_max_complexity': 'sum', 'more_than_50pct_additions': 'sum',
    'more_than_50pct_deletions': 'sum', 'last_change': 'sum',
}).rename(columns={'canonical_name': 'number_different_files_changed', 'distinct_authors': 'avg_num_distinct_authors_per_file',
                  'pct_additions': 'average_pct_additions', 'pct_deletions': 'avg_pct_deletions', 'file_age': 'avg_file_age',
                  'solo_authored': 'total_files_solo_authored', 'authored_max_complexity': 'total_files_with_max_complexity',
                  'more_than_50pct_additions': 'total_files_more_than_50pct_additions', 
                  'more_than_50pct_deletions': 'total_files_more_than_50pct_deletions', 'last_change': 'total_files_with_last_change'})

In [None]:
# Only keep developer level info (the cum_total_<> vars are for the repo at this time).
# One row per author: a count of their diffs plus sums/means of the per-diff measures.
# String aggregation names replace the numpy callables: pandas maps both to the same
# groupby reducers today, but passing the callables is deprecated and slated to change.
# The earlier sort by (author_id, timestamp) is dropped; it cannot affect these results.
dev_max = dev_changes.groupby('author_id').agg(
    {'author_id': 'count', 'total_additions': 'sum', 'total_deletions': 'sum',
       'average_net_change': 'mean', 'mean_of_code_complexity_median': 'mean', 'code_complexity_max': 'mean',
       'num_unique_files_changed': 'mean', 'num_edit_locations': 'mean', 'num_renames': 'sum',
       'num_deletions': 'sum', 'num_new_files': 'sum', 'net_code': 'mean'})

In [None]:
# give the aggregated columns names that say how they were aggregated
dev_max2 = dev_max.rename(columns=dict(
    author_id='total_diffs',
    num_unique_files_changed='avg_num_unique_files_changed',
    code_complexity_max='average_max_code_complexity',
    num_edit_locations='avg_num_edit_locations',
    net_code='avg_net_code_per_diff',
))

In [None]:
# combine per-developer change stats with collaboration stats, keeping developers found in either
dev2 = pd.merge(
    dev_max2,
    dev_collab.set_index('author_id'),
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# preview the merged frame rather than rendering every row
dev2.head()

In [None]:
def _count_listed_people(value):
    """Return how many people are listed in one `people_who_*` cell.

    dev_collab is read back from CSV, so a collection cell arrives as its text
    representation (or NaN after the outer merge). Calling len() on that would
    count characters, or raise on NaN, so the text is parsed first.
    NOTE(review): assumes the cells hold Python literals such as "[1, 2]" or
    "{1, 2}" -- confirm against the code that writes DEV_COLLAB_FNAME.
    """
    if isinstance(value, str):
        text = value.strip()
        if text == 'set()':
            # repr of an empty set; ast.literal_eval rejects it on older Pythons
            return 0
        try:
            value = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Unrecognised serialisation: keep the original len() behaviour.
            return len(value)
    if isinstance(value, (list, tuple, set, frozenset, dict)):
        return len(value)
    # Missing cell -> nobody; any other lone scalar is presumably a single id.
    return 0 if pd.isnull(value) else 1


# number of distinct collaborators on each side of the author/committer relationship
dev2['num_other_commiters'] = dev2.people_who_committer_their_commits.apply(_count_listed_people)
dev2['num_other_authors'] = dev2.people_who_authored_commits_they_commited.apply(_count_listed_people)

# Parse the engagement timestamps column by column; missing values become NaT.
# pd.to_datetime replaces `pd.datetime.strptime` (removed in pandas 2.0) and leaves
# already-parsed columns unchanged, so this cell can safely be re-run.
engagement_cols = ['first_author_engagement', 'last_author_engagement',
                   'first_commiter_engagement', 'last_commiter_engagement']
dev2[engagement_cols] = dev2[engagement_cols].apply(pd.to_datetime, format=DATETIME_FMT)

dev2['len_author_engagement'] = dev2['last_author_engagement'] - dev2['first_author_engagement']
dev2['len_commiter_engagement'] = dev2['last_commiter_engagement'] - dev2['first_commiter_engagement']

# developers present in only one of the merged inputs have no counts there; treat as 0
count_cols = ['total_diffs', 'total_additions', 'total_deletions',
              'average_net_change', 'mean_of_code_complexity_median',
              'avg_num_unique_files_changed', 'avg_num_edit_locations', 'num_renames',
              'num_deletions', 'num_new_files', 'avg_net_code_per_diff',
              'num_authored_commits', 'num_commiter_commits',
              'num_other_commiters', 'num_other_authors']
dev2[count_cols] = dev2[count_cols].fillna(0)


In [None]:
def get_first_engagement(x):
    """Say whether a developer's first engagement was as committer or author.

    `x` is a row (anything indexable by column name) holding
    'first_commiter_engagement' and 'first_author_engagement'.
    Returns 'comm' only when a committer timestamp exists and either no
    author timestamp exists or the committer one is strictly earlier;
    every other case, including ties and both missing, returns 'auth'.
    """
    committer_ts = x['first_commiter_engagement']
    author_ts = x['first_author_engagement']
    committer_first = pd.notnull(committer_ts) and (
        pd.isnull(author_ts) or committer_ts < author_ts)
    return 'comm' if committer_first else 'auth'
  

def get_last_engagement(x):
    """Say whether a developer's last engagement was as committer or author.

    `x` is a row (anything indexable by column name) holding
    'last_commiter_engagement' and 'last_author_engagement'.
    Returns 'comm' only when a committer timestamp exists and either no
    author timestamp exists or the committer one is strictly later;
    every other case, including ties and both missing, returns 'auth'.
    """
    committer_ts = x['last_commiter_engagement']
    author_ts = x['last_author_engagement']
    committer_last = pd.notnull(committer_ts) and (
        pd.isnull(author_ts) or committer_ts > author_ts)
    return 'comm' if committer_last else 'auth'


def get_time_engaged(x):
    """Return whole days between a developer's earliest and latest engagement.

    Parameters
    ----------
    x : pandas.Series
        A row holding 'first_commiter_engagement', 'first_author_engagement',
        'last_commiter_engagement' and 'last_author_engagement'. Values may be
        datetimes, parseable date strings, or missing.

    Returns
    -------
    int or float
        Number of days from the earliest non-missing "first" timestamp to the
        latest non-missing "last" timestamp, or NaN when either side has no
        timestamp at all.
    """
    # Missing values are dropped instead of being replaced with sentinel dates;
    # the sentinels (now / 2000-01-01) made the null check unreachable and gave
    # developers with no recorded engagement a large negative span.
    first_seen = pd.to_datetime(
        x[['first_commiter_engagement', 'first_author_engagement']]).dropna()
    last_seen = pd.to_datetime(
        x[['last_commiter_engagement', 'last_author_engagement']]).dropna()
    if first_seen.empty or last_seen.empty:
        return np.nan
    # Timedelta.days is the whole-day component of the span.
    return (last_seen.max() - first_seen.min()).days

In [None]:
# attach the per-author file stats; developers without file stats are kept (left join)
dev3 = pd.merge(
    dev2,
    dev_file2,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# derive the row-wise engagement summaries (each helper receives one developer's row)
dev3['type_first_engagement'] = dev3.apply(get_first_engagement, axis='columns')
dev3['type_last_engagement'] = dev3.apply(get_last_engagement, axis='columns')
dev3['days_involved'] = dev3.apply(get_time_engaged, axis='columns')

In [None]:
# sanity check: row/column counts of every frame that fed into dev3, one per line
print(*(frame.shape for frame in (dev_max2, dev_collab, dev2, dev_file2, dev3)), sep='\n')

In [None]:
# columns to rank: everything except the *_engagement columns, the people_* lists, and the identity fields
to_rank_cols = [
    col for col in dev3.columns
    if not (col.endswith('_engagement')
            or col.startswith('people_')
            or col in ('email', 'name'))
]

In [None]:
def _rank_descending(column):
    """Rank one column so that its largest value receives rank 1."""
    return column.rank(ascending=False)


# note that we use ascending=False because we want people with the largest
# values to get the best (numerically smallest) ranks
ranks = dev3[to_rank_cols].apply(_rank_descending)

In [None]:
# prefix every rank column so it can be told apart from the stat it ranks
ranks.columns = ranks.columns.map('rank_{}'.format)

In [None]:
# put the stats and their ranks side by side
dev4 = pd.merge(
    dev3,
    ranks,
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# axis=1 averages across the rank_* columns of each row (one row per developer)
dev4['avg_rank'] = dev4.filter(regex=r'^rank_').mean(axis=1)
# the overall rank orders developers by that average rank
dev4['rank'] = dev4['avg_rank'].rank()

In [None]:
# write the ranking to the shared Dropbox folder, lowest rank value first
dev4.sort_values(by='rank').to_csv(path_or_buf=os.path.join(DROPBOX_DIR, DEV_RANKING_FNAME))

In [None]:
# list the final column names of the ranking table
dev4.columns

## Developer Ranking by Code Contribution

* rank column - this is a developer's overall rank by contributions to the specific repository
* total_diffs - total number of diffs committed to the repo
* total_additions - total number of lines added to the code base
* total_deletions - total lines deleted from the code base
* average_net_change - average net change by lines of code in their contributions
* mean_of_code_complexity_median - average median code complexity of lines added
* average_max_code_complexity - average (per diff) of the max code complexity of lines added
* avg_num_unique_files_changed - average unique files changed per diff
* avg_num_edit_locations - average edit locations per diff. Can be interpreted as number of unique code changes per diff
* num_renames - number of files renamed
* num_deletions - number of files deleted
* num_new_files - number of files created
* avg_net_code_per_diff - average additions-deletions of code per diff
* first_author_engagement - date of first engagement as a commit author with repo
* last_author_engagement - date of last engagement as a commit author with repo
* num_authored_commits - number of commits authored 
* first_commiter_engagement - date of first engagement as a committer with repo
* last_commiter_engagement - date of last engagement as a committer with repo
* num_commiter_commits - number of commits committed
* people_who_committer_their_commits - list of author ids of distinct people who committed commits they authored
* people_who_authored_commits_they_commited - list of author ids of distinct people who authored commits they committed
* email - github account email
* name - github account name
* num_other_commiters - number of people who commited things they authored
* num_other_authors - number of people who authored commits they commited
* len_author_engagement - time in days between first and last engagement as commit author
* len_commiter_engagement - length of time in days of engagement as a commiter
* number_different_files_changed - total number of different files changed
* avg_num_distinct_authors_per_file - for files they changed, average number of developers contributing to that file
* average_pct_additions - for files they have worked on, average percent of total lines they added
* avg_pct_deletions - for files they have worked on, average percent of total lines they deleted
* avg_file_age - average days since file they worked on was last changed
* total_files_solo_authored - total number of files they are solo author of
* total_files_with_max_complexity - total number of files they contributed code equal to the max complexity of the file
* total_files_more_than_50pct_additions - total number of files where they contributed more than 50% of the code
* total_files_more_than_50pct_deletions - total number of files where they deleted more than 50% of the code
* total_files_with_last_change - total number of files where their change was last one
* type_first_engagement - first type of engagement (as author or commiter)
* type_last_engagement - last type of engagement (as author or commiter)
* days_involved - days between first/last engagement of any type
       
  
  
After computing these numerical stats, I rank each developer on each stat, take an equal-weighted average of those ranks (the "avg_rank" column), and then rank developers on their average rank to determine who has the best contribution over time.