In [300]:
import funcy
import numpy as np
import pandas as pd
import collections
import funcy
import os
import datetime


from dev import (LOCAL_DB, DATA_DIR, DROPBOX_DIR, DEV_COLLAB_FNAME, DEV_CHANGES_FNAME, 
                         DEV_CONTR_BY_FILE_FNAME, DEV_RANKING_FNAME, RAW_FNAME)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# strptime-style layout used when parsing the `timestamp` strings of the raw export
DATETIME_FMT = '%Y-%m-%d %H:%M:%S'
# switch that gates writing the final metrics CSV to disk
EXPORT_CSV = True

In [286]:
# Read in the developer collaboration file (how many people, and who, each developer
# worked with, plus their first and last engagement), then add the total number of files
# edited, the total that only they worked on, and the totals authored and committed.

# Get the average age of the files they committed.
# Total lines of code added/deleted.
# Average code complexity/percentile relative to that file.
# Do these all as percentiles as well, and then rank by those percentiles.

In [343]:
# Load the raw diff-level export and discard the 'Unnamed: 0' column
# (presumably the index that was written out when the CSV was saved).
raw_csv_path = os.path.join(DROPBOX_DIR, RAW_FNAME)
df3 = pd.read_csv(raw_csv_path).drop('Unnamed: 0', axis=1)

In [344]:
# Parse the timestamp strings into datetime64 values.
# pd.to_datetime is vectorized and replaces the `pd.datetime` alias, which was
# deprecated in pandas 1.0 and removed in 2.0. It also tolerates the cell being
# re-run on an already-parsed column, which the row-wise strptime did not.
df3['timestamp'] = pd.to_datetime(df3['timestamp'], format=DATETIME_FMT)


In [345]:
# Inspect the column names available in the raw diff-level frame.
df3.columns

Index(['additions', 'commit_id', 'deletions', 'filename_new', 'filename_old',
       'filetype', 'diff_id', 'is_deletion', 'is_new', 'is_rename', 'raw_diff',
       'author_id', 'commit_body', 'commiter_id', 'date_time', 'git_hash',
       'repo_id', 'sha1', 'subject', 'timestamp', 'author_email',
       'commiter_email', 'canonical_name', 'code_complexity',
       'code_complexity_max', 'code_complexity_min', 'code_complexity_mean',
       'code_complexity_median', 'net_change'],
      dtype='object')

In [422]:
def get_min_max_engagement(df, type_id):
    """Summarise first/last activity per developer for one role.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column named by ``type_id`` and a ``timestamp``
        column whose values can be compared and subtracted.
    type_id : str
        Grouping column, e.g. ``'author_id'`` or ``'commiter_id'``. The text
        before the first underscore (``'author'`` / ``'commiter'``) is used
        to build the output column names.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``type_id`` with columns, in order:
        ``first_<role>_engagement``, ``last_<role>_engagement``,
        ``num_<role>_commits`` and ``len_<role>_engagement`` (last - first).
    """
    type_name = type_id.split('_')[0]
    first_col = 'first_{}_engagement'.format(type_name)
    last_col = 'last_{}_engagement'.format(type_name)
    count_col = 'num_{}_commits'.format(type_name)
    len_col = 'len_{}_engagement'.format(type_name)

    grouped = df.groupby(type_id)

    # String aggregator names always yield the labels 'min' / 'max'. Passing
    # np.min / np.max made the labels depend on the NumPy/pandas versions
    # ('amin'/'amax' vs 'min'/'max'), which silently broke the rename below.
    # No pre-sort is needed: min/max do not depend on row order.
    comm_time = grouped['timestamp'].agg(['min', 'max']).rename(
        columns={'min': first_col, 'max': last_col})

    # Rows per developer. The original counted the grouping key itself, which
    # equals the group size but relies on aggregating a column that newer
    # pandas excludes from the grouped selection.
    # NOTE(review): this counts rows of `df`; if `df` holds one row per diff
    # rather than per commit, the "num_*_commits" name overstates -- confirm.
    comm_time[count_col] = grouped.size()

    comm_time[len_col] = comm_time[last_col] - comm_time[first_col]
    return comm_time

In [423]:
def get_author_comm_collabs(df3):
    """Collect, per developer, who committed their work and whose work they committed.

    Only rows where ``author_id`` differs from ``commiter_id`` are considered.

    Parameters
    ----------
    df3 : pandas.DataFrame
        Must contain ``author_id`` and ``commiter_id`` columns.

    Returns
    -------
    pandas.DataFrame
        Index named ``author_id`` holding every id that appears on either side
        of an author/committer pair, with columns:

        * ``people_who_committer_their_commits`` - list of committer ids that
          committed this developer's authored rows
        * ``people_who_authored_commits_they_commited`` - list of author ids
          whose rows this developer committed
        * ``num_people_who_committer_their_commits`` - length of the first list
        * ``num_people_who_authored_commits_they_commited`` - length of the second
    """
    # distinct (author, committer) pairs where the two ids differ; sorting on
    # both keys makes the order inside each list deterministic
    is_cross = df3['author_id'] != df3['commiter_id']
    pairs = (df3.loc[is_cross, ['author_id', 'commiter_id']]
             .drop_duplicates()
             .sort_values(['author_id', 'commiter_id']))

    committers_of = collections.defaultdict(list)
    authors_for = collections.defaultdict(list)
    for auth_id, comm_id in zip(pairs['author_id'], pairs['commiter_id']):
        committers_of[auth_id].append(comm_id)
        authors_for[comm_id].append(auth_id)

    # ordered de-duplication (a plain set gave an arbitrary row order)
    all_ids = list(dict.fromkeys(list(committers_of) + list(authors_for)))
    dev = pd.DataFrame(index=pd.Index(all_ids, name='author_id'))

    # Build the list-valued columns as explicit object Series. Index.map with a
    # function that returns lists makes pandas try to construct an Index out
    # of list values, which is unreliable.
    dev['people_who_committer_their_commits'] = pd.Series(
        [committers_of.get(i, []) for i in all_ids], index=dev.index, dtype=object)
    dev['people_who_authored_commits_they_commited'] = pd.Series(
        [authors_for.get(i, []) for i in all_ids], index=dev.index, dtype=object)

    # every cell is a list, so len() is always defined (no fillna required)
    dev['num_people_who_committer_their_commits'] = dev[
        'people_who_committer_their_commits'].apply(len)
    dev['num_people_who_authored_commits_they_commited'] = dev[
        'people_who_authored_commits_they_commited'].apply(len)
    return dev

In [424]:
def get_dev_stats(df3):
    """Aggregate per-author change statistics and their share of repo totals.

    Parameters
    ----------
    df3 : pandas.DataFrame
        Needs the columns ``author_id``, ``additions``, ``deletions``,
        ``net_change``, ``code_complexity_median``, ``code_complexity_max``,
        ``filename_old``, ``diff_id``, ``commit_id``, ``is_rename``,
        ``is_deletion`` and ``is_new``.

    Returns
    -------
    pandas.DataFrame
        One row per ``author_id`` with summed additions/deletions, mean net
        change, mean of the median complexity, max complexity, distinct
        counts of ``filename_old`` / ``diff_id`` / ``commit_id``, summed
        rename/deletion/new flags, ``net_code``, the repo-wide totals repeated
        on every row (``repo_total_*``) and the author's fraction of those
        totals (``pct_total_*``).
    """
    # previously generated the DEV_CHANGES_FNAME

    # String aggregator names replace np.sum/np.mean/np.max: passing NumPy
    # callables to groupby.agg is deprecated in recent pandas. The former
    # pre-sort was dropped because none of these reductions depend on row order.
    auths = df3.groupby(['author_id']).agg({
        'additions': 'sum', 'deletions': 'sum', 'net_change': 'mean',
        'code_complexity_median': 'mean', 'code_complexity_max': 'max',
        'filename_old': 'nunique', 'diff_id': 'nunique', 'commit_id': 'nunique',
        'is_rename': 'sum', 'is_deletion': 'sum', 'is_new': 'sum'}).rename(columns={
            'additions': 'total_additions',
            'deletions': 'total_deletions', 'net_change': 'average_net_change',
            'code_complexity_median': 'mean_of_code_complexity_median',
            'filename_old': 'num_unique_files_changed',
            'is_rename': 'num_renames', 'is_deletion': 'num_deletions',
            'is_new': 'num_new_files', 'commit_id': 'num_commits',
            'diff_id': 'num_total_diffs'})
    auths['net_code'] = auths['total_additions'] - auths['total_deletions']

    # repo-wide totals over every row of the input, broadcast onto each author
    repo_totals = df3[['additions', 'deletions', 'is_rename', 'is_deletion', 'is_new']].sum(axis=0)
    auths['repo_total_additions'] = repo_totals['additions']
    auths['repo_total_deletions'] = repo_totals['deletions']
    auths['repo_total_renames'] = repo_totals['is_rename']
    auths['repo_total_file_deletions'] = repo_totals['is_deletion']
    auths['repo_total_file_creations'] = repo_totals['is_new']
    auths['repo_total_net_code'] = repo_totals['additions'] - repo_totals['deletions']

    # author's share of the repo totals, as fractions (not multiplied by 100);
    # a zero repo total yields inf/NaN rather than raising
    auths['pct_total_additions'] = auths['total_additions'] / auths['repo_total_additions']
    auths['pct_total_deleted'] = auths['total_deletions'] / auths['repo_total_deletions']
    auths['pct_total_lines'] = auths['net_code'] / auths['repo_total_net_code']
    return auths

    

In [425]:
def get_file_dev_stats(df3, as_of=None):
    """Build per-(file, author) contribution stats joined with per-file totals.

    Parameters
    ----------
    df3 : pandas.DataFrame
        Needs ``canonical_name``, ``author_id``, ``commiter_id``,
        ``additions``, ``deletions``, ``net_change``, ``code_complexity_max``
        and a datetime ``timestamp`` column.
    as_of : datetime-like, optional
        Reference instant for ``file_age``. Defaults to the current time,
        which is what the function always used before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``canonical_name``, ``author_id``). Holds the author's
        sums for the file, the file-level totals, and the derived columns
        ``solo_authored``, ``authored_max_complexity``, ``pct_additions``,
        ``pct_deletions`` (both on a 0-100 scale),
        ``more_than_50pct_additions``, ``more_than_50pct_deletions``,
        ``last_change`` and ``file_age`` (whole days).

    Side effects: prints the shapes of the intermediate and merged frames.
    """
    # pd.datetime was removed from pandas; take one reference instant up front
    # instead of calling now() separately for every row
    reference_time = pd.Timestamp.now() if as_of is None else pd.Timestamp(as_of)

    # file-level totals: distinct authors/committers, summed lines, max
    # complexity and the time of the most recent change
    file_dev = df3.groupby(['canonical_name']).agg({
        'author_id': 'nunique', 'commiter_id': 'nunique',
        'additions': 'sum', 'deletions': 'sum', 'net_change': 'sum',
        'code_complexity_max': 'max', 'timestamp': 'max'}).rename(columns={
            'author_id': 'distinct_authors', 'commiter_id': 'num_unique_commiters',
            'additions': 'total_additions', 'deletions': 'total_deletions',
            'net_change': 'total_code',
            'code_complexity_max': 'file_level_code_complexity_max',
            'timestamp': 'time_last_change'})

    # the same sums restricted to each author's rows on that file
    file_dev_lines = df3.groupby(['canonical_name', 'author_id']).agg({
        'additions': 'sum', 'deletions': 'sum', 'net_change': 'sum',
        'code_complexity_max': 'max', 'timestamp': 'max'}).rename(columns={
            'timestamp': 'time_author_last_change'})

    # attach the file-level totals to every (file, author) row; the join is on
    # the shared `canonical_name` index level
    file_stats2 = file_dev_lines.merge(right=file_dev, left_index=True, right_index=True, how='outer')
    print('file level stats shape ', file_dev.shape)
    print('file author level stats shape ', file_dev_lines.shape)
    print('merged file shape ', file_stats2.shape)

    # 1 when the file has exactly one distinct author
    file_stats2['solo_authored'] = (file_stats2['distinct_authors'] == 1).astype(int)

    # 1 when this author's max complexity equals the file's (positive) max
    hit_file_max = file_stats2['code_complexity_max'] == file_stats2['file_level_code_complexity_max']
    file_has_complexity = file_stats2['file_level_code_complexity_max'] > 0
    file_stats2['authored_max_complexity'] = (hit_file_max & file_has_complexity).astype(int)

    # percent of the file's additions/deletions made by this author; 0/0 -> 0
    file_stats2['pct_additions'] = file_stats2['additions'] * 100.0 / file_stats2['total_additions']
    file_stats2['pct_deletions'] = file_stats2['deletions'] * 100.0 / file_stats2['total_deletions']
    file_stats2[['pct_additions', 'pct_deletions']] = file_stats2[['pct_additions', 'pct_deletions']].fillna(0)

    # flags for strictly more than 50% of additions / deletions
    file_stats2['more_than_50pct_additions'] = (file_stats2['pct_additions'] > 50).astype(int)
    file_stats2['more_than_50pct_deletions'] = (file_stats2['pct_deletions'] > 50).astype(int)

    # 1 when the author's latest change is also the file's latest change
    file_stats2['last_change'] = (
        file_stats2['time_author_last_change'] == file_stats2['time_last_change']).astype(int)

    # whole days between the file's last change and the reference instant
    file_stats2['file_age'] = (reference_time - file_stats2['time_last_change']).dt.days
    return file_stats2


In [426]:
def get_first_engagement(x):
    """Label which role a developer's earliest engagement was in.

    ``x`` is a mapping/row exposing ``first_commiter_engagement`` and
    ``first_author_engagement``. Returns ``'comm'`` only when a committer
    date exists and either no author date exists or the committer date is
    strictly earlier; every other case (including both missing) is ``'auth'``.
    """
    committer_first = x['first_commiter_engagement']
    author_first = x['first_author_engagement']
    committer_leads = (not pd.isnull(committer_first)) and (
        pd.isnull(author_first) or committer_first < author_first)
    return 'comm' if committer_leads else 'auth'
  

def get_last_engagement(x):
    """Label which role a developer's most recent engagement was in.

    ``x`` is a mapping/row exposing ``last_commiter_engagement`` and
    ``last_author_engagement``. Returns ``'comm'`` only when a committer
    date exists and either no author date exists or the committer date is
    strictly later; every other case (including both missing) is ``'auth'``.
    """
    committer_last = x['last_commiter_engagement']
    author_last = x['last_author_engagement']
    committer_trails = (not pd.isnull(committer_last)) and (
        pd.isnull(author_last) or committer_last > author_last)
    return 'comm' if committer_trails else 'auth'


def get_time_engaged(x):
    """Whole-day span between a developer's earliest and latest engagement.

    Parameters
    ----------
    x : pandas.Series
        Row exposing ``first_commiter_engagement``, ``first_author_engagement``,
        ``last_commiter_engagement`` and ``last_author_engagement``; any of
        them may be missing (NaT/NaN).

    Returns
    -------
    pandas.Timedelta or NaT
        ``max(last_*) - min(first_*)`` over the non-missing values, truncated
        to whole days. ``NaT`` when no first or no last date is available.
    """
    first_seen = x[['first_commiter_engagement', 'first_author_engagement']].dropna()
    last_seen = x[['last_commiter_engagement', 'last_author_engagement']].dropna()

    # The original filled missing values with sentinels (now / 2000-01-01)
    # before testing for nulls, so its null check could never fire and an
    # all-missing row produced a large negative span. Missing values are
    # dropped instead and the empty case is reported explicitly.
    if first_seen.empty or last_seen.empty:
        return pd.NaT

    span = pd.Timestamp(last_seen.max()) - pd.Timestamp(first_seen.min())
    # keep whole days only, still as a timedelta so callers can read `.days`
    return pd.Timedelta(days=span.days)

In [427]:
def get_rank_files(df, as_of_date):
    """Assemble one row of metrics per developer using rows before ``as_of_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Diff-level frame with a datetime ``timestamp`` column plus the columns
        required by ``get_author_comm_collabs``, ``get_min_max_engagement``,
        ``get_dev_stats`` and ``get_file_dev_stats``.
    as_of_date : pandas.Timestamp, str or datetime-like
        Exclusive cutoff. Strings must be formatted ``YYYY-MM-DD``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``developer_id``: collaboration lists/counts, author and
        committer engagement windows, per-author change stats, per-author
        file-level aggregates, ``type_first_engagement``,
        ``type_last_engagement``, ``days_involved`` and ``email``. The three
        duration columns (``len_commiter_engagement``,
        ``len_author_engagement``, ``days_involved``) are integer day counts,
        0 when missing.

    Side effects: prints the shapes of the intermediate frames.
    """
    # pd.datetime.strptime is gone from current pandas; parse strings with the
    # same strict format and coerce other datetime-likes to Timestamp
    if isinstance(as_of_date, str):
        as_of_date = pd.to_datetime(as_of_date, format='%Y-%m-%d')
    elif not isinstance(as_of_date, pd.Timestamp):
        as_of_date = pd.Timestamp(as_of_date)

    df_short = df[df.timestamp < as_of_date]
    collab = get_author_comm_collabs(df_short)

    auth_time = get_min_max_engagement(df_short, 'author_id')
    # note that commit_id is the id of each commit, so make sure not to use that one
    comm_time = get_min_max_engagement(df_short, 'commiter_id')

    collab2 = collab.merge(right=auth_time, left_index=True, right_index=True, how='outer')
    collab3 = collab2.merge(right=comm_time, left_index=True, right_index=True, how='outer')

    auths = get_dev_stats(df_short)
    # NOTE(review): called without a reference date, get_file_dev_stats
    # measures file_age against the current clock rather than as_of_date.
    dev_file = get_file_dev_stats(df_short)

    # Per developer: number of files touched, mean distinct authors per file,
    # mean pct additions/deletions, mean file age, and counts of files that
    # are solo authored / carry their max complexity / have >50% of their
    # additions or deletions / carry their change as the latest one.
    # (String aggregator names: NumPy callables in groupby.agg are deprecated.)
    dev_file2 = dev_file.reset_index().groupby('author_id').agg({
        'canonical_name': 'nunique', 'distinct_authors': 'mean', 'pct_additions': 'mean',
        'pct_deletions': 'mean',
        'file_age': 'mean',
        'solo_authored': 'sum', 'authored_max_complexity': 'sum',
        'more_than_50pct_additions': 'sum',
        'more_than_50pct_deletions': 'sum', 'last_change': 'sum',
        }).rename(columns={
            'canonical_name': 'number_different_files_changed',
            'distinct_authors': 'avg_num_distinct_authors_per_file',
            'pct_additions': 'average_pct_additions', 'pct_deletions': 'avg_pct_deletions',
            'file_age': 'avg_file_age',
            'solo_authored': 'total_files_solo_authored',
            'authored_max_complexity': 'total_files_with_max_complexity',
            # key previously misspelled ('..._aMdditions'), so this column was
            # silently left un-renamed
            'more_than_50pct_additions': 'total_files_more_than_50pct_additions',
            'more_than_50pct_deletions': 'total_files_more_than_50pct_deletions',
            'last_change': 'total_files_with_last_change'
        })

    collab4 = collab3.merge(right=auths, left_index=True, right_index=True, how='outer')
    collab5 = collab4.merge(right=dev_file2, left_index=True, right_index=True, how='outer')
    print(collab3.shape)
    print(auths.shape)
    print(collab4.shape)
    print(dev_file2.shape)
    print(collab5.shape)
    collab5['type_first_engagement'] = collab5.apply(get_first_engagement, axis=1)
    collab5['type_last_engagement'] = collab5.apply(get_last_engagement, axis=1)
    collab5['days_involved'] = collab5.apply(get_time_engaged, axis=1)

    # id -> email lookup built from both the author and the committer columns
    unique_devs = pd.concat([
        df_short[['author_email', 'author_id']].rename(
            columns={'author_email': 'email', 'author_id': 'developer_id'}),
        df_short[['commiter_email', 'commiter_id']].rename(
            columns={'commiter_email': 'email', 'commiter_id': 'developer_id'}),
    ]).drop_duplicates().set_index('developer_id')
    # A developer id seen with several emails would otherwise duplicate that
    # developer's row in the left merge below (and get ranked more than once);
    # keep the first email encountered.
    unique_devs = unique_devs[~unique_devs.index.duplicated(keep='first')]

    c6 = collab5.merge(right=unique_devs, left_index=True, right_index=True, how='left')
    c6.index.name = 'developer_id'

    # Convert the duration columns to whole days, 0 when missing. Filling a
    # timedelta column with the integer 0 is rejected by current pandas, so
    # fill with a zero Timedelta and read the day component vectorised.
    for duration_col in ('len_commiter_engagement', 'len_author_engagement', 'days_involved'):
        c6[duration_col] = pd.to_timedelta(c6[duration_col]).fillna(pd.Timedelta(0)).dt.days

    print(c6.shape)
    return c6



In [428]:
def rank_metrics(df, cols_to_rank=None):
    """Rank every developer on each metric and on the mean of those ranks.

    When ``cols_to_rank`` is empty/None, all columns are ranked except those
    ending in ``_engagement``, starting with ``people_`` or ``repo_``, or
    named ``email``, ``name``, ``developer_id`` or ``author_id``.

    Returns a frame with one ``rank_<col>`` column per ranked metric (largest
    value gets rank 1), ``avg_rank`` (row mean of those ranks) and ``rank``
    (ascending rank of ``avg_rank``), sorted by ``rank``. Prints the list of
    ranked columns.
    """
    if not cols_to_rank:
        skipped_names = ('email', 'name', 'developer_id', 'author_id')
        cols_to_rank = []
        for col in df.columns:
            if col.endswith('_engagement'):
                continue
            if col.startswith('people_') or col.startswith('repo_'):
                continue
            if col in skipped_names:
                continue
            cols_to_rank.append(col)
    print('ranking the following columns ', cols_to_rank)

    # descending, so the developer with the largest value gets the best rank
    ranks = df[cols_to_rank].apply(pd.Series.rank, axis=0, ascending=False)
    ranks = ranks.add_prefix('rank_')

    # equal-weighted mean across the per-metric ranks of each row
    per_metric_cols = [name for name in ranks.columns if name.startswith('rank_')]
    ranks['avg_rank'] = ranks[per_metric_cols].apply(np.mean, axis=1)
    ranks['rank'] = ranks['avg_rank'].rank()
    return ranks.sort_values('rank', ascending=True)
    

In [429]:
def compute_ranks(df, as_of_date, cols_to_rank=None):
    """Compute developer metrics and their ranks for one cutoff date.

    Builds the metrics with ``get_rank_files``, ranks them with
    ``rank_metrics``, joins the two on the developer index, stamps every row
    with ``rank_time = as_of_date`` and returns the result with the index
    moved into a column, sorted by ``rank``. Prints the shape of that frame.
    """
    metrics = get_rank_files(df, as_of_date)
    ranked = metrics.merge(
        right=rank_metrics(metrics, cols_to_rank),
        left_index=True,
        right_index=True,
    )
    ranked['rank_time'] = as_of_date
    flat = ranked.reset_index()
    print(flat.shape)
    return flat.sort_values('rank')

In [430]:
# Month-end snapshot dates from June 2014 through June 2018.
# An explicit MonthEnd offset is used instead of the 'M' alias, which is
# deprecated in recent pandas; the offset object yields the same dates on
# every pandas version.
date_range = pd.date_range(start='2014-06-01', end='2018-07-01', freq=pd.offsets.MonthEnd())

In [431]:
# one ranked metrics frame per snapshot date, in chronological order
dfs = [compute_ranks(df3, snapshot_date) for snapshot_date in date_range]

file level stats shape  (103, 7)
file author level stats shape  (103, 5)
merged file shape  (103, 12)
(1, 12)
(1, 21)
(1, 33)
(1, 10)
(1, 43)
(1, 47)
ranking the following columns  ['num_people_who_committer_their_commits', 'num_people_who_authored_commits_they_commited', 'num_author_commits', 'num_commiter_commits', 'total_additions', 'total_deletions', 'average_net_change', 'mean_of_code_complexity_median', 'code_complexity_max', 'num_unique_files_changed', 'num_total_diffs', 'num_commits', 'num_renames', 'num_deletions', 'num_new_files', 'net_code', 'pct_total_additions', 'pct_total_deleted', 'pct_total_lines', 'number_different_files_changed', 'avg_num_distinct_authors_per_file', 'average_pct_additions', 'avg_pct_deletions', 'avg_file_age', 'total_files_solo_authored', 'total_files_with_max_complexity', 'more_than_50pct_additions', 'total_files_more_than_50pct_deletions', 'total_files_with_last_change', 'days_involved']
(1, 81)
file level stats shape  (115, 7)
file author level sta

(2, 12)
(2, 21)
(2, 33)
(2, 10)
(2, 43)
(2, 47)
ranking the following columns  ['num_people_who_committer_their_commits', 'num_people_who_authored_commits_they_commited', 'num_author_commits', 'num_commiter_commits', 'total_additions', 'total_deletions', 'average_net_change', 'mean_of_code_complexity_median', 'code_complexity_max', 'num_unique_files_changed', 'num_total_diffs', 'num_commits', 'num_renames', 'num_deletions', 'num_new_files', 'net_code', 'pct_total_additions', 'pct_total_deleted', 'pct_total_lines', 'number_different_files_changed', 'avg_num_distinct_authors_per_file', 'average_pct_additions', 'avg_pct_deletions', 'avg_file_age', 'total_files_solo_authored', 'total_files_with_max_complexity', 'more_than_50pct_additions', 'total_files_more_than_50pct_deletions', 'total_files_with_last_change', 'days_involved']
(2, 81)
file level stats shape  (119, 7)
file author level stats shape  (119, 5)
merged file shape  (119, 12)
(2, 12)
(2, 21)
(2, 33)
(2, 10)
(2, 43)
(2, 47)
rankin

file level stats shape  (270, 7)
file author level stats shape  (275, 5)
merged file shape  (275, 12)
(5, 12)
(5, 21)
(5, 33)
(5, 10)
(5, 43)
(5, 47)
ranking the following columns  ['num_people_who_committer_their_commits', 'num_people_who_authored_commits_they_commited', 'num_author_commits', 'num_commiter_commits', 'total_additions', 'total_deletions', 'average_net_change', 'mean_of_code_complexity_median', 'code_complexity_max', 'num_unique_files_changed', 'num_total_diffs', 'num_commits', 'num_renames', 'num_deletions', 'num_new_files', 'net_code', 'pct_total_additions', 'pct_total_deleted', 'pct_total_lines', 'number_different_files_changed', 'avg_num_distinct_authors_per_file', 'average_pct_additions', 'avg_pct_deletions', 'avg_file_age', 'total_files_solo_authored', 'total_files_with_max_complexity', 'more_than_50pct_additions', 'total_files_more_than_50pct_deletions', 'total_files_with_last_change', 'days_involved']
(5, 81)
file level stats shape  (337, 7)
file author level sta

(21, 47)
ranking the following columns  ['num_people_who_committer_their_commits', 'num_people_who_authored_commits_they_commited', 'num_author_commits', 'num_commiter_commits', 'total_additions', 'total_deletions', 'average_net_change', 'mean_of_code_complexity_median', 'code_complexity_max', 'num_unique_files_changed', 'num_total_diffs', 'num_commits', 'num_renames', 'num_deletions', 'num_new_files', 'net_code', 'pct_total_additions', 'pct_total_deleted', 'pct_total_lines', 'number_different_files_changed', 'avg_num_distinct_authors_per_file', 'average_pct_additions', 'avg_pct_deletions', 'avg_file_age', 'total_files_solo_authored', 'total_files_with_max_complexity', 'more_than_50pct_additions', 'total_files_more_than_50pct_deletions', 'total_files_with_last_change', 'days_involved']
(21, 81)
file level stats shape  (548, 7)
file author level stats shape  (871, 5)
merged file shape  (871, 12)
(24, 12)
(23, 21)
(24, 33)
(22, 10)
(24, 43)
(24, 47)
ranking the following columns  ['num_p

(42, 12)
(41, 21)
(42, 33)
(40, 10)
(42, 43)
(42, 47)
ranking the following columns  ['num_people_who_committer_their_commits', 'num_people_who_authored_commits_they_commited', 'num_author_commits', 'num_commiter_commits', 'total_additions', 'total_deletions', 'average_net_change', 'mean_of_code_complexity_median', 'code_complexity_max', 'num_unique_files_changed', 'num_total_diffs', 'num_commits', 'num_renames', 'num_deletions', 'num_new_files', 'net_code', 'pct_total_additions', 'pct_total_deleted', 'pct_total_lines', 'number_different_files_changed', 'avg_num_distinct_authors_per_file', 'average_pct_additions', 'avg_pct_deletions', 'avg_file_age', 'total_files_solo_authored', 'total_files_with_max_complexity', 'more_than_50pct_additions', 'total_files_more_than_50pct_deletions', 'total_files_with_last_change', 'days_involved']
(42, 81)
file level stats shape  (727, 7)
file author level stats shape  (1239, 5)
merged file shape  (1239, 12)
(48, 12)
(47, 21)
(48, 33)
(46, 10)
(48, 43)


file level stats shape  (874, 7)
file author level stats shape  (1691, 5)
merged file shape  (1691, 12)
(80, 12)
(79, 21)
(80, 33)
(77, 10)
(80, 43)
(80, 47)
ranking the following columns  ['num_people_who_committer_their_commits', 'num_people_who_authored_commits_they_commited', 'num_author_commits', 'num_commiter_commits', 'total_additions', 'total_deletions', 'average_net_change', 'mean_of_code_complexity_median', 'code_complexity_max', 'num_unique_files_changed', 'num_total_diffs', 'num_commits', 'num_renames', 'num_deletions', 'num_new_files', 'net_code', 'pct_total_additions', 'pct_total_deleted', 'pct_total_lines', 'number_different_files_changed', 'avg_num_distinct_authors_per_file', 'average_pct_additions', 'avg_pct_deletions', 'avg_file_age', 'total_files_solo_authored', 'total_files_with_max_complexity', 'more_than_50pct_additions', 'total_files_more_than_50pct_deletions', 'total_files_with_last_change', 'days_involved']
(80, 81)
file level stats shape  (881, 7)
file author 

In [433]:
# Stack the per-date snapshots into one long frame, ordered by developer and
# then by snapshot date.
stacked_snapshots = pd.concat(dfs, axis=0, join='outer').reset_index(drop=True)
rank_df = stacked_snapshots.sort_values(['developer_id', 'rank_time'])
# sanity check: the second shape printed should have zero rows, i.e. no row
# is missing its developer id
print(rank_df.shape)

print(rank_df.loc[rank_df.developer_id.isnull()].shape)

(1321, 81)
(0, 81)


In [434]:
# Write the combined per-date metrics to the shared folder when exporting is enabled.
if EXPORT_CSV:
    metrics_csv_path = os.path.join(DROPBOX_DIR, 'developer_metrics_over_time.csv')
    rank_df.to_csv(metrics_csv_path)

## Developer Ranking by Code Contribution

* rank column - this is a developer's overall rank by contributions to the specific repository
* total diffs - total number of diffs committed to the repo
* total_additions - total number of lines added to the code base
* total_deletions - total lines deleted from the code base
* average_net_change - average net change by lines of code in their contributions
* mean_of_code_complexity_median - average median code complexity of lines added
* average_max_code_complexity - average (per diff) of the max code complexity of lines added
* avg_num_unique_files_changed - average unique files changed per diff
* avg_num_edit_locations - average edit locations per diff. Can be interpreted as number of unique code changes per diff
* num_renames - number of files renamed
* num_deletions - number of files deleted
* num_new_files - number of files created
* avg_net_code_per_diff - average additions-deletions of code per diff
* first_author_engagement - date of first engagement as a commit author with repo
* last_author_engagement - date of last engagement as a commit author with repo
* num_author_commits - number of commits authored
* first_commiter_engagement - date of first engagement as a committer with repo
* last_commiter_engagement - date of last engagement as a committer with repo
* num_commiter_commits - number of commits committed
* people_who_committer_their_commits - list of ids of distinct people who committed commits they authored
* people_who_authored_commits_they_commited - list of ids of distinct people who authored commits they committed
* email - github account email
* name - github account name
* num_people_who_committer_their_commits - number of people who committed things they authored
* num_people_who_authored_commits_they_commited - number of people who authored commits they committed
* len_author_engagement - time in days between first and last engagement as commit author
* len_commiter_engagement - length of time in days of engagement as a commiter
* number_different_files_changed - total number of different files changed
* avg_num_distinct_authors_per_file - for files they changed, average number of developers contributing to that file
* average_pct_additions - for files they have worked on, average percent of total lines they added
* avg_pct_deletions - for files they have worked on, average percent of total lines they deleted
* avg_file_age - average days since file they worked on was last changed
* total_files_solo_authored - total number of files they are solo author of
* total_files_with_max_complexity - total number of files they contributed code equal to the max complexity of the file
* total_files_more_than_50pct_additions - total number of files where they contributed more than 50% of the code
* total_files_more_than_50pct_deletions- total number of files where they deleted more than 50% of the code
* total_files_with_last_change - total number of files where their change was last one
* type_first_engagement - first type of engagement (as author or commiter)
* type_last_engagement - last type of engagement (as author or commiter)
* days_involved - days between first/last engagement of any type
       
  
  
After computing these numerical stats, I rank each developer on each stat, take an equal-weighted average of those ranks (the "avg_rank"), and then rank developers on that average to figure out who has the best contribution over time.