In [1]:
import numpy as np
import pandas as pd
import collections
import funcy
import os
import re
from dev import LOCAL_DB, DATA_DIR
from models import Commit, Change, Developer, Diff

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
FIGURE_DIR = os.path.join(DATA_DIR, 'figures')

In [3]:
from helpers import connect_to_db

In [4]:
session, engine = connect_to_db(LOCAL_DB)

  """)


In [5]:
# calculate number of commits per developer over time - get a dataset of filename new,
# filename old, commit id, commit date, commit author, committer



In [6]:
# Template listing the per-diff fields gathered for the analysis, each with a
# placeholder value. The original literal listed 'commit_id' twice; the repeat
# was redundant (same value) and has been dropped.
# NOTE(review): no other visible cell reads pd_dict — confirm it is still needed.
pd_dict = {
    'filename_new': [],
    'filename_old': [],
    'commit_id': None,
    'additions': 0,
    'deletions': 0,
    'is_deletion': False,
    'is_rename': False,
    'is_new': False,
    'commit_timestamp': None,
    'commit_subject': '',
    'commit_body': '',
    'author_id': None,
}

In [7]:
# calculate the number of unique filenames
# filename_tups = session.query().with_entities(
#     Diff.filename_new, Diff.filename_old, Diff.commit_id, 
#         Diff.additions, Diff.deletions, Diff.is_deletion, Diff.is_rename, 
#             Diff.is_new).all()

filename_tups = session.query(Diff).all()

In [8]:
diff_df = pd.DataFrame(list(map(lambda x: x.__dict__, filename_tups)))
diff_df.rename(columns={'id': 'diff_id'}, inplace=True)
diff_df2 = diff_df.drop_duplicates(subset=['filename_new', 'filename_old', 'commit_id'])

In [9]:
print(diff_df.shape)
print(diff_df2.shape)

(10762, 12)
(4739, 12)


In [10]:
del diff_df2['_sa_instance_state']

In [11]:
commits_df = pd.DataFrame(list(map(lambda x: x.__dict__, session.query(Commit).all())))
commits_df.rename(columns={'id': 'commit_id'}, inplace=True)

In [12]:
del commits_df['_sa_instance_state']
del commits_df['raw_text']

In [13]:
df = pd.merge(left=diff_df2, right=commits_df, on='commit_id')
print(df.shape)

(4739, 20)


In [14]:
# merge in author information
author_df = pd.DataFrame(list(map(lambda x: x.__dict__, session.query(Developer).all())))

In [15]:
del author_df['_sa_instance_state']

In [16]:
# merge in author
auth2 = author_df.rename(columns={'email': 'author_email', 'id': 'author_id'})
df2 = df.merge(right=auth2[['author_email', 'author_id']], on='author_id')
auth3 = author_df.rename(columns={'email': 'commiter_email', 'id': 'commiter_id'})
df3 = df2.merge(right=auth3[['commiter_email', 'commiter_id']], on='commiter_id')


In [17]:
del auth2, auth3

In [18]:
# Here I should create something that looks for file renames and maps old name to new name and creates a new column (stable filename)
print('Number of renames', df3.loc[df3.is_rename==True].shape)
print('Total shape ', df3.shape)


Number of renames (98, 22)
Total shape  (4739, 22)


In [19]:
# for the ones with blank files names, search for "rename from " to "rename to"
def find_renames(raw_diff_text):
    """Extract the (old, new) paths of a rename recorded in a raw diff string.

    Searches for ``rename from <path>`` followed by a ``||||`` or ``|||``
    separator and ``rename to <path>``; each path must end in a dot followed
    by 1-10 word characters.

    Returns:
        A tuple ``(old_path, new_path)``, or ``None`` when the pattern is not
        present in ``raw_diff_text``.
    """
    rename_pattern = r'rename from (.+\.\w{1,10})(\|\|\|\||\|\|\|)rename to (.+\.\w{1,10})'
    match = re.search(rename_pattern, raw_diff_text)
    if match is None:
        return None
    # The pattern defines exactly three groups; the middle one is the separator.
    old_path, _separator, new_path = match.groups()
    return old_path, new_path

In [20]:
missing_filenames = ((df3.is_rename==1) & (df3.filename_new=='') & (df3.filename_old==''))
print(missing_filenames.shape)

(4739,)


In [21]:
replacement_names = df3['raw_diff'].apply(find_renames)

In [22]:
# Fill in blank filenames for renames from the names parsed out of the raw diff.
# find_renames returns None when the pattern is absent, and indexing None would
# raise TypeError, so only rows with a parsed (old, new) pair are updated.
fillable = missing_filenames & replacement_names.notnull()
df3.loc[fillable, 'filename_old'] = replacement_names[fillable].apply(lambda x: x[0])
df3.loc[fillable, 'filename_new'] = replacement_names[fillable].apply(lambda x: x[1])
# Store the boolean flags as 0/1 integers so they can be summed in later aggregations.
df3[['is_deletion', 'is_new', 'is_rename']] = df3[['is_deletion', 'is_new', 'is_rename']].astype(int)

In [23]:
# test to see if filling in the renames worked
# df3.loc[missing_filenames, ['filename_new', 'filename_old', 'raw_diff']]

In [24]:
# create a set of tups of all filename matches (old, new)
filename_tups = df3[['timestamp', 'filename_old', 'filename_new']].sort_values('timestamp').values

In [25]:
# Replay every diff in timestamp order and build two lookups:
#   canonical_name_to_other_names_dict: canonical name -> list of all names the file has had
#   other_name_to_canonical_name_dict:  any file name   -> its canonical name
# Invariant: every value stored in other_name_to_canonical_name_dict is a key of
# canonical_name_to_other_names_dict, because both are always written together.
canonical_name_to_other_names_dict = dict()
other_name_to_canonical_name_dict = dict()

for time_val, old_name, new_name in filename_tups:
    if old_name == '/dev/null':
        # this is a file creation
        if new_name not in canonical_name_to_other_names_dict:
            canonical_name_to_other_names_dict[new_name] = [new_name]
            other_name_to_canonical_name_dict[new_name] = new_name

    elif new_name == '/dev/null':
        # this is a file deletion; the file may never have been seen before
        # because creations are not captured perfectly
        if old_name not in other_name_to_canonical_name_dict:
            canonical_name_to_other_names_dict[old_name] = [old_name]
            other_name_to_canonical_name_dict[old_name] = old_name

    elif new_name == old_name:
        # plain modification: register the name if it is new to us
        if new_name not in other_name_to_canonical_name_dict:
            canonical_name_to_other_names_dict[new_name] = [new_name]
            other_name_to_canonical_name_dict[new_name] = new_name

    elif old_name in other_name_to_canonical_name_dict:
        # rename of a known file: attach the new name to the file's canonical name.
        # The new name must map to the canonical name itself (not to old_name),
        # otherwise a chain of renames ends up pointing at a non-canonical key.
        can_name = other_name_to_canonical_name_dict[old_name]
        canonical_name_to_other_names_dict[can_name].append(new_name)
        other_name_to_canonical_name_dict[new_name] = can_name

    else:
        # rename of a file we have never seen: treat the old name as canonical
        print(old_name, new_name)
        other_name_to_canonical_name_dict[old_name] = old_name
        other_name_to_canonical_name_dict[new_name] = old_name
        # include the old name itself so it can be resolved after inversion
        canonical_name_to_other_names_dict[old_name] = [old_name, new_name]
            

tests/test-repo/blocks/1220c0fc/1220c0fc6b49543d7bf04e83d2a5a7cbe72a83e80f9c7bca1abcaa42298a57a33ff5.data tests/repo-example/blocks/1220c0fc/1220c0fc6b49543d7bf04e83d2a5a7cbe72a83e80f9c7bca1abcaa42298a57a33ff5.data
tests/repo-example/blocks/1220933b/1220933b41d37fd4508cdff45930dff56baef91c7dc345e73d049ab570abe10dfbb9.data test/go-ipfs-repo/blocks/1220933b/1220933b41d37fd4508cdff45930dff56baef91c7dc345e73d049ab570abe10dfbb9.data
test/http-api/test-bitswap.js test/http-api/ipfs-api/test-bitswap.js
test/go-ipfs-repo/blocks/1220b0cb/1220b0cba7371f11461f77081b947d837cc9a3b80e182ac123bbd427563e8b167c96.data test/go-ipfs-repo/blocks/CIQLB/CIQLBS5HG4PRCRQ7O4EBXFD5QN6MTI5YBYMCVQJDXPKCOVR6RMLHZFQ.data
test/http-api/spec/test-pubsub.js test/http-api/spec/pubsub.js
test/http-api/spec/test-config.js test/http-api/spec/config.js
test/go-ipfs-repo/blocks/CIQER/CIQERMRAAFXUAUOX3V2DCW7R77FRIVHQ3V5OIPPS3XQBX34KRPNOIRQ.data test/test-data/go-ipfs-repo/blocks/CIQER/CIQERMRAAFXUAUOX3V2DCW7R77FRIVHQ3V5OIPPS

In [26]:
# test canonical dictionary
# Sanity check: for every rename row, the new name should be listed under the
# old name in canonical_name_to_other_names_dict. Renames that are not listed
# there are looked up among the canonical entries that have more than one name.
multiple_renames = funcy.select_values(lambda x: len(x) > 1, canonical_name_to_other_names_dict)
for old_name, new_name in df3.loc[df3.is_rename==1, ['filename_old', 'filename_new']].values:
    if old_name in canonical_name_to_other_names_dict:
        if new_name not in canonical_name_to_other_names_dict[old_name]:
            # entries (canonical name -> names) whose name list contains new_name
            res = funcy.select_values(lambda x: new_name in x, multiple_renames)
            # NOTE(review): exactly one matching entry also takes the
            # 'Nothing found' branch — confirm whether `> 0` was intended.
            if len(res) > 1:
                print(res)
            else:
                print('Nothing found')
                print(old_name, new_name)
            

Nothing found
test/core-tests/test-init.js test/core/both/test-init.js
Nothing found
examples/basics/index.js /dev/null


In [27]:
# find unique filenames, then loop through all unique filenames, if part of a rename, 
# append to the dict associated with that filename. if entries in that dictionary, if a rename to one of those, associate with that

canonical_name_to_other_names_dict[
    'test/core-tests/test-init.js'] = ['test/core-tests/test-init.js', 'test/core/both/test-init.js']

In [28]:
# Invert the canonical-name dictionary so that every known file name maps to
# its canonical name. If a name is listed under several canonical names, the
# entry iterated last wins.
names_final = {
    alias: canonical
    for canonical, aliases in canonical_name_to_other_names_dict.items()
    for alias in aliases
}
            

In [29]:
# map these names onto the data set
def get_canonical_name(new_name, old_name, name_dict):
    """Resolve the canonical file name for one diff row.

    Args:
        new_name: the diff's new filename ('/dev/null' for a deletion).
        old_name: the diff's old filename ('/dev/null' for a creation).
        name_dict: mapping from any known file name to its canonical name.

    Returns:
        The canonical name looked up via the old name (preferred) or the new
        name. When neither is known, prints a 'Bad Case' line and returns
        ``new_name`` unchanged.

    Raises:
        KeyError: for a deletion or creation whose remaining name is not in
            ``name_dict``.
    """
    # Deletions and creations carry only one real name; look it up directly.
    if new_name == '/dev/null':
        return name_dict[old_name]
    if old_name == '/dev/null':
        return name_dict[new_name]

    # Otherwise prefer the old name, then fall back to the new one.
    for candidate in (old_name, new_name):
        if candidate in name_dict:
            return name_dict[candidate]

    print('Bad Case ', new_name, old_name)
    return new_name
    

In [30]:
df3['canonical_name'] = df3.apply(
    lambda x: get_canonical_name(x['filename_new'], x['filename_old'], names_final), axis=1)

### TO DO
I need to deal with the missing filenames - this is an example of a bigger issue, but only 164 at the moment
so will continue writing code to generate graphs

Also need to fix duplication issue in the database for Diffs

In [31]:
from gitlog_parser import is_raw_diff

def code_complexity(text_str):
    """Return the number of leading whitespace characters in ``text_str``."""
    without_indent = text_str.lstrip()
    return len(text_str) - len(without_indent)


def analyze_diff(text_str):
    """Measure the indentation of every added line in a raw diff string.

    The diff is split on ``||||``; if that yields a single piece it is split
    on ``|||`` instead. For each piece starting with ``+`` the leading
    whitespace after the ``+`` is counted.

    Returns:
        A list of indentation widths, one per added line (possibly empty).
    """
    segments = text_str.split('||||')
    if len(segments) == 1:
        segments = text_str.split('|||')
    return [
        code_complexity(segment[1:])
        for segment in segments
        if segment.startswith('+')
    ]

    

In [32]:
# Per-diff list of indentation widths of the added lines.
df3['code_complexity'] = df3['raw_diff'].apply(analyze_diff)


def _summarise(values, reducer):
    """Apply ``reducer`` to ``values``; an empty list is summarised as 0."""
    return reducer(values) if len(values) else 0


# Summary statistics of each diff's indentation list.
for stat_name, reducer in (
        ('max', np.max), ('min', np.min), ('mean', np.mean), ('median', np.median)):
    df3['code_complexity_{}'.format(stat_name)] = df3['code_complexity'].apply(
        _summarise, args=(reducer,))

# Net lines added by each diff.
df3['net_change'] = df3['additions'] - df3['deletions']

# Repo Level Stats

total lines of code added
total lines of code deleted
total number of files add, deleted, renamed
total number of files
total net code

In [33]:
time_stats = df3.sort_values('timestamp').groupby('timestamp').agg({
    'additions': np.sum, 'deletions': np.sum, 'net_change': np.sum,
    'filename_old': 'nunique', 'diff_id': 'count', 
    'is_rename': np.sum, 'is_deletion': np.sum, 'is_new': np.sum}).rename(columns={
        'additions': 'total_lines_added', 
        'deletions': 'total_lines_deleted',
        'net_change': 'total_lines_code',
        'filename_old': 'total_num_unique_files',
        'diff_id': 'total_num_edit_locations',
         'is_rename': 'total_num_renames',
        'is_deletion': 'total_num_deletions',
        'is_new': 'total_num_new_files'})

In [34]:
# this is a cumulative sum along time axis
cum_repo_stats = time_stats.cumsum()

### Commit level code info
calculate amount of net code
plot net code, additions, deletions in each change
calculate number of file changed in each commit 
for each commit, also want total number of change in each commit
for each commit, try to get average change size


### file level
cumulative number of changes
cumulative number of devs who work on them
code churn - number of changes relative to total changes
average code change per change to each file

### file pairs
create count of all files changed together and show which are most commonly changed together


### Developer
number of changes per developer
total lines added/deleted per developer
number of created/deleted/renamed files per dev

create pairs of devs who work together



### Outstanding To Dos
need to fix rename fail in database pipelines
need to add files to ignore
why are tests failing after that change?
code complexity count per diff


# Questions
how to aggregate code complexity?

# Commit level stats
    1) number of files changed per commit

    2) number of edit locations per commit

    3) average code complexity of each diff (note not exactly sure how to aggregate this)
    
    4) Total number of lines changed
    
    5) Committer/author same person

In [35]:
adds = df3.sort_values(['commit_id', 'timestamp']).groupby('commit_id').agg({
    'additions': np.sum, 'deletions': np.sum, 'net_change': np.mean,
     'timestamp': np.min, 'code_complexity_median': np.mean, 
    'filename_old': 'nunique', 'diff_id': 'count', 
    'is_rename': np.sum, 'is_deletion': np.sum, 'is_new': np.sum})

In [36]:
commit_stats = adds.rename(columns={
    'additions': 'total_additions', 
    'deletions': 'total_deletions', 'net_change': 'average_net_change',
    'code_complexity_median':'mean_of_code_complexity_median', 
    'filename_old': 'num_unique_files_changed', 'diff_id': 'num_edit_locations',
     'is_rename': 'num_renames', 'is_deletion': 'num_deletions',
        'is_new': 'num_new_files'})

In [37]:
devs = df3.sort_values('commit_id').drop_duplicates('commit_id')[[
    'commit_id', 'author_id', 'commiter_id', 'author_email', 'commiter_email', 'commit_body',
    'git_hash', 'repo_id', 'sha1', 'subject']]
commit_stats2 = commit_stats.merge(devs, left_index=True, right_on='commit_id', how='left')

In [38]:
# Flag commits where the author and the committer are different people
# (1 = different e-mail addresses, 0 = same)
commit_stats2['diff_dev_commit'] = [
    int(author != commiter)
    for author, commiter in zip(
        commit_stats2['author_email'], commit_stats2['commiter_email'])
]

In [39]:
commit_stats2.shape

(1313, 21)

# Developer Level Stats

    min time of engagement
    last time of engagement
    collaborators who have committed them
    people who have worked on the same file
    total lines of code committed as a fraction of total lines of code
    average change additions/deletions
    number renames
    number new files
    number deletions
    type of first engagement

In [40]:
auths = df3.sort_values(['author_id', 'timestamp']).groupby(['author_id', 'timestamp']).agg({
    'additions': np.sum, 'deletions': np.sum, 'net_change': np.mean,
    'code_complexity_median': np.mean, 
    'filename_old': 'nunique', 'diff_id': 'count', 
    'is_rename': np.sum, 'is_deletion': np.sum, 'is_new': np.sum}).rename(columns={
        'additions': 'total_additions', 
        'deletions': 'total_deletions', 'net_change': 'average_net_change',
        'code_complexity_median':'mean_of_code_complexity_median', 
        'filename_old': 'num_unique_files_changed', 'diff_id': 'num_edit_locations',
        'is_rename': 'num_renames', 'is_deletion': 'num_deletions',
        'is_new': 'num_new_files'})
auths['net_code'] = auths['total_additions'] - auths['total_deletions']
auths_cum = auths.groupby('author_id').cumsum()

In [43]:
auths2 = auths.merge(cum_repo_stats, left_index=True, right_index=True, how='left')
auths2['pct_total_additions'] = auths2['total_additions']/auths2['total_lines_added']
auths2['pct_total_deleted'] = auths2['total_deletions']/auths2['total_lines_deleted']
auths2['pct_total_lines'] = auths2['net_code']/auths2['total_lines_code']


In [44]:
auths_cum2 = auths_cum.merge(cum_repo_stats, left_index=True, right_index=True, how='left')
auths_cum2['pct_total_additions'] = auths_cum2['total_additions']/ auths_cum2['total_lines_added']
auths_cum2['pct_total_deleted'] = auths_cum2['total_deletions']/ auths_cum2['total_lines_deleted']
auths_cum2['pct_total_lines'] = auths_cum2['net_code']/ auths_cum2['total_lines_code']
auths_cum2.columns = ['cum_{}'.format(c) for c in auths_cum2.columns]

In [45]:
# First and last timestamp at which each author appears in the data.
# The aggregators are given by name so the resulting column labels are always
# 'min' and 'max'; labels derived from np.min/np.max have differed between
# library versions ('amin'/'amax' vs 'min'/'max'), which made the rename below
# silently do nothing. Sorting beforehand is unnecessary for min/max.
auth_time = df3.groupby('author_id').agg({'timestamp': ['min', 'max']})
# keep only the aggregator level of the (column, aggregator) labels
auth_time.columns = auth_time.columns.get_level_values(1)
auth_time2 = auth_time.rename(columns={
    'min': 'first_author_engagement', 'max': 'last_author_engagement'})

In [46]:
# Attach each author's first/last engagement timestamps to the per-author,
# per-timestamp statistics.
auths3 = auths2.merge(auth_time2, left_index=True, right_index=True, how='left')
# Attach the cumulative (cum_*) statistics.
# NOTE(review): this merges from auths2, not auths3, so the engagement columns
# added on the previous line are not carried into auths4 — confirm intended.
auths4 = auths2.merge(auths_cum2, left_index=True, right_index=True, how='left')

In [47]:
# collaborations stats
# for each author, I want to keep track of who commits their diffs
# who else works on the same files as them
author_comm_pairs = df3[['author_id', 'author_email', 'commiter_id', 'commiter_email', 'timestamp']].values

In [48]:
# filter for pairs that are different
collabs = list(filter(lambda x: x[0]!=x[2], author_comm_pairs))

In [49]:
# for each author, find number of commits
collab_df = pd.DataFrame(
    collabs, columns=['author_id', 'author_email', 'commiter_id', 'commiter_email', 'timestamp']).drop_duplicates()


In [50]:
collab2 = collab_df[['author_id', 'commiter_id']].sort_values('author_id').drop_duplicates().values
d = collections.defaultdict(list)
c = collections.defaultdict(list)
for auth_id, collab in collab2:
    d[auth_id].append(collab)
    c[collab].append(auth_id)
    

In [51]:
auths5 = auths4.merge(
    collab_df.groupby('author_id').agg(
        {'commiter_id': 'nunique'}).rename(columns={'commiter_id': 'num_different_commiters'}),
    left_index=True, right_index=True, how='left').fillna({'num_different_commiters': 0})

In [52]:
auths5.num_different_commiters.min()

0.0

In [53]:
# Look at commit behavior: per committer and timestamp, summarise the diffs committed.
comms_grouped = df3.sort_values(['commiter_id', 'timestamp']).groupby(['commiter_id', 'timestamp'])
comms = comms_grouped.agg({
    'code_complexity_median': np.mean,
    'filename_old': 'nunique', 'diff_id': 'count',
    'is_rename': np.sum, 'is_deletion': np.sum, 'is_new': np.sum}).rename(columns={
        'code_complexity_median': 'mean_of_code_complexity_median',
        'filename_old': 'num_unique_files_changed', 'diff_id': 'num_edit_locations',
        'is_rename': 'num_renames', 'is_deletion': 'num_deletions',
        'is_new': 'num_new_files'})
# Net lines of code per (committer, timestamp). This used to be copied from the
# author-level frame `auths`, whose index is keyed by author_id rather than
# commiter_id, so values were misaligned; compute it from the committer groups.
comms['net_code'] = comms_grouped['net_change'].sum()
# running totals per committer over time
comms_cum = comms.groupby('commiter_id').cumsum()

In [54]:
# now look at all developers involved, plot number of authored comms, diffed comms,
# other collaborators, min auth, max auth, min comm, max comm
#
# Per committer: first and last timestamp and number of distinct commits.
# Aggregators are given by name so the column labels are always 'min'/'max';
# labels derived from np.min/np.max have differed between library versions
# ('amin'/'amax' vs 'min'/'max'), which made the rename below silently do nothing.
comm_time = df3.groupby('commiter_id').agg({
    'timestamp': ['min', 'max'],
    'commit_id': 'nunique'})
# keep only the aggregator level of the (column, aggregator) labels
comm_time.columns = comm_time.columns.get_level_values(1)
comm_time2 = comm_time.rename(columns={
    'min': 'first_commiter_engagement', 'max': 'last_commiter_engagement',
    'nunique': 'num_commiter_commits'})

In [55]:
# Per author: first and last timestamp and number of distinct commits.
# This re-assigns auth_time / auth_time2 from an earlier cell, adding the
# num_authored_commits column.
# Aggregators are given by name so the column labels are always 'min'/'max';
# labels derived from np.min/np.max have differed between library versions
# ('amin'/'amax' vs 'min'/'max'), which made the rename below silently do nothing.
auth_time = df3.groupby('author_id').agg({
    'timestamp': ['min', 'max'],
    'commit_id': 'nunique'})
# keep only the aggregator level of the (column, aggregator) labels
auth_time.columns = auth_time.columns.get_level_values(1)
auth_time2 = auth_time.rename(columns={
    'min': 'first_author_engagement', 'max': 'last_author_engagement',
    'nunique': 'num_authored_commits'})

In [56]:
# merge commiter and author time info
dev = auth_time2.merge(comm_time2, left_index=True, right_index=True, how='outer')

In [57]:
dev['people_who_committer_their_commits'] = dev.index.map(lambda x: d[x])

In [58]:
dev['people_who_authored_commits_they_commited'] = dev.index.map(lambda x: c[x])

In [100]:
def define_first_engagement(x):
    """Classify how a developer first engaged: as 'author' or as 'commiter'.

    Args:
        x: a row (or mapping) with 'first_author_engagement' and
            'first_commiter_engagement' entries; either may be null.

    Returns:
        'commiter' when the developer never authored, 'author' when the
        developer never committed; otherwise whichever role came first
        ('author' on a tie).
    """
    first_authored = x['first_author_engagement']
    first_committed = x['first_commiter_engagement']

    if pd.isnull(first_authored):
        return 'commiter'
    # The original repeated the author check here, so developers who never
    # committed fell through to a comparison against a null value and were
    # labelled 'commiter'.
    if pd.isnull(first_committed):
        return 'author'
    return 'author' if first_authored <= first_committed else 'commiter'

In [101]:
pd.isnull(dev.loc[16, 'first_author_engagement'])

False

In [102]:
dev['type_first_engagement'] = dev.apply(define_first_engagement, axis=1)

# File Level Stats

 1) Rate of changes
 
 2) Number of developers who have worked on the file (over time)
 
 3) average size of code per change
 
 4) Other files changed with the file
 
 5) Person who has contributed most to the file
 
 6) Number of file renames
    
 7) when it appeared
 
 8) original author
 
 9) files it is usually changed with


In [113]:
# First and last timestamp at which each canonical file name appears.
# Aggregators are given by name so the column labels are always 'min'/'max';
# labels derived from np.min/np.max have differed between library versions
# ('amin'/'amax' vs 'min'/'max'), which made the rename below silently do nothing.
file_time = df3.groupby('canonical_name').agg({'timestamp': ['min', 'max']})
# keep only the aggregator level of the (column, aggregator) labels
file_time.columns = file_time.columns.get_level_values(1)
file_time2 = file_time.rename(columns={
    'min': 'first_appearance', 'max': 'last_appearance'})

In [119]:
files = df3.sort_values(['canonical_name', 'timestamp']).groupby(
    ['canonical_name', 'timestamp']).agg({
        'additions': np.sum, 'deletions': np.sum, 'net_change': np.sum,
        'diff_id': 'nunique',
        'is_rename': np.sum, 
        'is_deletion': lambda x: any(x)}).rename(columns={
        'additions': 'total_additions', 
        'deletions': 'total_deletions', 
        'net_change': 'total_net_change',
        'is_rename': 'num_renames',
        'is_deletion': 'is_deleted',
        'diff_id': 'num_changes'})

In [114]:
# get first author, first commit, first commiter


In [None]:
# to get dev who contributed most, groupby filename, dev
# then you can also count distinct developers

In [115]:
# get 

In [120]:
files

Unnamed: 0_level_0,Unnamed: 1_level_0,total_additions,total_deletions,total_net_change,num_changes,num_renames,is_deleted
canonical_name,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,2014-05-29 06:27:26,0.0,0.0,0.0,1,0,False
,2014-05-30 11:45:54,0.0,0.0,0.0,1,0,False
,2014-06-06 09:00:43,0.0,0.0,0.0,1,0,True
,2014-06-14 08:51:15,0.0,0.0,0.0,1,0,False
,2014-06-15 21:31:07,0.0,0.0,0.0,1,0,False
,2014-06-17 06:02:24,0.0,0.0,0.0,1,0,False
,2014-07-10 15:17:25,0.0,0.0,0.0,1,0,False
,2014-08-16 18:26:16,0.0,0.0,0.0,1,0,False
,2014-08-16 20:26:42,0.0,0.0,0.0,1,0,False
,2015-07-05 19:11:42,0.0,0.0,0.0,1,0,True


In [122]:
cum_files = files.groupby('canonical_name').cumsum().rename(columns={'total_additions': 'cum_total_additions',
                                                        'total_deletions': 'cum_total_deletions',
                                                        'num_changes': 'cum_num_changes',
                                                        'num_renames': 'cum_num_renames',
                                                        'is_deleted': 'cum_is_deleted'})


In [123]:
cum_files

Unnamed: 0_level_0,Unnamed: 1_level_0,cum_total_additions,cum_total_deletions,total_net_change,cum_num_changes,cum_num_renames,cum_is_deleted
canonical_name,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,2014-05-29 06:27:26,0.0,0.0,0.0,1,0,0.0
,2014-05-30 11:45:54,0.0,0.0,0.0,2,0,0.0
,2014-06-06 09:00:43,0.0,0.0,0.0,3,0,1.0
,2014-06-14 08:51:15,0.0,0.0,0.0,4,0,1.0
,2014-06-15 21:31:07,0.0,0.0,0.0,5,0,1.0
,2014-06-17 06:02:24,0.0,0.0,0.0,6,0,1.0
,2014-07-10 15:17:25,0.0,0.0,0.0,7,0,1.0
,2014-08-16 18:26:16,0.0,0.0,0.0,8,0,1.0
,2014-08-16 20:26:42,0.0,0.0,0.0,9,0,1.0
,2015-07-05 19:11:42,0.0,0.0,0.0,10,0,2.0


In [72]:
df3.columns

Index(['additions', 'commit_id', 'deletions', 'filename_new', 'filename_old',
       'filetype', 'diff_id', 'is_deletion', 'is_new', 'is_rename', 'raw_diff',
       'author_id', 'commit_body', 'commiter_id', 'date_time', 'git_hash',
       'repo_id', 'sha1', 'subject', 'timestamp', 'author_email',
       'commiter_email', 'canonical_name', 'code_complexity',
       'code_complexity_max', 'code_complexity_min', 'code_complexity_mean',
       'code_complexity_median', 'net_change'],
      dtype='object')

In [439]:
df3.loc[df3['canonical_name']=='', 'raw_diff'][56]

'diff --git a/commands/init.js b/commands/init.js|||new file mode 100644|||index 0000000..e69de29'

Unnamed: 0,additions,commit_id,deletions,filename_new,filename_old,filetype,diff_id,is_deletion,is_new,is_rename,...,timestamp,author_email,commiter_email,canonical_name,code_complexity,code_complexity_max,code_complexity_min,code_complexity_mean,code_complexity_median,net_change
4725,,1682,,,/dev/null,,5774,False,True,False,...,2014-05-29 06:27:26,juan@benet.ai,juan@benet.ai,,"[0, 0]",0,0,0.000000,0.0,
4684,,1666,,,,,5730,False,True,False,...,2014-05-30 11:45:54,juan@benet.ai,juan@benet.ai,,[],0,0,0.000000,0.0,
4627,,1643,,,,,5651,True,False,False,...,2014-06-06 09:00:43,juan@benet.ai,juan@benet.ai,,[],0,0,0.000000,0.0,
4577,,1635,,,,,5627,False,True,False,...,2014-06-14 08:51:15,juan@benet.ai,juan@benet.ai,,[],0,0,0.000000,0.0,
4560,,1628,,,,,5602,False,True,False,...,2014-06-15 21:31:07,juan@benet.ai,juan@benet.ai,,[],0,0,0.000000,0.0,
4488,,1603,,,,,5549,False,True,False,...,2014-06-17 06:02:24,juan@benet.ai,juan@benet.ai,,[],0,0,0.000000,0.0,
4398,,1574,,,/dev/null,,5440,False,True,False,...,2014-07-10 15:17:25,juan@benet.ai,juan@benet.ai,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0.000000,0.0,
4736,,1570,,,/dev/null,,5435,False,True,False,...,2014-08-16 18:26:16,andrew@deandrade.com.br,andrew@deandrade.com.br,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0.000000,0.0,
4728,,1569,,,,,739,False,False,False,...,2014-08-16 20:26:42,juan@benet.ai,juan@benet.ai,,[],0,0,0.000000,0.0,
1508,,1561,,,,,5336,True,False,False,...,2015-07-05 19:11:42,daviddias.p@gmail.com,daviddias.p@gmail.com,,[],0,0,0.000000,0.0,


In [None]:
# Number of diffs recorded for each canonical file name. The column is called
# 'canonical_name' ('canonical_filename' does not exist in df3), and groupby
# already returns its keys sorted, so no prior sort is needed.
changes_per_file = df3.groupby('canonical_name').size()
# The unfinished `files = df3.groupby(...).agg` assignment was dropped: it bound
# the aggregation method itself to `files`, overwriting the per-file statistics
# frame built earlier in the notebook.

In [129]:
# plot number of file creations, deletions and renames
file_creations = df3[['timestamp', 'is_deletion', 'is_rename', 'is_new']].set_index(
    'timestamp').sort_index().cumsum().drop_duplicates()

In [None]:
# Plot the cumulative number of deleted, renamed and new files over time.
# DataFrame.plot() opens its own figure unless it is handed an Axes, so the
# figure is created explicitly and passed in; otherwise the 20x15 figure stays empty.
fig, ax = plt.subplots(figsize=(20, 15))
file_creations.plot(ax=ax)
ax.set_title('Cumulative File Deletions, New Files and Renames')
# make sure the output directory exists before saving
os.makedirs(FIGURE_DIR, exist_ok=True)
# the original filename literal had a stray quote, which was a syntax error
fig.savefig(os.path.join(FIGURE_DIR, 'cumulative_file_changes.png'), bbox_inches='tight')
plt.show()

In [None]:
# NOTE(review): `new_files` is first assigned in the following cell, so on a
# fresh kernel run top-to-bottom this line raises NameError — consider moving
# it below that cell.
print(new_files.shape)

In [None]:
# Build a grouping key column, groupby_filename: the old filename, except for
# rows whose old filename is '/dev/null', which take the new filename instead.
new_files = df3['filename_old'].eq('/dev/null')
df3['groupby_filename'] = df3['filename_old'].where(~new_files, df3['filename_new'])


In [None]:
# df3[pd.isnull(df3['groupby_filename'])]
print(df3[df3['groupby_filename'] == ''].shape)
print(df3.shape)


In [None]:
import matplotlib
import matplotlib.pyplot as plt

In [None]:
df3.groupby_filename.value_counts()[:30]

In [None]:
plt.figure(figsize=(20,15))
df3.groupby_filename.value_counts()[:20].plot(kind='bar')
plt.title('Number of Diffs Per File')
plt.savefig('Diffs_per_file.png', bbox_inches='tight')

In [None]:
plt.figure(figsize=(20,15))
df3.author_email.value_counts()[:20].plot(kind='bar')
plt.title('Number of Diffs Per Author')
plt.savefig('Diffs_per_author.png', bbox_inches='tight')

In [None]:
frag = df3.groupby(
    'groupby_filename').agg({'additions': np.sum, 'deletions': np.sum, 'author_id': 'nunique'})
frag2 = frag.rename(columns={
    'additions': 'total_additions', 'deletions': 'total_deletions', 'author_id':'unique_contributors'}).sort_values(
    'unique_contributors', ascending=False)

In [None]:
frag2[:30]

In [None]:
# Fragmentation plot for the 30 files with the most unique contributors:
# lines added/deleted on the left axis, number of contributors on the right axis.
top_files = frag2[:30]

fig, ax1 = plt.subplots(figsize=(20, 15))

color = 'tab:red'
# ax1.set_xlabel('filename')
ax1.set_ylabel('Lines of Code', color=color)
lns1 = ax1.plot(top_files.index, top_files.total_additions, 'r-', label='total_additions')
lns2 = ax1.plot(top_files.index, top_files.total_deletions, 'g-', label='total_deletions')
ax1.tick_params(axis='y', labelcolor=color)
plt.xticks(rotation='vertical')

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
color = 'tab:blue'
ax2.set_ylabel('number unique contributors', color=color)  # we already handled the x-label with ax1
# Label the line so it shows up properly in the combined legend below.
lns3 = ax2.plot(top_files.index, top_files.unique_contributors, color=color,
                label='unique_contributors')
ax2.tick_params(axis='y', labelcolor=color)

# One combined legend for both axes. The earlier ax2.legend() call was removed:
# it ran before anything was drawn on ax2 and so had no entries to show.
lns = lns1 + lns2 + lns3
labs = [l.get_label() for l in lns]
ax1.legend(lns, labs, loc=0)

plt.title('Fragmentation')
plt.show()
fig.savefig('frag.png', bbox_inches='tight')


In [None]:
df.head()

In [None]:
# for each filename, groupby filename
df2 = df.groupby('filename_new')

In [None]:
# Collect commit details for each filename tuple, issuing one Commit query per row.
# NOTE(review): the visible assignment of filename_tups produces rows with three
# fields (timestamp, filename_old, filename_new), which would not unpack into
# the four names used here — confirm which version of filename_tups is expected.
new_info = []
for fname_new, fname_old, commit_id, _ in filename_tups:
    commit_info = session.query(Commit).filter(Commit.id==commit_id).first()
    # NOTE(review): `a` and `b` are not defined in any visible cell; presumably
    # fname_new and fname_old were intended — verify.
    new_info.append((
        a, b, commit_info.timestamp,
            commit_info.subject, commit_info.commit_body, commit_info.author_id,
                commit_info.commiter_id, ))
    

In [None]:
session.query(Commit).filter(Commit.id==1529).first().author

In [None]:
session.query(Diff).first().__dict__


In [None]:
filename_tups

# for each of the files changed, I need to know when a change occurred, who changed it, how many insertions and deletions they made, what commit it was part of, and what the commit was
# for each of the new filenames I need to know lines of code added per day
# then, I need to show that by developer
# for code fragmentation, I need a running average of total lines of code and the number contributed by different developers
# then for code churn I need to calculate the number of changes per file
# then I want to divide devs by their first interaction with the repo
# for dependence I either need to look at who builds on one, or I need to look at who creates repos and does first commits into a project (feature)
# for each commit, number of files changed can be a proxy
# then, for each dev, I want to be able to check who are the group of people they work on code with or change the same file with
# for complexity,

## checks
# why do some files have no changes?
# what can I do to add location?
# how can i track renames or deletions
# what about the boilerplate is throwing this off?