<h1>Produce commits by day</h1>

Use rows in sorted_commits.csv to produce a count of commits for inclusive dates in the table. Each row represents a single commit.

In [5]:
project = 'spack'

In [6]:
import pandas as pd

In [7]:
sorted_commits = pd.read_csv('sorted_'+project+'_commits.csv')

In [8]:
len(sorted_commits)

10913

In [9]:
sorted_commits.head(50)

Unnamed: 0,day_name,day_of_month,doy,files,message,month,name,utc_offset,year
0,Wednesday,13,44,"[b'.gitignore', b'bin/spack']",Initial version of spack with one package:...,2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
1,Monday,18,49,"[b'4 +1,4 @@']",Require python2.7\n,2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
2,Monday,18,49,"[b',7 +19,7 @@ import spack']","Dependencies now work. Added libelf, libd...",2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
3,Tuesday,19,50,"[b',7 +73,8 @@ for var in [""LD_LIBRARY_PATH"", ...",Fixed passing of dependence prefixes to cc...,2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
4,Tuesday,19,50,"[b'28 +4,29 @@ import os']","Fixes, remove parallel build for libdwarf ...",2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
5,Tuesday,19,50,"[b',6 +10,7 @@ def get_path(name):']","rpaths for dependencies. elf, dwarf, cmak...",2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
6,Wednesday,20,51,"[b',9 +65,9 @@ class MakeExecutable(Executable...",Fixed bug in parallel make option.\n,2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
7,Wednesday,20,51,"[b'12 +1,10 @@']",Added libunwind and fixed link issues in c...,2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
8,Thursday,21,52,"[b'5,6 +315,11 @@ class Package(object):']",Better handling of stage.\n - better sy...,2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013
9,Thursday,21,52,"[b'1,6 +131,9 @@ class Package(object):']",Parallel bootstrap for cmake.\n,2,b'Todd Gamblin <tgamblin@llnl.gov>',57600,2013


In [10]:
import datetime
from dateutil import parser


## Here is starting date

In [12]:
starting_year = sorted_commits.loc[0,'year']
starting_month = sorted_commits.loc[0,'month']
starting_day = sorted_commits.loc[0,'day_of_month']
starting_obj = datetime.date(starting_year, starting_month, starting_day)
starting_obj

datetime.date(2013, 2, 13)

## Here is ending date

In [13]:
ending_year = sorted_commits.iloc[-1]['year']
ending_month = sorted_commits.iloc[-1]['month']
ending_day = sorted_commits.iloc[-1]['day_of_month']
ending_obj = datetime.date(ending_year, ending_month, ending_day)
ending_obj

datetime.date(2019, 7, 3)

We should end up with a list of this length, item for each day.

In [14]:
td = ending_obj - starting_obj
td.days

2331

## wrangling code

Goal: go through everyday between starting and ending dates. For each day count how many commits occured, 0 is possible. Produce a list of commits per day.

Actual method: loop through rows of table. Keep values needed to (a) count rows with same date, (b) count days skipped leading to a sequence of 0 entries, and (c) determine when switch years so can reset values.

In [15]:
current_day = int(sorted_commits.loc[0,'doy'])  #day of year: 1-365 (or 366 on leap years)
current_year = sorted_commits.loc[0,'year']

commits_by_day = []           #where final sequence will be kept
day_commits = 0               #count the commits for a single day
dnint = {'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5, 'Saturday':6, 'Sunday':7}

for i in range(len(sorted_commits)):

    #pull out data pieces
    year = int(sorted_commits.loc[i,'year'])
    day_of_year = int(sorted_commits.loc[i,'doy'])
    
    #check if change years, e.g., change from 2013 to 2014
    if year!=current_year:
        current_year = year
        diff = day_of_year + (365 - current_day)  #account for skipped days at end of old year
    else:
        diff = day_of_year - current_day
    
    #diff now holds number of days incremented
    
    #No diff so same day - increment commits for the day
    if diff==0:
        day_commits += 1
        continue
    
    #Now things are interesting. We need to move back in time to beginning edge of gap. If gap is size diff,
    #then move back diff days. That will give us the date before the gap begins.
    
    #First build date object - easier to do arithmetic on. This is date on ending edge of gap.
    month = int(sorted_commits.loc[i,'month'])
    day_of_month = int(sorted_commits.loc[i,'day_of_month'])
    end_gap_date = datetime.datetime(year, month, day_of_month)   #current row we are looking at
    
    begin_gap_date = end_gap_date - datetime.timedelta(days=diff) #looking back in time
    
    #First store commits that have been accumulating.
    prior_day_name = dnint[begin_gap_date.strftime('%A')]  #convert to int 1-7
    prior_month = begin_gap_date.month
    prior_day_of_month = begin_gap_date.day

    #record embedding for begin gap date
    embedding_data = [prior_day_name, prior_month, prior_day_of_month]
    the_embedding = embedding_data + [day_commits]
    commits_by_day.append(the_embedding)
    
    #Whew. Took care of recording data for the beginning data of gap. Now to fill in gap.
    
    #diff = 1 so tomorrow is here :) Just reset things since no dates skipped
    if diff == 1:
        day_commits = 1
        current_day = day_of_year
        continue
    
    #we have a gap! need to fill in with 0 embedding for each
    if diff > 1:
        date_obj = begin_gap_date
        for i in range(diff-1):
            date_obj += datetime.timedelta(days=1)  #handles month change overs
            day_name = dnint[date_obj.strftime('%A')]
            the_embedding = [day_name, date_obj.month, date_obj.day, 0]
            commits_by_day.append(the_embedding)  #fill in 0 for missing days 
        day_commits = 1  #record the new one we just saw for data at end of gap
        current_day = day_of_year  #now on new date
        continue
    
    print((i, day_of_year, year, diff))
    raise Exception  #should never get here

#check if have accumumlation before ending.
if day_commits:
    embedding_data = [prior_day_name, prior_month, prior_day_of_month]
    the_embedding = embedding_data + [day_commits]
    commits_by_day.append(the_embedding) 


    



In [16]:
len(sorted_commits)

10913

In [17]:
len(commits_by_day)

2331

In [18]:
commits_by_day[:10]  #3=wednesday

[[3, 2, 13, 1],
 [4, 2, 14, 0],
 [5, 2, 15, 0],
 [6, 2, 16, 0],
 [7, 2, 17, 0],
 [1, 2, 18, 2],
 [2, 2, 19, 3],
 [3, 2, 20, 2],
 [4, 2, 21, 6],
 [5, 2, 22, 1]]

In [19]:
just_commits = [rec[3] for rec in commits_by_day]

In [20]:
max(just_commits)  #49

49

In [None]:
n = len(just_commits)
for i in range(max(just_commits)+1):
    print((i, just_commits.count(i)/n))

In [None]:
fee_fie_foo()  #break here from Run All to see if want to save

In [21]:
import json
with open('commit_'+project+'_counts.txt', 'w') as f:
    f.write(json.dumps(commits_by_day))

#Now read the file back into a Python list object
with open('commit_'+project+'_counts.txt', 'r') as f:
    a = json.loads(f.read())
    
len(a) == len(commits_by_day)

True

In [22]:
len(a)

2331