<h1>Produce commits by day</h1>

Use the rows of sorted_commits.csv to produce a count of commits for every date from the first commit through the last, inclusive. Each row represents a single commit.

In [1]:
import pandas as pd

In [2]:
#Load the commit table from the CSV file into a DataFrame
sorted_commits = pd.read_csv('sorted_commits.csv')

In [3]:
#Number of rows in the table
len(sorted_commits)

2756

In [4]:
#Preview the first rows of the table.
#NOTE(review): in the recorded output the first four rows are identical apart from the index
#columns - confirm whether the CSV holds duplicated commits, since every row is counted as a commit.
sorted_commits.head()

Unnamed: 0.1,Unnamed: 0,date,files,message,person
0,0,2013-02-13 17:50:44-08:00,"['.gitignore', 'bin/spack']",Initial version of spack with one package:...,Todd Gamblin <tgamblin@llnl.gov>
1,1,2013-02-13 17:50:44-08:00,"['.gitignore', 'bin/spack']",Initial version of spack with one package:...,Todd Gamblin <tgamblin@llnl.gov>
2,2,2013-02-13 17:50:44-08:00,"['.gitignore', 'bin/spack']",Initial version of spack with one package:...,Todd Gamblin <tgamblin@llnl.gov>
3,3,2013-02-13 17:50:44-08:00,"['.gitignore', 'bin/spack']",Initial version of spack with one package:...,Todd Gamblin <tgamblin@llnl.gov>
4,4,2013-02-18 23:46:04-08:00,"[',7 +19,7 @@ import spack']","Dependencies now work. Added libelf, libd...",Todd Gamblin <tgamblin@llnl.gov>


In [5]:
import datetime
from dateutil import parser


## Here is the starting date

In [6]:

#Parse the timestamp string of the first row (row label 0) into a datetime object
str_date = sorted_commits.loc[0,'date']
dateobj = parser.parse(str_date)  #UTC offset is kept: -08 hours => -28800 seconds
dateobj

datetime.datetime(2013, 2, 13, 17, 50, 44, tzinfo=tzoffset(None, -28800))

In [7]:
#Day of the year (1-based) on which the first commit falls; Feb 13 is day 44
starting_day = dateobj.timetuple().tm_yday
starting_day

44

In [8]:
starting_year = dateobj.year  #calendar year of the first row's timestamp
starting_year

2013

## Here is the ending date

In [9]:
#Parse the timestamp of the last row; the result is only displayed, not stored
str_date = sorted_commits.loc[len(sorted_commits)-1,'date']
parser.parse(str_date)  #-05 hours => -18000 seconds


datetime.datetime(2016, 1, 1, 17, 26, 49, tzinfo=tzoffset(None, -18000))

In [10]:
#Calendar year of the last row's timestamp (the row is parsed again here)
ending_year = parser.parse(sorted_commits.loc[len(sorted_commits)-1, 'date']).year  #get ending year from last row
ending_year

2016

## wrangling code

Goal: go through every day between (2013, 2, 13) and (2016, 1, 1), inclusive. For each day, count how many commits occurred; 0 is possible. Produce a list of commits per day.

Actual method: loop through the rows of the table, keeping the values needed to (a) count rows with the same date, (b) count skipped days, which produce a sequence of 0 entries, and (c) determine when the year switches so the values can be reset.

In [11]:
#Count commits per calendar day, from the day of the first commit through the day of the
#last commit inclusive. Days without commits get an explicit 0.
#
#Days are compared as whole calendar dates, not as (year, day-of-year) pairs, so gaps that
#cross the end of a leap year or span more than one year are filled with the right number
#of zeros. Year boundaries need no special handling: date subtraction already covers them.
#Each commit is assigned to the calendar date written in its own timestamp, i.e. the date
#in the UTC offset recorded with the commit.
current_date = datetime.date(starting_year, 1, 1) + datetime.timedelta(days=starting_day - 1)  #from values computed above
commits_by_day = []           #where final sequence will be kept
day_commits = 0               #count the commits for a single day

for row in range(len(sorted_commits)):
    commit_date = parser.parse(sorted_commits.loc[row, 'date']).date()
    diff = (commit_date - current_date).days  #days elapsed since the day being counted

    #rows must be in date order; a commit dated before the current day cannot be placed
    if diff < 0:
        raise ValueError('row %d: commit date %s is earlier than %s; rows must be sorted by date'
                         % (row, commit_date, current_date))

    #a new day has started - close out the previous day and any skipped days
    if diff > 0:
        commits_by_day.append(day_commits)       #save accumulation from previous day
        commits_by_day.extend([0] * (diff - 1))  #fill in 0 for missing days
        day_commits = 0
        current_date = commit_date

    day_commits += 1  #count this commit toward its day

commits_by_day.append(day_commits)  #get the last one

    



In [12]:
#Sanity check, part 1: number of rows fed into the loop
len(sorted_commits)

2756

In [13]:
#Sanity check, part 2: the per-day counts must add up to the number of rows, since each row is counted once
sum(commits_by_day)

2756

In [14]:
#Show the per-day counts: one entry per calendar day, in date order starting at the first commit's day
commits_by_day

[4,
 0,
 0,
 0,
 0,
 8,
 12,
 8,
 24,
 4,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 8,
 0,
 0,
 12,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 20,
 0,
 0,
 12,
 4,
 0,
 4,
 0,
 16,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 12,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 12,
 0,
 0,
 0,
 8,
 0,
 0,
 0,
 4,
 0,
 8,
 4,


In [15]:
commits_by_day.count(0)  #days with no commits; in the recorded run 689 of 1053 days, i.e. roughly 65%

689

In [16]:
import json

#Persist the per-day counts to disk as a JSON array
with open('commit_counts.txt', 'w') as out_file:
    json.dump(commits_by_day, out_file)

#Now read the file back into a Python list object
with open('commit_counts.txt', 'r') as in_file:
    a = json.load(in_file)

#Round-trip check: the reloaded list has as many entries as the original
len(a) == len(commits_by_day)

True

In [17]:
#Number of entries read back from the file, one per calendar day
len(a)

1053