# Commit Author Extraction
This notebook builds a unique listing of every author (and committer) who committed code, together with a guessed location for each one.

In [72]:
import pandas as pd

# Load the commit table that GitDataPrep.ipynb wrote to Commits.csv.
# The CSV's unnamed first column (presumably the index saved by that notebook) is given the name 'ID'.
df = pd.read_csv('Commits.csv').rename(columns={'Unnamed: 0': 'ID'})
df.head()

Unnamed: 0,ID,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,...,num_deletes,num_inserts,net_lines,num_files,branches,files,parents,dmm_unit_size,dmm_unit_complexity,dmm_unit_interfacing
0,0,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,0,382168,382168,868,main,".gitattributes, .gitignore, BuildToolsVersion....",,0.399491,0.611602,0.630582
1,1,76cb2cdf5cc8b6c88ca44b8969153836e589df04,Get a working build (#1)\n\n* Set missing exec...,Sandy Armstrong,sanfordarmstrong@gmail.com,2018-05-04 12:47:21-07:00,25200,Eric Erhardt,eric.erhardt@microsoft.com,2018-05-04 14:47:21-05:00,...,27,1749,1722,23,main,"Microsoft.ML.sln, build.sh, init-tools.sh, run...",f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,,,
2,2,972f6232de173b5e294a34a847682e9b1e67d3af,Fixed the syntax of cited example. (#2),Zeeshan Ahmed,38438266+zeahmed@users.noreply.github.com,2018-05-04 14:06:13-07:00,25200,Eric Erhardt,eric.erhardt@microsoft.com,2018-05-04 16:06:13-05:00,...,5,4,-1,1,main,README.md,76cb2cdf5cc8b6c88ca44b8969153836e589df04,,,
3,3,cde0d7d18ec9e93bde1d3a53c35f87430ac43fee,Add ML.NET Roadmap (#30)\n\n##Add Roadmap.md f...,Gleb K,glebk@microsoft.com,2018-05-05 01:11:31-07:00,25200,GitHub,noreply@github.com,2018-05-05 01:11:31-07:00,...,1,128,127,3,main,"Microsoft.ML.sln, README.md, ROADMAP.md",972f6232de173b5e294a34a847682e9b1e67d3af,,,
4,4,979418886950e144b2cc561bdc5eb41d382cf829,Update contribution guide and issue/PR templates,Shauheen Zahirazami,shzahira@microsoft.com,2018-05-05 13:47:44-07:00,25200,Shauheen Zahirazami,shzahira@microsoft.com,2018-05-05 13:47:44-07:00,...,0,53,53,3,main,"CONTRIBUTING.md, ISSUE_TEMPLATE.md, PULL_REQUE...",cde0d7d18ec9e93bde1d3a53c35f87430ac43fee,,,


In [73]:
# Next, keep only the columns we care about: name, e-mail and time zone for authors and committers.
# Selecting the columns to keep (instead of dropping everything else in place) means this cell can be
# re-run without raising a KeyError for columns that were already removed.
df = df[['author_name', 'author_email', 'author_tz', 'committer_name', 'committer_email', 'committer_tz']]
df.head()

Unnamed: 0,author_name,author_email,author_tz,committer_name,committer_email,committer_tz
0,dotnet-bot,dotnet-bot@microsoft.com,25200,Immo Landwerth,immol@microsoft.com,25200
1,Sandy Armstrong,sanfordarmstrong@gmail.com,25200,Eric Erhardt,eric.erhardt@microsoft.com,18000
2,Zeeshan Ahmed,38438266+zeahmed@users.noreply.github.com,25200,Eric Erhardt,eric.erhardt@microsoft.com,18000
3,Gleb K,glebk@microsoft.com,25200,GitHub,noreply@github.com,25200
4,Shauheen Zahirazami,shzahira@microsoft.com,25200,Shauheen Zahirazami,shzahira@microsoft.com,25200


In [74]:
# Build the author-only view: remove the committer columns and give the remaining ones
# generic names (name / email / timezone) so they line up with the committer table built next.
author_column_names = {'author_name': 'name', 'author_email': 'email', 'author_tz': 'timezone'}
df_authors = (
    df
    .drop(columns=['committer_name', 'committer_email', 'committer_tz'])
    .rename(columns=author_column_names)
)
df_authors.head()

Unnamed: 0,name,email,timezone
0,dotnet-bot,dotnet-bot@microsoft.com,25200
1,Sandy Armstrong,sanfordarmstrong@gmail.com,25200
2,Zeeshan Ahmed,38438266+zeahmed@users.noreply.github.com,25200
3,Gleb K,glebk@microsoft.com,25200
4,Shauheen Zahirazami,shzahira@microsoft.com,25200


In [75]:
# Build the committer-only view the same way: remove the author columns and rename
# the committer columns to the generic name / email / timezone.
committer_column_names = {'committer_name': 'name', 'committer_email': 'email', 'committer_tz': 'timezone'}
df_committers = (
    df
    .drop(columns=['author_name', 'author_email', 'author_tz'])
    .rename(columns=committer_column_names)
)
df_committers.head()

Unnamed: 0,name,email,timezone
0,Immo Landwerth,immol@microsoft.com,25200
1,Eric Erhardt,eric.erhardt@microsoft.com,18000
2,Eric Erhardt,eric.erhardt@microsoft.com,18000
3,GitHub,noreply@github.com,25200
4,Shauheen Zahirazami,shzahira@microsoft.com,25200


In [76]:
# Stack the author and committer tables on top of each other, then keep one copy of each
# distinct (name, email, timezone) row. The original row labels are kept as-is.
df_unified = pd.concat([df_authors, df_committers]).drop_duplicates()
df_unified.head()

Unnamed: 0,name,email,timezone
0,dotnet-bot,dotnet-bot@microsoft.com,25200
1,Sandy Armstrong,sanfordarmstrong@gmail.com,25200
2,Zeeshan Ahmed,38438266+zeahmed@users.noreply.github.com,25200
3,Gleb K,glebk@microsoft.com,25200
4,Shauheen Zahirazami,shzahira@microsoft.com,25200


In [77]:
# The time zone column holds an offset in seconds (not a timestamp). Convert it to hours.
# NOTE(review): in the rows displayed at the top of this notebook, 25200 appears next to dates
# ending in -07:00 and 18000 next to -05:00, so positive values look like offsets *west* of UTC.
# The sign is deliberately left untouched here; confirm before trusting the guessed locations.
hours_offset = df_unified['timezone'] / 3600
df_unified['timezone_hours'] = hours_offset

# Show every distinct offset (in hours) so we know which values need a city
distinct_hours = df_unified['timezone_hours'].sort_values().unique()
print(distinct_hours)
df_unified.head()

[-13.   -9.   -8.   -7.   -6.   -5.3  -4.   -3.   -2.   -1.    0.    3.
   4.    5.    6.    7.    8.   10. ]


Unnamed: 0,name,email,timezone,timezone_hours
0,dotnet-bot,dotnet-bot@microsoft.com,25200,7.0
1,Sandy Armstrong,sanfordarmstrong@gmail.com,25200,7.0
2,Zeeshan Ahmed,38438266+zeahmed@users.noreply.github.com,25200,7.0
3,Gleb K,glebk@microsoft.com,25200,7.0
4,Shauheen Zahirazami,shzahira@microsoft.com,25200,7.0


In [78]:
# This code is awful, but guesses a city based on Time Zone. I do this to support a requirement in the project to display data on a map. The data is likely wrong, but it will illustrate that capability

# Guessed location for each `timezone_hours` value found in the data.
# These are rough placeholders so the data can be put on a map, not real locations.
# Only United States entries carry a 'state'; other entries leave that key unset.
# -5.3 is listed because that exact value shows up in the data's distinct offsets.
TZ_HOURS_TO_LOCATION = {
    10: {'city': 'Melbourne', 'country': 'Australia'},
    8: {'city': 'Beijing', 'country': 'China'},
    7: {'city': 'Bangkok', 'country': 'Thailand'},
    # Was 'Dhakar' / 'Senegal': the UTC+6 city is Dhaka, Bangladesh (Dakar, Senegal is UTC+0)
    6: {'city': 'Dhaka', 'country': 'Bangladesh'},
    5: {'city': 'New Delhi', 'country': 'India'},
    4: {'city': 'Abu Dhabi', 'country': 'United Arab Emirates'},
    3: {'city': 'Moscow', 'country': 'Russia'},
    0: {'city': 'London', 'country': 'United Kingdom'},
    -1: {'city': 'Praia', 'country': 'Cape Verde'},
    -2: {'city': 'Nuuk', 'country': 'Greenland'},
    -3: {'city': 'São Paulo', 'country': 'Brazil'},
    -4: {'city': "St. John's", 'country': 'Canada'},
    -5: {'city': 'New York', 'state': 'NY', 'country': 'United States'},
    -5.3: {'city': 'Indianapolis', 'state': 'IN', 'country': 'United States'},
    -6: {'city': 'Chicago', 'state': 'IL', 'country': 'United States'},
    -7: {'city': 'Phoenix', 'state': 'AZ', 'country': 'United States'},
    -8: {'city': 'Redmond', 'state': 'WA', 'country': 'United States'},
    -9: {'city': 'Anchorage', 'state': 'AK', 'country': 'United States'},
    -13: {'city': 'Sydney', 'country': 'Australia'},
}


def get_city(row):
    """Return a copy of `row` with a guessed location added.

    Parameters
    ----------
    row : pandas.Series or dict-like
        Must contain a 'timezone_hours' entry and support `.copy()`.

    Returns
    -------
    Same type as `row`
        A copy of `row` with 'city' and 'country' set (plus 'state' for
        United States locations) when 'timezone_hours' matches a known
        offset. When there is no match the copy is returned unchanged.

    Raises
    ------
    KeyError
        If `row` has no 'timezone_hours' entry.
    """
    # Work on a copy: mutating the object that DataFrame.apply hands to the function is not supported by pandas
    located = row.copy()

    guess = TZ_HOURS_TO_LOCATION.get(row['timezone_hours'])
    if guess is not None:
        # Add fields in their listed order (city, then state if any, then country)
        for field, value in guess.items():
            located[field] = value

    return located

# Run get_city over every row (one call per row) to attach the made-up location for its time zone
df_unified = df_unified.apply(get_city, axis='columns')
df_unified.head()

Unnamed: 0,city,country,email,name,state,timezone,timezone_hours
0,Bangkok,Thailand,dotnet-bot@microsoft.com,dotnet-bot,,25200,7.0
1,Bangkok,Thailand,sanfordarmstrong@gmail.com,Sandy Armstrong,,25200,7.0
2,Bangkok,Thailand,38438266+zeahmed@users.noreply.github.com,Zeeshan Ahmed,,25200,7.0
3,Bangkok,Thailand,glebk@microsoft.com,Gleb K,,25200,7.0
4,Bangkok,Thailand,shzahira@microsoft.com,Shauheen Zahirazami,,25200,7.0


In [79]:
# Finally, write the people-with-locations table out as AuthorLocations.csv
output_csv = 'AuthorLocations.csv'
df_unified.to_csv(output_csv)