# Commit Author Extraction
This notebook exists to get a unique listing of every author that committed code

In [3]:
import pandas as pd

# We'll start with the Commits.csv that was created in GitDataPrep.ipynb
df = pd.read_csv('Commits.csv')
df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)
df.head()

Unnamed: 0,ID,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,...,is_merge,num_deletes,num_inserts,net_lines,num_files,branches,files,dmm_unit_size,dmm_unit_complexity,dmm_unit_interfacing
0,0,7894423f9bac837f4c5fb2c9a0f4284da38f2069,Initial commit,Rich Lander,rlander@microsoft.com,2017-09-21 16:11:36-07:00,25200,GitHub,noreply@github.com,2017-09-21 16:11:36-07:00,...,False,0,21,21,1,main,LICENSE,,,
1,1,42dd1a3280da0bf901058cd7812faa1355eaae29,Create README.md,Piotr Puszkiewicz,piotrp@microsoft.com,2017-09-21 16:22:28-07:00,25200,GitHub,noreply@github.com,2017-09-21 16:22:28-07:00,...,False,0,2,2,1,main,README.md,,,
2,2,25139110fc53537334c2f2a745246b4fcf8203fb,Updated the readme,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,2017-09-22 18:47:48-07:00,25200,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,2017-09-22 18:47:48-07:00,...,False,1,10,9,1,main,README.md,,,
3,3,3a88efed0961f689e692eb3d52b3d9d3ddca903b,Update README.md,LadyNaggaga,maria.naggaga@live.ca,2017-09-22 18:50:32-07:00,25200,GitHub,noreply@github.com,2017-09-22 18:50:32-07:00,...,False,2,15,13,1,main,README.md,,,
4,4,0278d89a6150858193cee8e6d1ac0ce159ac4ad0,Update README.md,LadyNaggaga,maria.naggaga@live.ca,2017-09-22 18:51:31-07:00,25200,GitHub,noreply@github.com,2017-09-22 18:51:31-07:00,...,False,1,1,0,1,main,README.md,,,


In [4]:
# Next, let's drop columns we don't care about, which is most of them
df.drop(columns=['ID','hash','message','author_date', 'committer_date', 'in_main', 'is_merge', 'num_deletes', 'num_inserts', 'net_lines', 'branches', 'files', 'num_files','dmm_unit_size', 'dmm_unit_complexity', 'dmm_unit_interfacing'], inplace=True)
df.head()

Unnamed: 0,author_name,author_email,author_tz,committer_name,committer_email,committer_tz
0,Rich Lander,rlander@microsoft.com,25200,GitHub,noreply@github.com,25200
1,Piotr Puszkiewicz,piotrp@microsoft.com,25200,GitHub,noreply@github.com,25200
2,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,25200,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,25200
3,LadyNaggaga,maria.naggaga@live.ca,25200,GitHub,noreply@github.com,25200
4,LadyNaggaga,maria.naggaga@live.ca,25200,GitHub,noreply@github.com,25200


In [5]:
# Now let's create a set of names, E-Mails, and Time Zones, starting with just the authors
df_authors = df.drop(columns=['committer_name', 'committer_email', 'committer_tz'])
df_authors.rename(columns={'author_name': 'name', 'author_email': 'email', 'author_tz': 'timezone'}, inplace=True)
df_authors.head()

Unnamed: 0,name,email,timezone
0,Rich Lander,rlander@microsoft.com,25200
1,Piotr Puszkiewicz,piotrp@microsoft.com,25200
2,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,25200
3,LadyNaggaga,maria.naggaga@live.ca,25200
4,LadyNaggaga,maria.naggaga@live.ca,25200


In [6]:
# Now lets build a different dataset of just committers
df_committers = df.drop(columns=['author_name', 'author_email', 'author_tz'])
df_committers.rename(columns={'committer_name': 'name', 'committer_email': 'email', 'committer_tz': 'timezone'}, inplace=True)
df_committers.head()

Unnamed: 0,name,email,timezone
0,GitHub,noreply@github.com,25200
1,GitHub,noreply@github.com,25200
2,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,25200
3,GitHub,noreply@github.com,25200
4,GitHub,noreply@github.com,25200


In [7]:
# Now lets merge those two together and drop duplicated rows
df_unified = pd.concat([df_authors, df_committers])
df_unified.drop_duplicates(inplace=True)
df_unified.head()

Unnamed: 0,name,email,timezone
0,Rich Lander,rlander@microsoft.com,25200
1,Piotr Puszkiewicz,piotrp@microsoft.com,25200
2,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,25200
3,LadyNaggaga,maria.naggaga@live.ca,25200
6,LadyNaggaga,maria.naggaga@live.ca,14400


In [8]:
# Time Zones in PyDriller are stored as seconds from epoch. Let's convert that to hours
df_unified['timezone_hours'] = df_unified['timezone'] / (60 * 60)

print(df_unified['timezone_hours'].sort_values().unique())
df_unified.head()

[-13.  -11.  -10.3  -9.   -8.   -6.   -3.3  -3.   -2.   -1.    0.    4.
   5.    6.    7.    8. ]


Unnamed: 0,name,email,timezone,timezone_hours
0,Rich Lander,rlander@microsoft.com,25200,7.0
1,Piotr Puszkiewicz,piotrp@microsoft.com,25200,7.0
2,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,25200,7.0
3,LadyNaggaga,maria.naggaga@live.ca,25200,7.0
6,LadyNaggaga,maria.naggaga@live.ca,14400,4.0


In [9]:
# This code is awful, but guesses a city based on Time Zone. I do this to support a requirement in the project to display data on a map. The data is likely wrong, but it will illustrate that capability

def get_city(row):
    tz = row['timezone_hours']
    if (tz == 10):
        row['city'] = 'Melbourne'
        row['country'] = 'Australia'
    if (tz == 8):
        row['city'] = 'Beijing'
        row['country'] = 'China'
    if (tz == 7):
        row['city'] = 'Bangkok'
        row['country'] = 'Thailand'
    if (tz == 6):
        row['city'] = 'Dhakar'
        row['country'] = 'Senegal'
    if (tz == 5):
        row['city'] = 'New Delhi'
        row['country'] = 'India'
    if (tz == 4):
        row['city'] = 'Abu Dhabi'
        row['country'] = 'United Arab Emirates'
    if (tz == 3):
        row['city'] = 'Moscow'
        row['country'] = 'Russia'
    if (tz == 0):
        row['city'] = 'London'
        row['country'] = 'United Kingdom'
    if (tz == -1):
        row['city'] = 'Praia'
        row['country'] = 'Cape Verde'
    if (tz == -2):
        row['city'] = 'Nuuk'
        row['country'] = 'Greenland'
    if (tz == -3):
        row['city'] = 'São Paulo'
        row['country'] = 'Brazil'
    if (tz == -4):
        row['city'] = 'St. John\'s'
        row['country'] = 'Canada'
    if (tz == -5):
        row['city'] = 'New York'
        row['state'] = 'NY'
        row['country'] = 'United States'
    if (tz == -5.3):
        row['city'] = 'Indianapolis'
        row['state'] = 'IN'
        row['country'] = 'United States'
    if (tz == -6):
        row['city'] = 'Chicago'
        row['state'] = 'IL'
        row['country'] = 'United States'
    if (tz == -7):
        row['city'] = 'Phoenix'
        row['state'] = 'AZ'
        row['country'] = 'United States'
    if (tz == -8):
        row['city'] = 'Redmond'
        row['state'] = 'WA'
        row['country'] = 'United States'
    if (tz == -9):
        row['city'] = 'Anchorage'
        row['state'] = 'AK'
        row['country'] = 'United States'
    if (tz == -13):
        row['city'] = 'Sydney'
        row['country'] = 'Australia'
    
    return row

# Now let's make up a city for each time zone
df_unified = df_unified.apply(get_city, axis=1)
df_unified.head()

Unnamed: 0,city,country,email,name,state,timezone,timezone_hours
0,Bangkok,Thailand,rlander@microsoft.com,Rich Lander,,25200,7.0
1,Bangkok,Thailand,piotrp@microsoft.com,Piotr Puszkiewicz,,25200,7.0
2,Bangkok,Thailand,mnaggaga@microsoft.com,Maria Naggaga Nakanwagi,,25200,7.0
3,Bangkok,Thailand,maria.naggaga@live.ca,LadyNaggaga,,25200,7.0
6,Abu Dhabi,United Arab Emirates,maria.naggaga@live.ca,LadyNaggaga,,14400,4.0


In [10]:
# Finally, lets export this as AuthorLocations.csv
df_unified.to_csv('AuthorLocations.csv')