In [19]:
import pandas as pd

In [20]:
# Load the per-file commit history and tidy it in a single pass:
#   * pandas labels the CSV's header-less first column 'Unnamed: 0'; rename it
#     to File_Commit_ID so it reads as an ID column.
#   * PyDriller cloned the repository into a temp directory, so project_path is
#     not a usable location. The repo-relative new_path is copied into
#     relative_path, which is the column the later merge joins on.
#   * NaN values are replaced with '' purely for readability.
df_file_commits = (
    pd.read_csv('FileCommits.csv')
    .rename(columns={'Unnamed: 0': 'File_Commit_ID'})
    .assign(relative_path=lambda commits: commits['new_path'])
    .fillna('')
)

# Preview the first five rows
df_file_commits.head()

Unnamed: 0,File_Commit_ID,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,...,num_deletes,num_inserts,net_lines,branches,filename,old_path,new_path,project_name,project_path,relative_path
0,0,7894423f9bac837f4c5fb2c9a0f4284da38f2069,Initial commit,Rich Lander,rlander@microsoft.com,2017-09-21 16:11:36-07:00,25200,GitHub,noreply@github.com,2017-09-21 16:11:36-07:00,...,0,21,21,main,LICENSE,,LICENSE,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,LICENSE
1,1,42dd1a3280da0bf901058cd7812faa1355eaae29,Create README.md,Piotr Puszkiewicz,piotrp@microsoft.com,2017-09-21 16:22:28-07:00,25200,GitHub,noreply@github.com,2017-09-21 16:22:28-07:00,...,0,2,2,main,README.md,,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md
2,2,25139110fc53537334c2f2a745246b4fcf8203fb,Updated the readme,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,2017-09-22 18:47:48-07:00,25200,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,2017-09-22 18:47:48-07:00,...,1,10,9,main,README.md,README.md,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md
3,3,3a88efed0961f689e692eb3d52b3d9d3ddca903b,Update README.md,LadyNaggaga,maria.naggaga@live.ca,2017-09-22 18:50:32-07:00,25200,GitHub,noreply@github.com,2017-09-22 18:50:32-07:00,...,2,15,13,main,README.md,README.md,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md
4,4,0278d89a6150858193cee8e6d1ac0ce159ac4ad0,Update README.md,LadyNaggaga,maria.naggaga@live.ca,2017-09-22 18:51:31-07:00,25200,GitHub,noreply@github.com,2017-09-22 18:51:31-07:00,...,1,1,0,main,README.md,README.md,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md


In [21]:
# Load the file-size data and tidy it in a single pass:
#   * pandas labels the CSV's header-less first column 'Unnamed: 0'; rename it
#     to File_ID so the name makes sense.
#   * A path of '.' denotes the root directory; store it as '' instead.
#   * 'root' and 'fullpath' add no information and muddy comparisons later,
#     so they are dropped.
# NOTE: no relative_path column is engineered here; the merge later in this
# notebook relies on the relative_path column already present in the loaded data.
df_files = (
    pd.read_csv('FileSizes.csv')
    .rename(columns={'Unnamed: 0': 'File_ID'})
    .replace({'path': {'.': ''}})
    .drop(columns=['root', 'fullpath'])
)

# Preview the first five rows
df_files.head()

Unnamed: 0,File_ID,project,path,area,relative_path,filename,ext,lines
0,88,samples,,notebooks,samples\notebooks\fsharp\Docs\Formatting-outpu...,Formatting-outputs.ipynb,.ipynb,2314
1,676,src,,Microsoft.DotNet.Interactive.Documents.Tests,src\Microsoft.DotNet.Interactive.Documents.Tes...,JupyterFormatTests.cs,.cs,1651
2,1105,src,,Microsoft.DotNet.Interactive.Tests,src\Microsoft.DotNet.Interactive.Tests\Languag...,LanguageKernelTests.cs,.cs,1222
3,1102,src,,Microsoft.DotNet.Interactive.Tests,src\Microsoft.DotNet.Interactive.Tests\Languag...,LanguageKernelPackageTests.cs,.cs,1174
4,806,src,,Microsoft.DotNet.Interactive.FSharp,src\Microsoft.DotNet.Interactive.FSharp\FsAuto...,TipFormatter.fs,.fs,1160


In [22]:
# Create a new data frame by joining together the other two on their relative paths
df_merged = pd.merge(df_file_commits, df_files, left_on='relative_path', right_on='relative_path')

# Remove not needed and consolidate duplicated columns
df_merged.drop(columns=['File_Commit_ID', 'File_ID', 'filename_x'], inplace=True)
df_merged.rename(columns={'filename_y': 'filename'}, inplace=True)

# Save the resulting dataset to disk
df_merged.to_csv('MergedFileData.csv')

# Display the top 5 rows
df_merged.head()

Unnamed: 0,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,committer_tz,...,new_path,project_name,project_path,relative_path,project,path,area,filename,ext,lines
0,42dd1a3280da0bf901058cd7812faa1355eaae29,Create README.md,Piotr Puszkiewicz,piotrp@microsoft.com,2017-09-21 16:22:28-07:00,25200,GitHub,noreply@github.com,2017-09-21 16:22:28-07:00,25200,...,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md,,,Root,README.md,.md,87
1,25139110fc53537334c2f2a745246b4fcf8203fb,Updated the readme,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,2017-09-22 18:47:48-07:00,25200,Maria Naggaga Nakanwagi,mnaggaga@microsoft.com,2017-09-22 18:47:48-07:00,25200,...,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md,,,Root,README.md,.md,87
2,3a88efed0961f689e692eb3d52b3d9d3ddca903b,Update README.md,LadyNaggaga,maria.naggaga@live.ca,2017-09-22 18:50:32-07:00,25200,GitHub,noreply@github.com,2017-09-22 18:50:32-07:00,25200,...,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md,,,Root,README.md,.md,87
3,0278d89a6150858193cee8e6d1ac0ce159ac4ad0,Update README.md,LadyNaggaga,maria.naggaga@live.ca,2017-09-22 18:51:31-07:00,25200,GitHub,noreply@github.com,2017-09-22 18:51:31-07:00,25200,...,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md,,,Root,README.md,.md,87
4,fcb49163e602da9235c05e0d6cdf7cac27b741f4,Update README.md,LadyNaggaga,maria.naggaga@live.ca,2017-09-22 20:53:15-07:00,25200,GitHub,noreply@github.com,2017-09-22 20:53:15-07:00,25200,...,README.md,interactive,C:\Users\Admin\AppData\Local\Temp\tmpxtrxdoy6\...,README.md,,,Root,README.md,.md,87
