In [7]:
import pandas as pd

In [8]:
# Load the per-file commit history and tidy it up in a single pass
df_file_commits = (
    pd.read_csv('FileCommits.csv')
    # The unnamed leading column of the CSV serves as a row ID
    .rename(columns={'Unnamed: 0': 'File_Commit_ID'})
    # project_path points into the temporary clone directory, so the
    # repo-relative new_path is copied into relative_path for use as a join key
    .assign(relative_path=lambda commits: commits['new_path'])
    # Blank out NaN values in every column for readability
    .fillna('')
)

# Preview the first 5 rows
df_file_commits.head()

Unnamed: 0,File_Commit_ID,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,...,num_deletes,num_inserts,net_lines,branches,filename,old_path,new_path,project_name,project_path,relative_path
0,0,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,0,382168,382168,main,.gitattributes,,.gitattributes,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,.gitattributes
1,1,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,0,382168,382168,main,.gitignore,,.gitignore,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,.gitignore
2,2,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,0,382168,382168,main,BuildToolsVersion.txt,,BuildToolsVersion.txt,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,BuildToolsVersion.txt
3,3,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,0,382168,382168,main,CONTRIBUTING.md,,CONTRIBUTING.md,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,CONTRIBUTING.md
4,4,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,0,382168,382168,main,Directory.Build.props,,Directory.Build.props,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,Directory.Build.props


In [9]:
# Load the per-file size metrics and normalise them in a single pass.
# relative_path is used as it comes from the CSV, so it is not re-derived
# from path + filename here.
df_files = (
    pd.read_csv('FileSizes.csv')
    # The unnamed leading column of the CSV serves as a row ID
    .rename(columns={'Unnamed: 0': 'File_ID'})
    # A path of '.' denotes the root directory; represent it as '' instead
    .replace({'path': {'.': ''}})
    # These columns provide no additional information and muddy comparisons later
    .drop(columns=['root', 'fullpath'])
)

# Preview the first 5 rows
df_files.head()

Unnamed: 0,File_ID,project,path,relative_path,filename,ext,lines
0,1405,src,,src\Microsoft.ML.TorchSharp\Resourcesdict.txt,dict.txt,.txt,50260
1,2033,test,,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.FastTree.FastForestRegre...,.txt,39544
2,2035,test,,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.FastTree.FastTreeTweedie...,.txt,39534
3,2034,test,,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.FastTree.FastTreeRegress...,.txt,39524
4,2037,test,,test\BaselineOutput\Common\Onnx\Regression\Adu...,Microsoft.ML.Trainers.LightGbm.LightGbmRegress...,.txt,38484


In [10]:
# Join commit history to file metrics on the shared relative_path key.
# merge() defaults to an inner join, so only rows whose relative_path appears
# in both frames are kept.
df_merged = (
    df_file_commits
    .merge(df_files, on='relative_path')
    # The per-frame row IDs are not needed; filename exists on both sides,
    # so keep a single copy under its original name
    .drop(columns=['File_Commit_ID', 'File_ID', 'filename_x'])
    .rename(columns={'filename_y': 'filename'})
)

# Persist the merged dataset to disk.
# NOTE(review): the index is written as well, so reading this file back with
# read_csv yields an 'Unnamed: 0' column — confirm downstream readers expect it.
df_merged.to_csv('MergedFileData.csv')

# Preview the first 5 rows
df_merged.head()

Unnamed: 0,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,committer_tz,...,old_path,new_path,project_name,project_path,relative_path,project,path,filename,ext,lines
0,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,25200,...,,CONTRIBUTING.md,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,CONTRIBUTING.md,,,CONTRIBUTING.md,.md,40
1,979418886950e144b2cc561bdc5eb41d382cf829,Update contribution guide and issue/PR templates,Shauheen Zahirazami,shzahira@microsoft.com,2018-05-05 13:47:44-07:00,25200,Shauheen Zahirazami,shzahira@microsoft.com,2018-05-05 13:47:44-07:00,25200,...,CONTRIBUTING.md,CONTRIBUTING.md,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,CONTRIBUTING.md,,,CONTRIBUTING.md,.md,40
2,cfcb6cbb08db5a2b1f84294763bd99828b88b2ea,Update CONTRIBUTING.md,Dan Moseley,danmose@microsoft.com,2018-05-06 20:47:00-07:00,25200,GitHub,noreply@github.com,2018-05-06 20:47:00-07:00,25200,...,CONTRIBUTING.md,CONTRIBUTING.md,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,CONTRIBUTING.md,,,CONTRIBUTING.md,.md,40
3,2038108473015a58cee8425719f75751282cb7b0,Adjust gitter link to go to new mlnet room (#4...,Dan Moseley,danmose@microsoft.com,2018-05-07 08:54:00-07:00,25200,GitHub,noreply@github.com,2018-05-07 08:54:00-07:00,25200,...,CONTRIBUTING.md,CONTRIBUTING.md,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,CONTRIBUTING.md,,,CONTRIBUTING.md,.md,40
4,616e75f779f66554e4fe22d14540be121061b383,Change Documentation folder to docs (#87) * C...,Ankit Asthana,aasthan@microsoft.com,2018-05-16 18:34:44-07:00,25200,Shauheen,shauheen@users.noreply.github.com,2018-05-16 21:34:44-04:00,14400,...,CONTRIBUTING.md,CONTRIBUTING.md,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpaoyft4wz\...,CONTRIBUTING.md,,,CONTRIBUTING.md,.md,40
