In [28]:
import pandas as pd

In [29]:
# Load the per-file commit history and tidy it up in a single, non-mutating chain.
df_file_commits = (
    pd.read_csv('FileCommits.csv')
    # The CSV was saved with its index, which pandas reads back as 'Unnamed: 0'; use it as a row ID
    .rename(columns={'Unnamed: 0': 'File_Commit_ID'})
    # Copy new_path into relative_path, the column the later merge joins on.
    # NOTE(review): in the sample output new_path already looks repository-relative, while
    # project_path holds the temporary clone location - confirm no further rewriting is needed.
    .assign(relative_path=lambda commits: commits['new_path'])
    # Show NaN as '' for readability (this applies to every column, not only the path columns)
    .fillna('')
)

# Preview the first 5 rows
df_file_commits.head()

Unnamed: 0,File_Commit_ID,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,...,net_lines,num_files,branches,filename,old_path,new_path,project_name,project_path,parents,relative_path
0,0,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,382168,868,main,.gitattributes,,.gitattributes,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,,.gitattributes
1,1,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,382168,868,main,.gitignore,,.gitignore,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,,.gitignore
2,2,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,382168,868,main,BuildToolsVersion.txt,,BuildToolsVersion.txt,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,,BuildToolsVersion.txt
3,3,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,382168,868,main,CONTRIBUTING.md,,CONTRIBUTING.md,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,,CONTRIBUTING.md
4,4,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,...,382168,868,main,Directory.Build.props,,Directory.Build.props,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,,Directory.Build.props


In [30]:
# Load the per-file size data and derive the path column used for joining, without mutating in place.
df_files = (
    pd.read_csv('FileSizes.csv')
    # The CSV was saved with its index, which pandas reads back as 'Unnamed: 0'; give it a meaningful name
    .rename(columns={'Unnamed: 0': 'File_ID'})
    # A path of '.' marks the root directory; represent it as '' instead
    .replace({'path': {'.': ''}})
    # Build relative_path as '<path>/<filename>', omitting the '/' when the file sits in the root
    .assign(
        relative_path=lambda sizes: (
            sizes['path'].apply(lambda folder: folder + '/' if folder != '' else '')
            + sizes['filename']
        )
    )
    # 'root' and 'fullpath' add no extra information and muddy comparisons later
    .drop(columns=['root', 'fullpath'])
)

# Preview the first 5 rows
df_files.head()

Unnamed: 0,File_ID,project,path,filename,ext,lines,relative_path
0,32,Microsoft.Data.Analysis,,PrimitiveDataFrameColumn.BinaryOperationAPIs.E...,.cs,15886,PrimitiveDataFrameColumn.BinaryOperationAPIs.E...
1,35,Microsoft.Data.Analysis,,PrimitiveDataFrameColumn.BinaryOperators.cs,.cs,7397,PrimitiveDataFrameColumn.BinaryOperators.cs
2,1249,Microsoft.ML.Core.Tests,UnitTests,TestEntryPoints.cs,.cs,6904,UnitTests/TestEntryPoints.cs
3,40,Microsoft.Data.Analysis,,PrimitiveDataFrameColumnArithmetic.cs,.cs,6863,PrimitiveDataFrameColumnArithmetic.cs
4,830,Microsoft.ML.OnnxConverter,,OnnxMl.cs,.cs,5945,OnnxMl.cs


In [35]:
# Join commit history to file sizes on the shared relative_path column (inner join, pandas' default),
# then drop the ID columns and collapse the duplicated filename columns into one.
# NOTE(review): df_files.relative_path is built from path + filename only, so it may not be unique
# across projects; the sample output pairs root-level 'build.sh' commits with a file in project
# 'Native' - verify that the join key identifies the intended file.
df_merged = (
    df_file_commits
    .merge(df_files, on='relative_path')
    .drop(columns=['File_Commit_ID', 'File_ID', 'filename_x'])
    .rename(columns={'filename_y': 'filename'})
)

# Persist the merged dataset to disk.
# NOTE(review): the index is written too, so reloading this file yields an 'Unnamed: 0' column.
df_merged.to_csv('MergedFileData.csv')

# Preview the first 5 rows
df_merged.head()

Unnamed: 0,hash,message,author_name,author_email,author_date,author_tz,committer_name,committer_email,committer_date,committer_tz,...,new_path,project_name,project_path,parents,relative_path,project,path,filename,ext,lines
0,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,dotnet-bot,dotnet-bot@microsoft.com,2018-05-03 17:22:00-07:00,25200,Immo Landwerth,immol@microsoft.com,2018-05-03 17:22:00-07:00,25200,...,build.sh,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,,build.sh,Native,,build.sh,.sh,179
1,76cb2cdf5cc8b6c88ca44b8969153836e589df04,Get a working build (#1)\n\n* Set missing exec...,Sandy Armstrong,sanfordarmstrong@gmail.com,2018-05-04 12:47:21-07:00,25200,Eric Erhardt,eric.erhardt@microsoft.com,2018-05-04 14:47:21-05:00,18000,...,build.sh,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,build.sh,Native,,build.sh,.sh,179
2,ed577123389bce76909b24ce7f7a5693549333de,Fixes build error when path contains space on ...,Morten,tincann@users.noreply.github.com,2018-05-30 18:36:52+02:00,-7200,Tom Finley,tfinley@gmail.com,2018-05-30 09:36:52-07:00,25200,...,build.sh,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,0233d710749c67bf8facfb98e5492fb2b70e95b4,build.sh,Native,,build.sh,.sh,179
3,7ba62c4cbb9b2355a1a09fec3e86e12fabaeb472,ci test build,Dmitry Akhutin,dmitrya@microsoft.com,2019-01-14 22:38:18-08:00,28800,Dmitry Akhutin,dmitrya@microsoft.com,2019-01-14 22:38:18-08:00,28800,...,build.sh,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,8481317689d06e47b82e3e9dac8b8ab873ef5109,build.sh,Native,,build.sh,.sh,179
4,9ea19bd2d3b5a443c441d6b1536bb03ddd676f80,"Make build.sh, init-tools.sh, & run.sh executa...",Justin Ormont,justinormont@users.noreply.github.com,2019-02-26 09:37:52-08:00,28800,srsaggam,41802116+srsaggam@users.noreply.github.com,2019-02-26 09:37:52-08:00,28800,...,build.sh,machinelearning,C:\Users\Admin\AppData\Local\Temp\tmpvezg8ml9\...,8463bae5e4a867968304d4bca57450f904cc285e,build.sh,Native,,build.sh,.sh,179
