## Loading Candidate Labels

In [92]:
# Build an array of folders inside of the data/input directory
import os

folders = os.listdir('data/input')
folders

['mlnet', 'wherewolf', 'emergence', 'dotnetinteractive', 'gitstractor']

In [93]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/input/' + folders[0] + '/ClassifiedCommits.csv')
df['Source'] = folders[0]
df.head()

Unnamed: 0,Sha,Message,IsBugFix,Message.1,Response,Source
0,f0e639af5ffdc839aae8e65d19b5a9a1f0db634a,Initial commit,False,Initial commit with no modifications,"{""IsBugFix"": false, ""Reason"": ""Initial commit ...",mlnet
1,76cb2cdf5cc8b6c88ca44b8969153836e589df04,Get a working build (#1),True,Aim to get a 'working build' usually indicates...,"{""IsBugFix"": true, ""Reason"": ""Aim to get a 'wo...",mlnet
2,972f6232de173b5e294a34a847682e9b1e67d3af,Fixed the syntax of cited example. (#2),True,Fixed syntax error related to cited example,"{""IsBugFix"": true, ""Reason"": ""Fixed syntax err...",mlnet
3,cde0d7d18ec9e93bde1d3a53c35f87430ac43fee,Add ML.NET Roadmap (#30),False,Documenting feature roadmap is not considered ...,"{""IsBugFix"": false, ""Reason"": ""Documenting fea...",mlnet
4,da023b18464268a9d4431de1f978a685fa397a8b,Merge pull request #35 from shauheen/update_co...,False,Pull request for merging does not necessarily ...,"{""IsBugFix"": false, ""Reason"": ""Pull request fo...",mlnet


In [94]:
# Loop over the remaining folders and append them into the DataFrame, including a column with the source folder
for folder in folders[1:]:
    df_temp = pd.read_csv('data/input/' + folder + '/ClassifiedCommits.csv')
    df_temp['Source'] = folder

    # Append the new DataFrame to the original one
    df = pd.concat([df, df_temp])

df['Source'].value_counts()

Source
dotnetinteractive    3753
mlnet                2587
emergence             172
gitstractor            42
wherewolf              22
Name: count, dtype: int64

In [95]:
df.columns

Index(['Sha', 'Message', 'IsBugFix', 'Message.1', 'Response', 'Source'], dtype='object')

In [96]:
df.rename(columns={'Message.1': 'Reasoning'}, inplace=True)

In [97]:
df.sample(5)

Unnamed: 0,Sha,Message,IsBugFix,Reasoning,Response,Source
2545,f6ddd9b7dbb4bf500ded577195ddd1320c0e8761,Updated for https://dev.azure.com/dnceng/7ea91...,True,Used baselines to update and possibly correct ...,"{""IsBugFix"": true, ""Reason"": ""Used baselines t...",mlnet
1104,ce0c917d1d8e44a5460f984d667107eecc4a9f78,Fixed a tensorflow test which was marked as sk...,True,Specific commit aimed at fixing a skipped test...,"{""IsBugFix"": true, ""Reason"": ""Specific commit ...",mlnet
467,d248ae25777b7511676427e2427738b727e2f13d,Merge pull request #13 from jonsequitur/packag...,False,A merge pull request suggests integration rath...,"{""IsBugFix"": false, ""Reason"": ""A merge pull re...",dotnetinteractive
3542,108ae4715c9104c2bc81e78f1879dfe5c7c49796,Update release.yml,False,Updating a configuration file does not imply f...,"{""IsBugFix"": false, ""Reason"": ""Updating a conf...",dotnetinteractive
644,e92d3fcabd3712ed67e40a5ca2b538df2fc512e3,exception handling for tests,True,The commit message suggests adding exception h...,"{""IsBugFix"": true, ""Reason"": ""The commit messa...",dotnetinteractive


In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6576 entries, 0 to 41
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sha        6576 non-null   object
 1   Message    6576 non-null   object
 2   IsBugFix   6328 non-null   object
 3   Reasoning  6576 non-null   object
 4   Response   6576 non-null   object
 5   Source     6576 non-null   object
dtypes: object(6)
memory usage: 359.6+ KB


In [99]:
import plotly.express as px

# Specify defaults for Plotly settings
px.defaults.color_continuous_scale = px.colors.sequential.Agsunset
px.defaults.color_discrete_sequence = px.colors.qualitative.Prism
#px.defaults.template = 'plotly_dark'
px.defaults.template = 'plotly_white'

In [100]:
px.histogram(df, x='Source', title='Commits per repository').update_xaxes(categoryorder='total descending')

In [101]:
px.histogram(df.fillna('Missing'), 
             x='IsBugFix', 
             color='Source', 
             title='Number of Bug Fixes per Repository', 
             labels={'IsBugFix': 'Is Bug Fix', 'Source': 'Repository'})

In [102]:
df[df['IsBugFix'].isnull()].head()['Reasoning']

59     {"IsBugFix": true, "Reason": "Code modificatio...
100    {"IsBugFix": true, "Reason": "Addressing a nam...
151    {"IsBugFix": true, "Reason": "The message indi...
163    {"IsBugFix": true, "Reason": "The commit messa...
164    {"IsBugFix": true, "Reason": "Introducing cons...
Name: Reasoning, dtype: object

In [103]:
df_cleaned = df.dropna(subset=['IsBugFix'])
df_cleaned['IsBugFix'].value_counts()

IsBugFix
True     4492
False    1836
Name: count, dtype: int64

In [104]:
px.histogram(df_cleaned, 
             x='IsBugFix', 
             color='Source', 
             title='Number of Bug Fixes per Repository', 
             labels={'IsBugFix': 'Is Bug Fix', 'Source': 'Repository'})

In [105]:
df_cleaned.drop(columns=['Response'], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [106]:
# Save the cleaned DataFrame to a CSV file
df_cleaned.to_csv('data/ClassifiedCommits.csv', index=False)

## Adding Commit Details

In [107]:
# Loop over all commits.csv and aggregate them into a central DataFrame
df_commits = pd.read_csv('data/input/' + folders[0] + '/Commits.csv', parse_dates=['AuthorDateUtc','CommitterDateUtc'])
df_commits['Source'] = folders[0]

for folder in folders[1:]:
    # Load the csv file and treat AuthorDateUtc as a datetime
    df_temp = pd.read_csv('data/input/' + folder + '/Commits.csv', parse_dates=['AuthorDateUtc','CommitterDateUtc'])
    df_temp['Source'] = folder

    # Append the new DataFrame to the original one
    df_commits = pd.concat([df_commits, df_temp])

df_commits.sample(5)

Unnamed: 0,Sha,ParentSha,Parent2Sha,IsMerge,AuthorId,AuthorDateUtc,CommitterId,CommitterDateUtc,Message,Work Items,Total Files,Modified Files,Added Files,Deleted Files,Total Lines,Net Lines,Added Lines,Deleted Lines,Source
3040,a297fd05b2f4ddaeace783cb749c38bdac14b572,873526734495de1d65758b1963cdb37a26eb2851,,False,14,2023-01-24 18:13:16,14,2023-01-24 21:50:42,decimal formatting,0,2,2,0,0,1738,14,21,7,dotnetinteractive
2020,091bddfb84c66524d542663ecc9b1b7714ad6e05,bb13d629000c218136e741b643767cf45ae12fc4,,False,117,2020-06-26 16:30:07,7,2020-06-26 16:30:07,Changed default NGram length from 1 to 2. (#5248),1,9,9,0,0,32819,9,280,271,mlnet
111,0129074bb12ede9f3ead82782fceef5c3230eff4,ac6c5078e132c7ec3225b0d7a76e9b08057d4697,,False,13,2019-06-03 20:27:09,14,2019-06-06 19:57:07,use the same method to configure the urls,0,3,2,1,0,255,18,30,12,dotnetinteractive
2490,2afdff08f01cb7f565e7c603ed69e046deb7d3b1,36bdf13323b2c86dc5a858715a75c13b660ae2e1,,False,10,2021-11-18 22:54:34,2,2021-11-18 22:54:34,Update bug_report.md,0,1,1,0,0,39,-2,2,4,dotnetinteractive
1919,35a5cfc77cca701d679c1dccf91000ad3604a01c,d9d9169dc79e89b7150d44a7a9c08a4233a0d3d4,,False,8,2021-03-22 17:24:07,2,2021-03-22 17:24:07,publish npm package to specified feed (#1168),1,6,4,2,0,584,99,103,4,dotnetinteractive


In [108]:
df_commits.to_csv('data/Commits.csv', index=False)

In [109]:
# Merge the two DataFrames on the commit hash
df_merged = pd.merge(df_cleaned, df_commits, on='Sha')
df_merged.sample(5)

Unnamed: 0,Sha,Message_x,IsBugFix,Reasoning,Source_x,ParentSha,Parent2Sha,IsMerge,AuthorId,AuthorDateUtc,...,Work Items,Total Files,Modified Files,Added Files,Deleted Files,Total Lines,Net Lines,Added Lines,Deleted Lines,Source_y
258,d6be6dcf16bcfc5e9b18dbe697bf1d5b920b9856,Remove SubComponent usage from ML.PipelineInfe...,True,Removing a component could be related to fixin...,mlnet,44c6e902268d7877eba431c3f4973be0ec2231e0,,False,4,2018-09-08 13:15:57,...,1,2,2,0,0,1806,7,9,2,mlnet
597,e43bba1d945d93b08f5ed12ee3c4f2540fef821f,Updating the XML Docs for Permutation Feature ...,True,Updating documentation does not directly fix a...,mlnet,3bf74edf8dce3053c017442b0326a62ab4756049,,False,36,2018-11-27 23:27:12,...,1,1,1,0,0,134,27,30,3,mlnet
2695,459419eac15e1f88cbdeb73ab1817239fc5cca09,Merge pull request #1 from IntegerMan/TestBranch,False,Merging PRs and the nature of changes,emergence,3132e52e0bc0c40c1390e8655c7e4cb7ad4c3970,c6f0ea0b6c60b7046c9caec7fc5df568eda660bc,True,1,2019-06-24 04:13:07,...,1,7,5,2,0,299,25,52,27,emergence
1733,c819d77e9250c68883713d5f1cd79b8971a11faf,LDSVM trainer (#4060),True,An LDSVM trainer issue fix is typically consid...,mlnet,cdd309ebde2bf54592cf4246f5ba29147b9c7704,,False,22,2020-02-05 13:16:51,...,1,25,7,18,0,40090,4639,4686,47,mlnet
3792,214bf9d04cfca714c79e7a6e377701f354c05754,Update Dockerfile,False,Updating a Dockerfile does not typically indic...,dotnetinteractive,dec290c136c3161d0e723670bfc43e096258cf58,,False,10,2020-05-24 22:14:54,...,0,1,1,0,0,76,0,1,1,dotnetinteractive


In [110]:
df_merged.columns

Index(['Sha', 'Message_x', 'IsBugFix', 'Reasoning', 'Source_x', 'ParentSha',
       'Parent2Sha', 'IsMerge', 'AuthorId', 'AuthorDateUtc', 'CommitterId',
       'CommitterDateUtc', 'Message_y', 'Work Items', 'Total Files',
       'Modified Files', 'Added Files', 'Deleted Files', 'Total Lines',
       'Net Lines', 'Added Lines', 'Deleted Lines', 'Source_y'],
      dtype='object')

In [111]:
df_merged.drop(columns=['Source_y', 'Message_y'], inplace=True)
df_merged.rename(columns={'Source_x': 'Source', 'Message_x': 'Message'}, inplace=True)
df_merged.sample(5)

Unnamed: 0,Sha,Message,IsBugFix,Reasoning,Source,ParentSha,Parent2Sha,IsMerge,AuthorId,AuthorDateUtc,...,CommitterDateUtc,Work Items,Total Files,Modified Files,Added Files,Deleted Files,Total Lines,Net Lines,Added Lines,Deleted Lines
5817,e4a6f9a8ed19c23826ba61f51cc09e0f1966af6d,Localized file check-in by OneLocBuild Task: B...,False,The commit message indicates a check-in of loc...,dotnetinteractive,042a7c70e8b69190e11b8cf25e87312e8225267f,,False,7,2023-05-05 14:29:59,...,2023-05-05 15:37:28,0,28,15,13,0,577,104,562,458
1650,6ae3a3f42605efad43a483c8bc2bf46b5376d144,Expression estimator/transformer (#4548),True,The commit message involves a new feature (est...,mlnet,65e6acd9b127037eca99a71692de6f81d883ec0e,,False,22,2019-12-26 18:27:57,...,2019-12-26 18:27:57,1,58,3,55,0,23415,21908,21908,0
3855,5ef2520e22325fe23f021b0ba6549ddb7bb4cf97,event namespace configurable,True,Adjustment to an event namespace is indicative...,dotnetinteractive,1485fabdd8a9d0808ff6cf3c94dbe1a5014d76bc,,False,10,2020-06-10 15:12:13,...,2020-06-10 16:13:55,0,2,2,0,0,719,9,12,3
5334,94682e92bc917826c79fedda09d67df6dc788db8,fix warning,True,The commit addresses a warning which suggests ...,dotnetinteractive,e7f2d61c72674f1d91235c7305d9288b3f625db1,,False,6,2022-08-26 00:27:37,...,2022-08-26 09:19:26,0,1,1,0,0,236,1,1,0
4633,a328e3ddf9d68fc471ee1e32d634abbc569b4165,update tool versions (#1252),False,Updating tool versions is not considered a fix...,dotnetinteractive,1c35193ba2c459e516a38ed36b95f29345eb659f,,False,8,2021-04-15 19:17:25,...,2021-04-15 19:17:25,1,2,2,0,0,784,0,2,2


In [112]:
# Get the new york time zone
import pytz

# Adjust from UTC to US Eastern
df_merged['AuthorDateUtc'] = df_merged['AuthorDateUtc'].dt.tz_localize('UTC').dt.tz_convert(pytz.timezone('America/New_York') )

In [113]:
# Set up common aliases for Plotly
common_labels={'IsBugFix': 'Is Bug Fix', 'Source': 'Repository', 'AuthorDateUtc': 'Date'}
common_hover_info = ['Message', 'Source', 'Modified Files', 'Added Files', 'Deleted Files', 'Added Lines', 'Deleted Lines', 'Net Lines', 'IsBugFix']

In [114]:
px.box(df_merged, 
       x='Net Lines',
       hover_data=common_hover_info, 
       labels=common_labels,
       title='Net Lines of Code per Repository')

In [115]:
px.box(df_merged, 
       x='Net Lines', 
       y='Source', 
       color='Source',
       labels=common_labels,
       hover_data=common_hover_info, 
       title='Net Lines of Code per Repository')

In [116]:
px.box(df_merged, 
       x='Net Lines',
       y='IsBugFix',
       color='IsBugFix',
       labels=common_labels,
       hover_data=common_hover_info, 
       title='Net Lines per Repository by Bugfix / Non-Bugfix')

In [117]:
px.histogram(df_merged, 
             x='AuthorDateUtc', 
             color='Source', 
             labels=common_labels, 
             title='Commits per Repository over Time')


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [118]:
px.histogram(df_merged, 
             x='AuthorDateUtc', 
             color='IsBugFix', 
             labels=common_labels, 
             title='Commits by Bugfix Status over Time')


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [119]:
# Engineer day of week, month, quarter, and year from the AuthorDateUtc
df_merged['DayOfWeek'] = df_merged['AuthorDateUtc'].dt.day_name()
df_merged['Month'] = df_merged['AuthorDateUtc'].dt.month_name()
df_merged['Quarter'] = df_merged['AuthorDateUtc'].dt.quarter
df_merged['Year'] = df_merged['AuthorDateUtc'].dt.year
df_merged['Hour'] = df_merged['AuthorDateUtc'].dt.hour
df_merged['TimeOfDay'] = pd.cut(df_merged['Hour'], bins=[0,6,12,18,24], labels=['Night', 'Morning', 'Afternoon', 'Evening'])

In [127]:
df_merged['DayOfWeek'].value_counts()

DayOfWeek
Tuesday      1273
Thursday     1221
Wednesday    1200
Monday       1141
Friday       1052
Saturday      267
Sunday        174
Name: count, dtype: int64

In [136]:
# Order the days of the week in sequential order
df_merged['DayOfWeek'] = pd.Categorical(df_merged['DayOfWeek'], categories=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'], ordered=True)
px.histogram(df_merged.sort_values('DayOfWeek'),
             x='DayOfWeek', 
             color='IsBugFix', 
             labels=common_labels, 
             title='BugFix Commits by Day of Week')

In [139]:
df_merged['Is Weekend'] = df_merged['DayOfWeek'].isin(['Saturday', 'Sunday'])

px.histogram(df_merged, x='IsBugFix', color='Is Weekend', title='Bugfix Commits by Weekend / Weekday')

In [146]:
px.histogram(df_merged.sort_values('Quarter'), x='IsBugFix', color='Quarter', title='Bugfix Commits by Quarter',
             color_discrete_sequence=px.colors.sequential.Agsunset_r)

In [150]:
# Change month to be ordinal categorical
df_merged["Month"] = pd.Categorical(df_merged["Month"], categories=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'], ordered=True)

px.histogram(df_merged.sort_values('Month'), 
             x='Month', 
             color='IsBugFix', 
             title='Bugfix Commits by Month')

In [151]:
px.histogram(df_merged.sort_values('Month'), 
             x='Month', 
             color='Source', 
             title='Commits by Month by Repository')

In [156]:
px.histogram(df_merged.sort_values('Hour'),
             x='Hour', 
             color='IsBugFix', 
             labels=common_labels, 
             title='BugFix Commits by Hour')

In [157]:
px.histogram(df_merged.sort_values('Hour'),
             x='Hour', 
             color='Source', 
             labels=common_labels, 
             title='Commits by Hour by Repository')

In [154]:
df_merged['TimeOfDay'] = pd.Categorical(df_merged['TimeOfDay'], categories=['Morning', 'Afternoon', 'Evening', 'Night'], ordered=True)

px.histogram(df_merged.sort_values('TimeOfDay'), 
             x='TimeOfDay', 
             color='Source', 
             title='Commits by Time of Day by Repository')

In [155]:
px.histogram(df_merged.sort_values('TimeOfDay'), 
             x='TimeOfDay', 
             color='IsBugFix', 
             title='Commits by Time of Day by BugFix Status')

In [159]:
df_merged.columns

Index(['Sha', 'Message', 'IsBugFix', 'Reasoning', 'Source', 'ParentSha',
       'Parent2Sha', 'IsMerge', 'AuthorId', 'AuthorDateUtc', 'CommitterId',
       'CommitterDateUtc', 'Work Items', 'Total Files', 'Modified Files',
       'Added Files', 'Deleted Files', 'Total Lines', 'Net Lines',
       'Added Lines', 'Deleted Lines', 'DayOfWeek', 'Month', 'Quarter', 'Year',
       'Hour', 'TimeOfDay', 'Is Weekend'],
      dtype='object')

In [168]:
df_corr = df_merged.drop(columns=['Sha', 'ParentSha', 'Parent2Sha', 'Message', 'Reasoning'])

# Replace df_corr's month, day of week, and time of day with numeric values since they can be viewed as sequences
df_corr['Month'] = df_corr['Month'].cat.codes
df_corr['DayOfWeek'] = df_corr['DayOfWeek'].cat.codes
df_corr['TimeOfDay'] = df_corr['TimeOfDay'].cat.codes

df_corr = pd.get_dummies(df_corr, columns=['Source'])
corr = df_corr.corr()

mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

import plotly.graph_objects as go

# Create a masked correlation matrix
masked_corr = corr.mask(mask)
#masked_corr = masked_corr.iloc[::-1,::-1]

heat = go.Heatmap(z=masked_corr,
                  x=df_corr.columns,
                  y=df_corr.columns,
                  xgap=1, ygap=1,
                  colorbar_thickness=20,
                  colorbar_ticklen=3,
                   )


title = 'Correlation Matrix'               

layout = go.Layout(title_text=title, title_x=0.5, 
                   height=800,
                   xaxis_showgrid=False,
                   yaxis_showgrid=False,
                   yaxis_autorange='reversed')
   
fig=go.Figure(data=[heat], layout=layout)        
fig.show() 

## Sample Selection

In [120]:
# Create a representative sample of the DataFrame making sure to stratify by the IsBugFix column
# Do this in a loop while we don't have at least 6 samples from each different source
# This takes some iteration because some of the sources are very much minority cases, but we want to ensure representation
while True:
    df_sample = df_merged.groupby(['IsBugFix']).apply(lambda x: x.sample(250)).reset_index(drop=True)
    if df_sample['Source'].nunique() >= len(folders) and df_sample['Source'].value_counts().min() >= 6:
        break

df_sample['IsBugFix'].value_counts()

IsBugFix
False    250
True     250
Name: count, dtype: int64

In [121]:
df_sample['Source'].value_counts()

Source
dotnetinteractive    295
mlnet                178
emergence             12
gitstractor            8
wherewolf              7
Name: count, dtype: int64

In [123]:
px.histogram(df_sample, x='Source', color='IsBugFix', title='Sampled Commits per Repository').update_xaxes(categoryorder='total descending')