# Importing data + Setup

The easiest way to execute this project is to simply run all cells. The datasets are included in the `datasets/` folder contained in the same zip archive.

The datasets used for training and inference can be swapped in one of the cells below.

In [None]:
import os
import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames
pd.options.display.max_columns = None

In [None]:
# List the bundled files, then load both commit-metric datasets (OpenStack first, Qt second)
print(os.listdir("datasets/"))
openstackDataset, qtDataset = (
    pd.read_csv(csv_path)
    for csv_path in ("datasets/openstack_metrics.csv", "datasets/qt_metrics.csv")
)

In [None]:
# Decide which dataset trains the models and which one is held back for inference testing;
# swap the two names on the right-hand side to switch roles
trainingDataset, inferenceDataset = openstackDataset, qtDataset

In [None]:
# These two attributes are dropped up front because they get in the way of
# analysis functions such as correlation matrices
trainingDataset = trainingDataset.drop(columns=['commit_id', 'author_date'])
inferenceDataset = inferenceDataset.drop(columns=['commit_id', 'author_date'])

# Understanding the data

### Understanding each feature of the datasets

In [None]:
#0 commit_id :      unique id for the committed code                               #16 ndev :   N of devs that have modified the file in the past
#1 author_date :    date the commit was authored?                                  #17 age :    time since previous update to the file
#2 bugcount :       amount of bugs detected                                        #18 nuc :    N of prior(unique) changes to the file
#3 fixcount :       amount of fixes                                                #19 app :    N of reviewers who voted on integration
#4 la :             lines added                                                    #20 aexp :   N of changes author has participated in
#5 ld :             lines deleted                                                  #21 rexp :   aexp but reviewer instead of author
#6 nf :             modified files                                                 #22 oexp :   o?
#7 nd :             modified directories                                           #23 arexp :  aexp but weighted by recency of changes
#8 ns :             modified subsystems                                            #24 rrexp :  arexp but reviewer instead of author
#9 ent :            entropy, spread of modified lines across files                 #25 orexp :  o?
#10 revd :          reviewed/revised? revd=False -> rxxx=NaN                       #26 asexp :  N of changes within the subsystem from the author
#11 nrev :          N of revisions to the commit                                   #27 rsexp :  asexp but reviewer instead of author
#12 rtime :         time between commit and approval                               #28 osexp :  o?
#13 tcmt :          total comments on the commit?                                  #29 asawr :  Proportion of previous changes to the subsystem from the author
#14 hcmt :          N of non-automated comments during review                      #30 rsawr :  asawr but reviewer instead of author
#15 self :          only self-checked? inverse of revd                             #31 osawr :  o?

In [None]:
# Peek at three randomly chosen rows of the training data
trainingDataset.sample(n=3)

In [None]:
# Peek at three randomly chosen rows of the inference data
inferenceDataset.sample(n=3)

### Visualising and analysing the datasets

In [None]:
# Report (rows, columns) for each dataset; each label names the frame it describes
print("Shape of training dataset:", trainingDataset.shape)
print("Shape of inference dataset:", inferenceDataset.shape)

In [None]:
# Summary of the training dataset (column dtypes and non-null counts).
# Alternative quick-look summaries; replace `data` with trainingDataset or inferenceDataset:
#data.isna().sum()           # sums the amount of NaN values for each attribute
#data.describe()             # basic statistics for each attribute
#data.nunique()              # amount of unique values for each attribute
trainingDataset.info()

In [None]:
# Same summary (column dtypes and non-null counts) for the inference dataset
inferenceDataset.info()

In [None]:
# Tally how often each value of revd occurs, first for the training data, then for the inference data
revd_counts = trainingDataset['revd'].value_counts()
print("Amount of instances where the commit was reviewed/revised in training dataset:", revd_counts)
revd_counts = inferenceDataset['revd'].value_counts()
print("Amount of instances where the commit was reviewed/revised in inference dataset:", revd_counts)

In [None]:
# Bar chart with one bar per distinct bugcount value in the training data
print("Spread of bugs found in the training dataset (does not include instances of NaN)")
sns.countplot(data=trainingDataset, x='bugcount')

In [None]:
# Count the commits with at least one recorded bug. NaN entries are excluded (as
# value_counts() would exclude them). Counting with a boolean mask also works when
# no row has bugcount == 0; value_counts().get(0.0) returns None in that case and
# the subtraction would raise a TypeError.
bugcount_column = trainingDataset['bugcount']
bugsFoundInstances = (bugcount_column.notna() & (bugcount_column != 0)).sum()

print("Amount of instances of bugcount containing NaN:", bugcount_column.isna().sum())
print(f"Amount of times bugs were detected: {bugsFoundInstances} out of {trainingDataset.shape[0]} instances")

### Analysing correlation between features

In [None]:
# Pairwise correlation between all features of the training data
correlation_matrix = trainingDataset.corr()

# Keep only strong correlations (above 0.9 or below -0.9). The diagonal, i.e. each
# feature's correlation with itself, is masked out too so it never shows up in the
# report. Every cell that is filtered out becomes NaN.
is_strong = correlation_matrix.abs() > 0.9
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
high_correlations = correlation_matrix.where(is_strong & off_diagonal)

# For every feature, list the other features it is strongly correlated with
for col, column_values in high_correlations.items():
    correlations = column_values.dropna()
    if correlations.empty:
        continue
    print(f"Correlations with {col}:")
    print(correlations)
    print()

In [None]:
# Correlation of every feature with revd (the 'revd' column of the correlation matrix)
trainingDataset.corr().loc[:, 'revd']

In [None]:
# Select the commits whose revd flag is False, report how many there are, and show
# per column how many of their values are missing
not_reviewed = trainingDataset['revd'].eq(False)
print("Amount of False-values in revd:", not_reviewed.sum())
print(trainingDataset[not_reviewed].isna().sum())

# Cleaning the data

### Notes on replacing NaN values

Instances of NaN in bugcount/fixcount are assumed to imply that 0 bugs/fixes were recorded.

Based on correlation analysis of ['revd'], the remaining instances of NaN are all from features related to reviews/revisions of the commit
 
These instances are NaN iff ['revd'] = False, implying no review was made to the commit -> all review related features are therefore NaN

Replacing NaN with 0 is not equally suitable for all of these features, but is done for simplicity, example:

* ['app'] = 0, implies that no reviewers voted on integration when there was no review to vote on, this seems suitable

* ['aexp'] = 0, implies the author has participated in 0 previous changes, which is not necessarily true, not so suitable

Given more time, this could be better adapted and customized to the dataset

### Cleaning of training dataset

In [None]:
# Columns removed before modelling: the author/reviewer/"o" experience and awareness
# features plus 'self' and 'revd'. Per the author's notes they carry highly correlated
# data, add dimensional complexity and rank very low on feature importance when training.
unnecessaryColumns = [
    'arexp', 'self', 'revd', 'rexp', 'rrexp', 'rsexp', 'rsawr',
    'oexp', 'orexp', 'osexp', 'osawr', 'asawr', 'asexp', 'aexp',
]

# Drop those columns; any NaN left in the remaining features is taken to mean 0
# (see the notes on replacing NaN values)
trainingDataset = trainingDataset.drop(columns=unnecessaryColumns).fillna(0.0)

# Collapse bugcount into a binary label: 1 when any bug was recorded, otherwise 0
trainingDataset['bugcount'] = trainingDataset['bugcount'].ne(0).astype('int64')

### Repeated cleaning for inference dataset

In [None]:
# Same cleaning steps as for the training data: drop the unused columns, treat NaN as 0
inferenceDataset = inferenceDataset.drop(columns=unnecessaryColumns).fillna(0.0)

# Binary label: 1 when any bug was recorded, otherwise 0
inferenceDataset['bugcount'] = inferenceDataset['bugcount'].ne(0).astype('int64')

# Separate the target ['bugcount'] from the features used for inference testing
inference_bugcount = inferenceDataset['bugcount']
inference_set = inferenceDataset.drop(columns=['bugcount'])

# Modelling

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Split the cleaned training data into the target (bugcount) and the feature matrix
# before any class balancing takes place
target_feature = trainingDataset['bugcount']
training_data = trainingDataset.drop(columns=['bugcount'])

# Class ratio of the target prior to balancing
print(target_feature.value_counts())

#### Standardizing features and splitting datasets

In [None]:
# Hold out 10% of the training data as a test set. The target is heavily imbalanced,
# so the split is stratified on it: train and test keep the same class proportions
# and the small test set cannot end up with too few (or zero) positive samples.
x_train, x_test, y_train, y_test = train_test_split(
    training_data,
    target_feature,
    test_size=0.1,
    random_state=100,
    stratify=target_feature,
)
# Remember the column labels; scaling below returns bare numpy arrays
features_names = x_train.columns

In [None]:
# Standardize every feature to zero mean and unit variance (the result is NOT bounded
# to a fixed interval such as [-1, 1]). This helps scale-sensitive models,
# e.g. SVMs, to weigh features comparably during training.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn mean and variance per feature from the training split only, then transform it
x_train = sc.fit_transform(x_train)
# Reuse those training statistics for the test and inference sets
x_test = sc.transform(x_test)
# Select the inference columns by name in training order so the scaler sees the
# same feature layout it was fitted on
inference_set = sc.transform(inference_set[features_names])

# The scaler returns numpy arrays; wrap them in DataFrames again with the feature names
x_train = pd.DataFrame(x_train, columns=features_names)
x_test = pd.DataFrame(x_test, columns=features_names)
inference_set = pd.DataFrame(inference_set, columns=features_names)

#### Synthetic Minority Oversampling Technique (SMOTE)

Bugcount is vastly imbalanced, this is solved by increasing the ratio of instances with found bugs. 

* Oversampling: SMOTE chooses random samples of minority class, finds similar instances of data using k-nearest neighbor and generates a new instance between the original sample and one random neighbor.

* Undersampling: Randomly undersamples data of the majority class, in this case instances where bugcount = 0

In [None]:
# Prepare the balancing pipeline; other sampling ratios can be tried here.
# Both samplers are seeded so that reruns produce the same resampled training set,
# in line with the seeded train/test split and cross-validation.
# 1) SMOTE synthesizes minority samples until minority/majority = 0.5
# 2) the majority class is then randomly undersampled until the ratio is 1
oversample = SMOTE(sampling_strategy=0.5, random_state=100)
undersample = RandomUnderSampler(sampling_strategy=1, random_state=100)
steps = [('over', oversample), ('under', undersample)]
pipeline = Pipeline(steps=steps)

# Balance the bugcount classes of the training split only (the test split stays untouched)
x_train, y_train = pipeline.fit_resample(x_train, y_train)
# Show the new, balanced class ratio
print(y_train.value_counts())

## Training and validation

#### RandomForestClassifier model, validated using k-fold Cross Validation

In [None]:
# Specify the classifier; seeded so reruns give the same forest and the same scores.
# 25 -> 100 estimators was only marginally better but had a large impact on compute.
rfc_model = RandomForestClassifier(n_estimators=25, random_state=100)

# Stratified 10-fold cross-validation (10 splits is a good bias-variance balance).
# 'f1_macro' is the plain, unweighted mean of the per-class F1 scores, so both classes
# count equally regardless of their size.
# NOTE: x_train/y_train were already resampled (SMOTE + undersampling) before this
# cell, so validation folds contain synthetic samples derived from training folds and
# this score is likely optimistic; the untouched x_test below is the fairer estimate.
crossValidation = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
scores = cross_val_score(rfc_model, x_train, y_train, scoring='f1_macro', cv=crossValidation, n_jobs=-1)
print(f"CrossVal mean f1_macro: {np.mean(scores): .3f}")

# Train the RandomForestClassifier on the full (resampled) training split
rfc_model.fit(x_train, y_train)

# Predict on the held-out test set; f1_score here is the binary F1 of the
# positive class (bugcount == 1), not the macro average used above
rfc_predictions = rfc_model.predict(x_test)
print(f"Trained model f1-score: {f1_score(y_test, rfc_predictions): .3f}")

#### Support Vector Classifier model

In [None]:
# Support vector classifier; the kernel may be swapped for 'linear', 'poly' or 'sigmoid'
svm_model = SVC(kernel='rbf').fit(x_train, y_train)

# Score the positive class on the held-out test split
svm_predictions = svm_model.predict(x_test)
svm_f1 = f1_score(y_true=y_test, y_pred=svm_predictions)
print("SVM f1 score:", svm_f1)

#### Naive Bayes model, GaussianNB

In [None]:
# Gaussian Naive Bayes, fitted on the same standardized, resampled training data
# as the other models
nb_model = GaussianNB().fit(x_train, y_train)

# Score the positive class on the held-out test split
nb_predictions = nb_model.predict(x_test)
nb_f1 = f1_score(y_true=y_test, y_pred=nb_predictions)
print("Naive Bayes f1 score:", nb_f1)

# Results

### Model performance from training

In [None]:
# Per-class precision/recall/F1 of the random forest on the held-out test split
rfc_report = classification_report(y_test, rfc_predictions, target_names=['no bugs', 'contains bugs'])
print(rfc_report)

In [None]:
# Per-class precision/recall/F1 of the support vector classifier on the held-out test split
svm_report = classification_report(y_test, svm_predictions, target_names=['no bugs', 'contains bugs'])
print(svm_report)

In [None]:
# Per-class precision/recall/F1 of the Gaussian Naive Bayes model on the held-out test split
nb_report = classification_report(y_test, nb_predictions, target_names=['no bugs', 'contains bugs'])
print(nb_report)

### Model performance on inference dataset

In [None]:
# Random forest applied to the inference dataset
# (the svm_model and nb_model cells below follow the same pattern)
rfc_inference = rfc_model.predict(inference_set)
rfc_inference_report = classification_report(
    inference_bugcount, rfc_inference, target_names=['no bugs', 'contains bugs']
)
print(rfc_inference_report)

In [None]:
# Support vector classifier applied to the inference dataset
svm_inference = svm_model.predict(inference_set)
svm_inference_report = classification_report(
    inference_bugcount, svm_inference, target_names=['no bugs', 'contains bugs']
)
print(svm_inference_report)

In [None]:
# Gaussian Naive Bayes applied to the inference dataset
nb_inference = nb_model.predict(inference_set)
nb_inference_report = classification_report(
    inference_bugcount, nb_inference, target_names=['no bugs', 'contains bugs']
)
print(nb_inference_report)

# Analysis

#### Calculate baseline: what are the odds of finding a bug in general?

In [None]:
# Baseline accuracy for a dataset's labels.
# For the training data use: target_feature | for the inference data use: inference_bugcount
class_counts = inference_bugcount.value_counts()
# Default to 0 so that a class missing from the labels does not yield None
# (which would break the arithmetic below)
count_zero = class_counts.get(0, 0)
count_one = class_counts.get(1, 0)
totalCount = count_one + count_zero
chance_zero = count_zero / totalCount
chance_one = count_one / totalCount
# A uniform coin-flip guess is right half of the time whatever the class balance:
# 0.5 * p(0) + 0.5 * p(1) = 0.5
randomGuessAccuracy = (chance_zero + chance_one) / 2
# Guessing each class with its own frequency is right with probability p(0)^2 + p(1)^2
weightedGuessAccuracy = (chance_one * chance_one) + (chance_zero * chance_zero)
print(f"Bugcount values, 0: {count_zero}, 1: {count_one}")
print(f"Probability of majority class, p(0): {chance_zero:.3f}")
print(f"Probability of minority class, p(1): {chance_one:.3f}")
print(f"Accuracy of a random guess: {randomGuessAccuracy:.3f}")
print(f"Accuracy of a weighted guess: {weightedGuessAccuracy:.3f}")

#### Feature importance from training

In [None]:
# Confusion matrix of the random forest on the held-out test split
ConfusionMatrixDisplay.from_estimator(estimator=rfc_model, X=x_test, y=y_test)

In [None]:
# Importance the trained random forest assigns to each feature, most important first
importances = rfc_model.feature_importances_
feature_importance = pd.Series(importances, index=x_train.columns).sort_values(ascending=False)
feature_importance