In [237]:
import pandas as pd
import json

# DATASET IMPORT

# Load dataLinux: 10k builds from Linux Testers containing flaky runs and failure runs
dataLinux = pd.read_json('dataset.35past.Linux10k.json')

# Load dataPass: 2 builds from Linux Testers containing flaky runs, failure runs but more importantly all pass runs
dataPass = pd.read_json('dataset.pass.json')

# Load nft: List of Passing tests that are never found to be flaky or legit elsewhere
file121= open('nft-121.json')
file123= open('nft-123.json')
nft121 = json.load(file121)
nft123 = json.load(file123)
file121.close()
file123.close()

In [343]:
import numpy as np
from pprint import pprint
from collections import Counter
pd.set_option('display.max_colwidth', None)

# Get data about PASS (2), LEGIT (1) and FLAKY (0) run
dataPass = dataPass[(dataPass["label"] == 2) & (dataPass["testSource"] != "")]
dataFlaky = dataLinux[(dataLinux["label"] == 0) & (dataLinux["testSource"] != "")]
dataLegit = dataLinux[(dataLinux["label"] == 1) & (dataLinux["testSource"] != "")]

# Keep clean NFT
dataPass121 = dataPass[(dataPass["buildId"] == 121238) & (dataPass["testId"].isin(nft121))]
dataPass123 = dataPass[(dataPass["buildId"] == 123038) & (dataPass["testId"].isin(nft123))]
dataPasses = pd.concat([dataPass121, dataPass123])

# Building one set of pass legit and flaky
data = pd.concat([dataPasses, dataFlaky, dataLegit])
data["flakeRate"] = data["flakeRate"].fillna(0)
print("Data:", Counter(data["label"]))

# Split 80/20
dataTrain = data[data["buildId"] <= 121238]
dataTest = data[(data["buildId"] > 121238) & (data["buildId"] <= 123038)]


# TRAINING SET (Uncomment the RQ to check, comment the others)
# RQ 1
dataTrain = dataTrain.drop_duplicates(subset=["testSource", "label"], keep='first')
dataTrain["label"] = dataTrain["label"].map({0:1, 1:0, 2:0})

# RQ 2
# dataTrain["label"] = dataTrain["label"].map({0:1, 1:0, 2:0})

# RQ 3
# dataTrain["label"] = dataTrain["label"].map({0:1, 1:0, 2:0})


# TEST SET
dataTest = dataTest[(dataTest["label"] == 1) | (dataTest["label"] == 0)]
# Adapt labels: Flaky == 1, Legit == 0
dataTest["label"] = dataTest["label"].map({0:1, 1:0})

print("Data Train:", Counter(dataTrain["label"]))
print("Data Test:", Counter(dataTest["label"]))


Data: Counter({0: 1482476, 2: 306017, 1: 11643})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataTrain["label"] = dataTrain["label"].map({0:1, 1:0, 2:0})


Data Train: Counter({1: 1264973, 0: 187077})
Data Test: Counter({1: 217503, 0: 2320})


In [344]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, Binarizer

# Columns definition
testSource_col = "testSource"
cat_cols = ["testSuite"]
std_cols = ["flakeRate"]
other_cols = ["runDuration"]

# Feature transformation (Uncomment the RQ to check, comment the others)
# RQ1 or RQ2 
cols_trans = ColumnTransformer([
    ('testSource', CountVectorizer(max_features=100), testSource_col),
], remainder='drop')

# RQ 3 
# cols_trans = ColumnTransformer([
#     ('categories', OneHotEncoder(handle_unknown = "ignore"), cat_cols),
#     ('testSource', CountVectorizer(max_features=100), testSource_col),
# ], remainder='passthrough')

X_train = dataTrain[other_cols + std_cols + cat_cols + [testSource_col]]
X_test = dataTest[other_cols + std_cols + cat_cols + [testSource_col]]

y_train = dataTrain["label"]
y_test = dataTest["label"]

Train set:
Counter({1: 1264973, 0: 187077})
Test set:
Counter({1: 217503, 0: 2320})


In [345]:
# NORMAL PIPELINE
from numpy import mean
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, r2_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn import set_config
set_config(display="diagram")
from sklearn.base import TransformerMixin, BaseEstimator
from imblearn.ensemble import BalancedRandomForestClassifier 
from sklearn.ensemble import RandomForestClassifier

# Pipeline
smote = SMOTE(sampling_strategy=0.4)
featureSelection = SelectKBest(chi2, k=60)
rfc = BalancedRandomForestClassifier(n_estimators=200, n_jobs=14, verbose=1)

steps = [
    ('trans', cols_trans),
#     ('fs', featureSelection),
    ('s', smote), 
    ('m', rfc)
]
pipe = Pipeline(steps=steps)
display(pipe)

In [346]:
# Fit and test
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Scores
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nPrecision", precision)
print("Recall", recall)
print("MCC", mcc)
print("F1", f1)
print("R2", r2)

[Parallel(n_jobs=14)]: Using backend ThreadingBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done  22 tasks      | elapsed:  1.2min
[Parallel(n_jobs=14)]: Done 172 tasks      | elapsed: 10.6min
[Parallel(n_jobs=14)]: Done 200 out of 200 | elapsed: 11.6min finished
[Parallel(n_jobs=14)]: Using backend ThreadingBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done  22 tasks      | elapsed:    0.3s
[Parallel(n_jobs=14)]: Done 172 tasks      | elapsed:    1.0s
[Parallel(n_jobs=14)]: Done 200 out of 200 | elapsed:    1.2s finished



Precision 0.9954188542489429
Recall 0.9850162986257661
MCC 0.40010839105685064
F1 0.9901902558852688
R2 -0.8492583514900396


In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Confusion matrix
metrics.plot_confusion_matrix(pipe, X_test, y_test, normalize=None, cmap='Blues', 
                              display_labels=["Non-Flaky", "Flaky"], values_format = '.0f')
plt.show()

In [None]:
import numpy as np

# Information about false positives
fp = np.logical_and(y_test != y_pred, y_pred == 1)
tn = np.logical_and(y_test == y_pred, y_test == 0)
X_fp = X_test[fp]
X_tn = X_test[tn]
data_fp = dataTest.loc[X_fp.index]
data_fp_fr0 = data_fp[data_fp["flakeRate"] > 0]

print("Number of FP:", len(X_fp))
print("Number of FP:", len(X_tn))
print("Number of FP with flake Rate > 0:", len(data_fp_fr0))
print("FPR:", len(X_fp) / (len(X_fp) + len(X_tn)))