In [None]:
import pandas as pd
# Read the JSON dataset into a DataFrame; the path is relative to the
# notebook's working directory.
DATASET_FILE = 'dataset.35past.Windows.json'
data = pd.read_json(DATASET_FILE)
# (rows, columns) of the frame as loaded, shown as the cell output.
data.shape

In [None]:
import numpy as np
from pprint import pprint
from collections import Counter
# CLEAN DATA

# Remove test_fail 
# data = data[~data["testId"].isin(["telemetry_unittests_ninja://chrome/test:telemetry_unittests/unit_tests_test.ExampleTests.test_fail"])]

# Row selection. Two groups of rows survive:
#   * every row whose label is 0 (the split cell below names this class "flaky"),
#   * rows whose label is 1 (named "legit" below) AND whose flakeRate is exactly 0.
# Label-1 rows with a non-zero flakeRate are discarded.
is_label_0 = data["label"] == 0
is_label_1_with_zero_flake_rate = (data["label"] == 1) & (data["flakeRate"] == 0)
data = data[is_label_1_with_zero_flake_rate | is_label_0]
# data = data.drop(data[((data['flakeRate'] == 0) & (data["label"] == 0))].sample(frac=.95).index)

# Drop test duplicates
# data = data.drop_duplicates(subset=["testId"])

# Turn +inf / -inf into NaN, then drop every COLUMN (axis=1) that contains at
# least one NaN. Rows are never dropped by this step.
infinities = [np.inf, -np.inf]
data = data.replace(infinities, np.nan)
data = data.dropna(axis=1)

# Specific test suite
# Linux. GUI -> GUI 
# data = data[data["testSuite"].isin(["blink_web_tests", "not_site_per_process_blink_web_tests", "vulkan_swiftshader_blink_web_tests", "non_skia_renderer_swiftshader_blink_web_tests", "interactive_ui_tests"])]
# Linux. Integration -> Integration 
# data = data[data["testSuite"].isin(["browser_tests","sync_integration_tests","content_browsertests","weblayer_browsertests","extensions_browsertests","headless_browsertests","components_browsertests"])]
# Linux. Unit -> Unit
# Keep only rows whose testSuite is NOT one of the suites listed in the
# "Integration" and "GUI" Linux variants above, i.e. whatever is left over.
# NOTE(review): the file loaded at the top of the notebook has "Windows" in its
# name, while this is the list labelled Linux (the Windows variant further down
# is commented out) - confirm that this combination is intended.
excluded_suites = [
    # suites from the Linux "Integration" list
    "browser_tests",
    "sync_integration_tests",
    "content_browsertests",
    "weblayer_browsertests",
    "extensions_browsertests",
    "headless_browsertests",
    "components_browsertests",
    # suites from the Linux "GUI" list
    "blink_web_tests",
    "not_site_per_process_blink_web_tests",
    "vulkan_swiftshader_blink_web_tests",
    "non_skia_renderer_swiftshader_blink_web_tests",
    "interactive_ui_tests",
]
in_excluded_suite = data["testSuite"].isin(excluded_suites)
data = data[~in_excluded_suite]

# Windows. GUI -> GUI 
# data = data[data["testSuite"].isin(["blink_web_tests", "pixel_browser_tests", "interactive_ui_tests", "non_skia_renderer_content_browsertests"])]
# Windows. Integration -> Integration 
# data = data[data["testSuite"].isin(["browser_tests","content_browsertests","sbox_integration_tests","extensions_browsertests","cronet_tests","sync_integration_tests","headless_browsertests"])]
# Windows. Unit -> Unit 
# data = data[~data["testSuite"].isin(["browser_tests","content_browsertests","sbox_integration_tests","extensions_browsertests","cronet_tests","sync_integration_tests","headless_browsertests", "blink_web_tests", "pixel_browser_tests", "interactive_ui_tests", "non_skia_renderer_content_browsertests"])]

# Window-based
# Linux first 500
# data = data[(data["buildId"] >= 98019) & (data["buildId"] <= 98419)]
# Linux last 500
# data = data[(data["buildId"] > 98419)]
# Renumber the surviving rows 0..n-1; the previous index is discarded, not
# kept as a column.
data = data.reset_index(drop=True)
# Number of rows per label value after cleaning.
label_counts = Counter(data["label"])
print(label_counts)
# (rows, columns) after cleaning, shown as the cell output.
data.shape

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, Binarizer

# Columns definition
# Text columns: each one gets its own TF-IDF vectorizer below.
testSource_col, stackTrace_col = "testSource", "stackTrace"
command_col, stderr_col, crashlog_col = "command", "stderr", "crashlog"
# Columns that are one-hot encoded.
cat_cols = ["runStatus", "runTagStatus", "testSuite"]
# Columns with no dedicated transformer; they reach the model through the
# transformer's passthrough remainder.
std_cols = ["flakeRate"]
other_cols = ["runDuration"]
# Length columns: defined here but not included in X by this cell.
bin_cols = ['stackTraceLength', 'commandLength', 'stderrLength', 'crashlogLength', 'testSourceLength']

# Fair features
text_feature_cols = [testSource_col, stackTrace_col, command_col, stderr_col, crashlog_col]
X = data[cat_cols + other_cols + std_cols + text_feature_cols]
y = data['label']

# Columns transformers
# Order in which the TF-IDF blocks appear in the transformed matrix. Each
# transformer is named after its column, and the column is passed as a plain
# string (not a list) so the vectorizer receives a 1-D sequence of documents.
tfidf_cols = [stackTrace_col, command_col, stderr_col, crashlog_col, testSource_col]
transformer_specs = [('categories', OneHotEncoder(handle_unknown = "ignore"), cat_cols)]
transformer_specs += [
    (text_col, TfidfVectorizer(max_features=100), text_col)
    for text_col in tfidf_cols
]
# remainder='passthrough': columns of X not named above are appended unchanged.
cols_trans = ColumnTransformer(transformer_specs, remainder='passthrough')

# Voc features
# X = data[[testSource_col]]
# y = data['label']

# # # Columns transformers
# cols_trans = ColumnTransformer([
#     ('testSource', CountVectorizer(), testSource_col),
#     ], remainder='drop')


In [None]:
# Split Train and Test set
from sklearn.model_selection import train_test_split

# Switch between the two splitting strategies below.
timeSensitive = False

# Fraction of the rows held out for testing (shared by both strategies).
ts = 0.20
# Fixed seed so the shuffled split is identical after Restart & Run All.
SPLIT_SEED = 42

# TIME SENSITIVE
# Each class is split separately WITHOUT shuffling: the first (1 - ts) of its
# rows go to train and the last ts to test, then the two classes are
# concatenated. NOTE(review): this only respects time if the rows are already
# in chronological order - confirm against how the dataset was exported.
if timeSensitive:
    # Boolean masks select rows by label. The previous `X.iloc[y_x.index, :]`
    # passed index LABELS to a POSITIONAL indexer, which is only correct while
    # the index happens to be exactly 0..n-1.
    flaky_mask = y == 0
    legit_mask = y == 1
    y_flaky = y[flaky_mask]
    y_legit = y[legit_mask]
    X_legit = X[legit_mask]
    X_flaky = X[flaky_mask]

    X_flaky_train, X_flaky_test, y_flaky_train, y_flaky_test = train_test_split(X_flaky, y_flaky, test_size=ts, shuffle=False)
    X_legit_train, X_legit_test, y_legit_train, y_legit_test = train_test_split(X_legit, y_legit, test_size=ts, shuffle=False)

    X_train = pd.concat([X_flaky_train, X_legit_train])
    y_train = pd.concat([y_flaky_train, y_legit_train])
    X_test = pd.concat([X_flaky_test, X_legit_test])
    y_test = pd.concat([y_flaky_test, y_legit_test])

# NORMAL SPLIT
# Shuffled split, stratified on y so both sets keep the overall class ratio.
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ts, shuffle=True, stratify=y, random_state=SPLIT_SEED
    )

# GENERAL INFO
print("Train set:")
print(Counter(y_train))
print("Test set:")
print(Counter(y_test))

In [None]:
# GRID SEARCH
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
import numpy as np

# Scoring functions
# def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
# def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
# def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
# def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]
# def recall(y_true, y_pred): return tp(y_true, y_pred) / (tp(y_true, y_pred) + fn(y_true, y_pred))
# def precision(y_true, y_pred): return tp(y_true, y_pred) / (tp(y_true, y_pred) + fp(y_true, y_pred))
# def f1(y_true, y_pred): return 2*tp(y_true, y_pred) / (2*tp(y_true, y_pred) + fp(y_true, y_pred) + fn(y_true, y_pred))
# def mcc(y_true, y_pred): return (tp(y_true, y_pred) * tn(y_true, y_pred) - fp(y_true, y_pred) * fn(y_true, y_pred)) / np.sqrt((tp(y_true, y_pred) + fp(y_true, y_pred)) * (tp(y_true, y_pred) + fn(y_true, y_pred)) * (tn(y_true, y_pred) + fp(y_true, y_pred)) * (tn(y_true, y_pred) + fn(y_true, y_pred)))
# def displayScores(scores, title):
#     print("\nMetric: ", title)
#     print("Scores: ", scores)
#     print("Accuracy: %0.2f (+/- %0.2f)" % (np.nanmean(scores), np.nanstd(scores) * 2))

# # Model with cross validation
# scoring = {
#     'precision': make_scorer(precision), 
#     'recall': make_scorer(recall), 
#     'f1': make_scorer(f1), 
#     'mcc': make_scorer(mcc)
# }

# # scores = cross_validate(pipe, X, y, cv=5, scoring=scoring, n_jobs=14)
# # displayScores(scores['test_precision'], "Precision")
# # displayScores(scores['test_recall'], "Recall")
# # displayScores(scores['test_f1'], "F1")
# # displayScores(scores['test_mcc'], "MCC")

# # Grid search
# param_grid = {
#     "fs__k": [50, 100, 200]
# }
# search = GridSearchCV(pipe, param_grid, cv=2, verbose=1, n_jobs=14)
# search.fit(X_train, y_train)
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# print(search.best_params_)

In [None]:
# NORMAL PIPELINE
from numpy import mean
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, r2_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn import set_config
# Ask scikit-learn to render estimators as a diagram when they are displayed.
set_config(display="diagram")
from sklearn.base import TransformerMixin, BaseEstimator
from imblearn.ensemble import BalancedRandomForestClassifier 

# Pipeline
# Fixed seed for the stochastic steps (SMOTE's synthetic samples and the
# forest's bootstrapping) so scores and importances are reproducible.
MODEL_SEED = 42

# Oversample the minority class until the minority/majority ratio is 1.
smote = SMOTE(sampling_strategy=1, random_state=MODEL_SEED)
# Keep the 200 features with the highest chi-squared score; chi2 requires
# non-negative feature values.
featureSelection = SelectKBest(chi2, k=200)
rfc = BalancedRandomForestClassifier(n_estimators=100, random_state=MODEL_SEED)

# Step names matter: later cells look steps up as named_steps["trans"],
# named_steps["fs"] and named_steps["m"].
steps = [
    ('trans', cols_trans),     # column-wise encoding / vectorisation
    ('fs', featureSelection),  # univariate feature selection
    ('s', smote),              # resampling step
    ('m', rfc),                # final classifier
]
pipe = Pipeline(steps=steps)
display(pipe)

In [None]:
# Fit the whole pipeline on the training split, then predict labels for the
# held-out split (fit returns the pipeline itself, so the calls can be chained).
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [None]:
# Selected features
selector_step = pipe.named_steps["fs"]
transformer_step = pipe.named_steps["trans"]

# Boolean array: True where the selector kept the corresponding input feature.
mask = selector_step.get_support()

# Names of all features produced by the column transformer, in output order.
# `get_feature_names()` is deprecated and missing from newer scikit-learn
# releases; prefer `get_feature_names_out()` and fall back to the old method
# on versions that predate it.
if hasattr(transformer_step, "get_feature_names_out"):
    feature_names = list(transformer_step.get_feature_names_out())
else:
    feature_names = transformer_step.get_feature_names()
print(len(feature_names))

# The K best features: names whose mask entry is True (order preserved).
new_features = [name for name, is_kept in zip(feature_names, mask) if is_kept]
pprint(new_features)

In [None]:
#Scores
# Every metric compares the held-out labels (y_test) with the pipeline's
# predictions (y_pred), using each scorer's default settings.
precision, recall = precision_score(y_test, y_pred), recall_score(y_test, y_pred)
mcc, f1 = matthews_corrcoef(y_test, y_pred), f1_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print one "<name> <value>" line per metric; the first is preceded by a blank line.
score_report = (
    ("\nPrecision", precision),
    ("Recall", recall),
    ("MCC", mcc),
    ("F1", f1),
    ("R2", r2),
)
for metric_label, metric_value in score_report:
    print(metric_label, metric_value)

In [None]:
#Features importance
# The selector's support mask has one entry per feature it received, so its
# size is the feature count before selection. This avoids calling the
# transformer's `get_feature_names()`, which newer scikit-learn releases no
# longer provide.
n_features_before_selection = pipe.named_steps["fs"].get_support().size
print("Number of features before feature selection:", n_features_before_selection)
# In case of no feature selection
# zipped = zip(pipe.named_steps["trans"].get_feature_names(), pipe.named_steps["m"].feature_importances_)

# In case of feature selection
# `new_features` comes from the "Selected features" cell; it lists the kept
# features in the same order as the classifier's importances.
zipped = zip(new_features, pipe.named_steps["m"].feature_importances_)

# Most important feature first.
zipped = sorted(zipped, key=lambda pair: pair[1], reverse=True)
for feature, importance in zipped:
    print('{}: {}'.format(feature, importance))