In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# skipCI dataset
columns = ['ci_skipped', 'ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev',
       'age', 'nuc', 'exp', 'rexp', 'sexp', 'TFC', 'is_doc', 'is_build',
       'is_meta', 'is_media', 'is_src', 'is_merge', 'FRM', 'COM', 'CFT',
       'classif', 'prev_com_res', 'proj_recent_skip', 'comm_recent_skip',
       'same_committer', 'is_fix', 'day_week', 'CM', 'commit_hash']

path = '/content/drive/MyDrive/CI/SkipCI-dataset'
path = '/mnt/d/PFE/Papers Presentations/1SkipCI/SkipCI/dataset/'

# projects list: 
# candybar-library.csv  GI.csv               mtsar.csv     ransack.csv     SemanticMediaWiki.csv
# contextlogger.csv     grammarviz2_src.csv  parallec.csv  SAX.csv         solr-iso639-filter.csv
# future.csv            groupdate.csv        pghero.csv    searchkick.csv  steve.csv

valid_proj = 'SemanticMediaWiki.csv'
cols_to_keep = 32


In [3]:
def train(X_train, X_val, y_train, y_val, df, eval_meth):
    print(eval_meth, m)
    if m=='rf':
        model = RandomForestClassifier(random_state=42)
    if m=='dt':
        model = DecisionTreeClassifier(random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Classification metrics calculations
    report = classification_report(y_val, y_pred)
    confusion = confusion_matrix(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred)

    print(report)
    print('Confusion Matrix')
    print(confusion)
    print('\nF1=%.3f' % (f1))
    print('\nAUC=%.3f' % (auc))

In [4]:
def within_eval(valid_proj):
    df = pd.read_csv(os.path.join(path, valid_proj))
    X = df.iloc[:,1:cols_to_keep]
    y = df.iloc[:,0].astype(int)

    from sklearn.model_selection import train_test_split
    X_train, X_val , y_train, y_val = train_test_split(np.array(X), np.array(y), test_size=0.2, shuffle=True, stratify=y, random_state=42) # keep ratio of classes in split

    eval_meth = f'within_proj_{valid_proj}'[:-4]

    train(X_train, X_val, y_train, y_val, df, eval_meth)

In [9]:
from imblearn.over_sampling import SMOTE

def cross_eval(valid_proj, oversample=0):

    df_train = pd.DataFrame(columns=columns, dtype='object')
    for dirname, _, filenames in os.walk(path):
        for filename in filenames:
            if filename[-4:]==".csv" and filename!=valid_proj:
                df_train = pd.concat([df_train, pd.read_csv(os.path.join(dirname, filename))])

    X_train = np.array(df_train.iloc[:,1:cols_to_keep])
    y_train = np.array(df_train.iloc[:,0].astype(int))

    df_val = pd.read_csv(os.path.join(path, valid_proj))
    df = df_val 

    X_val = np.array(df_val.iloc[:,1:cols_to_keep])
    y_val = np.array(df_val.iloc[:,0].astype(int))

    eval_meth = f'cross_proj_{valid_proj}'[:-4]

    if oversample:
        oversample = SMOTE()
        X_train, y_train = oversample.fit_resample(X_train, y_train)
        X_val, y_val = oversample.fit_resample(X_val, y_val)

    train(X_train, X_val, y_train, y_val, df, eval_meth)

In [6]:
m='rf'
within_eval("mtsar.csv")

within_proj_mtsar rf
              precision    recall  f1-score   support

           0       0.76      0.85      0.80        52
           1       0.62      0.48      0.54        27

    accuracy                           0.72        79
   macro avg       0.69      0.66      0.67        79
weighted avg       0.71      0.72      0.71        79

Confusion Matrix
[[44  8]
 [14 13]]

F1=0.542

AUC=0.664


In [10]:
m='rf'
cross_eval("pghero.csv", oversample=1)

cross_proj_pghero rf
              precision    recall  f1-score   support

           0       0.83      0.95      0.89       503
           1       0.94      0.81      0.87       503

    accuracy                           0.88      1006
   macro avg       0.89      0.88      0.88      1006
weighted avg       0.89      0.88      0.88      1006

Confusion Matrix
[[479  24]
 [ 98 405]]

F1=0.869

AUC=0.879


In [None]:
for valid_proj in ['candybar-library.csv','GI.csv', 'mtsar.csv', 'ransack.csv', 'SemanticMediaWiki.csv', 'contextlogger.csv', 'grammarviz2_src.csv', 'parallec.csv', 'SAX.csv', 'solr-iso639-filter.csv', 'future.csv', 'groupdate.csv', 'pghero.csv', 'searchkick.csv', 'steve.csv']:
    within_eval(valid_proj)
    #cross_eval(valid_proj)