# Mount drive

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset path used further down resolves.
# force_remount=True asks Colab for a fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Install dependencies

In [2]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


# Import libraries

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import gensim.downloader
from lazypredict.Supervised import LazyClassifier

# Fix the seed of every global RNG used in this notebook so runs are repeatable
import random

SEED = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

# Load data

In [30]:
# Names of the text column (model input) and of the class-label column (target).
X_column = 'RequirementText'
y_column = 'Class'

# NOTE(review): the constant is called CLASSICAL_PATH but points at quantum.csv —
# confirm this is the intended dataset (or rename the constant).
CLASSICAL_PATH = '/content/drive/MyDrive/Tesi/Master-Thesis/data/processed/quantum.csv'

# index_col=False: do not promote the first CSV column to the DataFrame index.
df = pd.read_csv(CLASSICAL_PATH, index_col=False)

# Vectorize train set

In [31]:
def getTrainSet(df, vectorization_technique='BoW', max_features=100, text_column=None):
    """Turn the requirement texts of ``df`` into a numeric feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw texts.
    vectorization_technique : str, default 'BoW'
        'BoW'      -> term counts from CountVectorizer.
        'TFIDF'    -> TF-IDF weights computed from those term counts.
        'Word2Vec' -> mean of the pretrained 'word2vec-google-news-300' vectors
                      of the whitespace-separated tokens known to the model
                      (all-zero row when no token is known).
    max_features : int, default 100
        Vocabulary cap handed to CountVectorizer (BoW / TFIDF only).
    text_column : str, optional
        Column to vectorize; falls back to the module-level ``X_column``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``. Column labels are always strings
        ('0', '1', ...) regardless of the technique.

    Raises
    ------
    ValueError
        If ``vectorization_technique`` is not one of the supported names.

    Notes
    -----
    For 'BoW' and 'TFIDF' the vocabulary / IDF weights are fitted on every row
    of ``df``. If this is called before a train/test split, the test rows
    contribute to the fitted features.
    """
    # Reject bad input before any expensive work (e.g. the Word2Vec download).
    if vectorization_technique not in ('BoW', 'TFIDF', 'Word2Vec'):
        raise ValueError("Invalid vectorization technique. Choose from 'BoW', 'TFIDF', or 'Word2Vec'.")

    texts = df[X_column if text_column is None else text_column]

    if vectorization_technique == 'Word2Vec':
        w2v_dim = 300  # dimensionality of 'word2vec-google-news-300'
        w2v_pretrained = gensim.downloader.load('word2vec-google-news-300')
        rows = []
        for msg in texts:
            vecs = [w2v_pretrained[word][:w2v_dim] for word in msg.split() if word in w2v_pretrained]
            # Average the known word vectors; fall back to a zero vector when none is known.
            rows.append(sum(vecs) / len(vecs) if vecs else np.zeros(w2v_dim, dtype=np.float32))
        vectors = pd.DataFrame(rows)
    else:
        # Keep the counts sparse until the end; TfidfTransformer accepts sparse input.
        counts = CountVectorizer(max_features=max_features).fit_transform(texts)
        if vectorization_technique == 'TFIDF':
            counts = TfidfTransformer().fit_transform(counts)
        vectors = pd.DataFrame(counts.toarray())

    # Same string column labels for every technique.
    vectors.columns = vectors.columns.astype(str)
    return vectors

vect_tech = 'BoW'

# Feature matrix built from the requirement texts
X = getTrainSet(df, vectorization_technique=vect_tech)

# One-vs-rest binary targets: 1 where the row's class equals the code, 0 otherwise
y_se = df[y_column].eq('SE').astype(int)
y_us = df[y_column].eq('US').astype(int)
y_o = df[y_column].eq('O').astype(int)
y_pe = df[y_column].eq('PE').astype(int)

# NOTE: the three helpers below are superseded by getTrainSet() and are kept only for reference.
# def getTrainSetBoW(df):
#     countvec = CountVectorizer(max_features=100)
#     bow = countvec.fit_transform(df[X_column]).toarray()
#     training_data = pd.DataFrame(bow)
#     training_data.columns = training_data.columns.astype(str)
#     return training_data

# def getTrainSetTFIDF(df):
#     countvec = CountVectorizer(max_features=100)
#     bow = countvec.fit_transform(df[X_column]).toarray()
#     tfidfconverter = TfidfTransformer()
#     X = tfidfconverter.fit_transform(bow).toarray()
#     training_data = pd.DataFrame(X)
#     training_data.columns = training_data.columns.astype(str)
#     return training_data

# def getTrainSetWord2Vec(df):
#     w2v_pretrained = gensim.downloader.load('word2vec-google-news-300')
#     training_data = []
#     for msg in df[X_column]:
#         words = msg.split()
#         vecs = []
#         for word in words:
#             if word in w2v_pretrained:
#                 vecs.append(w2v_pretrained[word][:300])
#         if vecs:
#             vec_avg = sum(vecs) / len(vecs)
#         else:
#             vec_avg = [0] * 300
#         training_data.append(vec_avg)

#     training_data = pd.DataFrame(training_data)
#     training_data.columns = training_data.columns.astype(str)
#     return training_data

# Training

In [32]:
def custom_metrics(y_true, y_pred):
    """Return precision and recall of ``y_pred`` against ``y_true``.

    The result is a dict with the keys "precision" and "recall", computed
    with scikit-learn's ``precision_score`` and ``recall_score`` defaults.
    """
    scorers = (("precision", precision_score), ("recall", recall_score))
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [33]:
# One results table per binary target, in the order SE, US, O, PE.
global_results = []

# NOTE(review): X is vectorized once on the whole dataset before the folds are
# drawn, so for BoW/TFIDF each fold's features were fitted with its own test rows.
# NOTE(review): StratifiedKFold is imported but plain KFold is used; the
# one-vs-rest labels are imbalanced, so consider stratifying — confirm intent.
for y in [y_se, y_us, y_o, y_pe]:
    # Same seed for every target, so all targets are evaluated on identical splits.
    fold = KFold(n_splits=10, random_state=SEED, shuffle=True)
    records = []
    for foldcounter, (train_index, test_index) in enumerate(fold.split(X, y), start=1):
        print("Processing Fold "+ str(foldcounter) + " ...")
        # KFold yields positional indices, so select rows by position.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = LazyClassifier(verbose=-100, ignore_warnings=True, custom_metric=custom_metrics)
        # `models` is the per-model leaderboard for this fold, indexed by model name.
        models, predictions = clf.fit(X_train, X_test, y_train, y_test)
        for model_name, scores in models.iterrows():
            # Read scores by column label instead of by position.
            records.append({
                "Fold": foldcounter,
                "Model": model_name,
                "Accuracy": round(scores["Accuracy"], 3),
                "F1-Score": round(scores["F1 Score"], 3)})
    # Build the frame once per target; row-by-row DataFrame.append is quadratic
    # and no longer exists in pandas >= 2.0.
    results = pd.DataFrame(records, columns=["Fold", "Model", "Accuracy", "F1-Score"])
    global_results.append(results)

Processing Fold 1 ...


100%|██████████| 29/29 [00:01<00:00, 15.94it/s]


[LightGBM] [Info] Number of positive: 113, number of negative: 205
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.355346 -> initscore=-0.595622
[LightGBM] [Info] Start training from score -0.595622
Processing Fold 2 ...


100%|██████████| 29/29 [00:01<00:00, 15.22it/s]


[LightGBM] [Info] Number of positive: 113, number of negative: 205
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.355346 -> initscore=-0.595622
[LightGBM] [Info] Start training from score -0.595622
Processing Fold 3 ...


100%|██████████| 29/29 [00:02<00:00, 10.91it/s]


[LightGBM] [Info] Number of positive: 116, number of negative: 202
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364780 -> initscore=-0.554678
[LightGBM] [Info] Start training from score -0.554678
Processing Fold 4 ...


100%|██████████| 29/29 [00:02<00:00, 10.79it/s]


[LightGBM] [Info] Number of positive: 112, number of negative: 206
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.352201 -> initscore=-0.609377
[LightGBM] [Info] Start training from score -0.609377
Processing Fold 5 ...


100%|██████████| 29/29 [00:02<00:00, 13.59it/s]


[LightGBM] [Info] Number of positive: 113, number of negative: 206
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354232 -> initscore=-0.600488
[LightGBM] [Info] Start training from score -0.600488
Processing Fold 6 ...


100%|██████████| 29/29 [00:01<00:00, 17.96it/s]


[LightGBM] [Info] Number of positive: 112, number of negative: 207
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.351097 -> initscore=-0.614220
[LightGBM] [Info] Start training from score -0.614220
Processing Fold 7 ...


100%|██████████| 29/29 [00:01<00:00, 16.64it/s]


[LightGBM] [Info] Number of positive: 111, number of negative: 208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.347962 -> initscore=-0.628008
[LightGBM] [Info] Start training from score -0.628008
Processing Fold 8 ...


100%|██████████| 29/29 [00:01<00:00, 16.61it/s]


[LightGBM] [Info] Number of positive: 111, number of negative: 208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.347962 -> initscore=-0.628008
[LightGBM] [Info] Start training from score -0.628008
Processing Fold 9 ...


100%|██████████| 29/29 [00:01<00:00, 16.99it/s]


[LightGBM] [Info] Number of positive: 114, number of negative: 205
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357367 -> initscore=-0.586812
[LightGBM] [Info] Start training from score -0.586812
Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 16.27it/s]


[LightGBM] [Info] Number of positive: 119, number of negative: 200
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.373041 -> initscore=-0.519194
[LightGBM] [Info] Start training from score -0.519194
Processing Fold 1 ...


100%|██████████| 29/29 [00:02<00:00, 10.38it/s]


[LightGBM] [Info] Number of positive: 77, number of negative: 241
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242138 -> initscore=-1.140992
[LightGBM] [Info] Start training from score -1.140992
Processing Fold 2 ...


100%|██████████| 29/29 [00:03<00:00,  9.25it/s]


[LightGBM] [Info] Number of positive: 76, number of negative: 242
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.238994 -> initscore=-1.158204
[LightGBM] [Info] Start training from score -1.158204
Processing Fold 3 ...


100%|██████████| 29/29 [00:02<00:00, 13.10it/s]


[LightGBM] [Info] Number of positive: 71, number of negative: 247
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.223270 -> initscore=-1.246708
[LightGBM] [Info] Start training from score -1.246708
Processing Fold 4 ...


100%|██████████| 29/29 [00:01<00:00, 15.76it/s]


[LightGBM] [Info] Number of positive: 74, number of negative: 244
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.232704 -> initscore=-1.193103
[LightGBM] [Info] Start training from score -1.193103
Processing Fold 5 ...


100%|██████████| 29/29 [00:01<00:00, 16.44it/s]


[LightGBM] [Info] Number of positive: 80, number of negative: 239
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250784 -> initscore=-1.094437
[LightGBM] [Info] Start training from score -1.094437
Processing Fold 6 ...


100%|██████████| 29/29 [00:01<00:00, 17.23it/s]


[LightGBM] [Info] Number of positive: 76, number of negative: 243
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.238245 -> initscore=-1.162328
[LightGBM] [Info] Start training from score -1.162328
Processing Fold 7 ...


100%|██████████| 29/29 [00:01<00:00, 15.97it/s]


[LightGBM] [Info] Number of positive: 79, number of negative: 240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247649 -> initscore=-1.111191
[LightGBM] [Info] Start training from score -1.111191
Processing Fold 8 ...


100%|██████████| 29/29 [00:01<00:00, 15.06it/s]


[LightGBM] [Info] Number of positive: 78, number of negative: 241
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244514 -> initscore=-1.128088
[LightGBM] [Info] Start training from score -1.128088
Processing Fold 9 ...


100%|██████████| 29/29 [00:02<00:00, 10.42it/s]


[LightGBM] [Info] Number of positive: 79, number of negative: 240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247649 -> initscore=-1.111191
[LightGBM] [Info] Start training from score -1.111191
Processing Fold 10 ...


100%|██████████| 29/29 [00:02<00:00, 10.04it/s]


[LightGBM] [Info] Number of positive: 75, number of negative: 244
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.235110 -> initscore=-1.179680
[LightGBM] [Info] Start training from score -1.179680
Processing Fold 1 ...


100%|██████████| 29/29 [00:02<00:00, 13.85it/s]


[LightGBM] [Info] Number of positive: 70, number of negative: 248
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.220126 -> initscore=-1.264934
[LightGBM] [Info] Start training from score -1.264934
Processing Fold 2 ...


100%|██████████| 29/29 [00:01<00:00, 17.88it/s]


[LightGBM] [Info] Number of positive: 69, number of negative: 249
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216981 -> initscore=-1.283346
[LightGBM] [Info] Start training from score -1.283346
Processing Fold 3 ...


100%|██████████| 29/29 [00:01<00:00, 16.04it/s]


[LightGBM] [Info] Number of positive: 71, number of negative: 247
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.223270 -> initscore=-1.246708
[LightGBM] [Info] Start training from score -1.246708
Processing Fold 4 ...


100%|██████████| 29/29 [00:01<00:00, 16.54it/s]


[LightGBM] [Info] Number of positive: 70, number of negative: 248
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.220126 -> initscore=-1.264934
[LightGBM] [Info] Start training from score -1.264934
Processing Fold 5 ...


100%|██████████| 29/29 [00:01<00:00, 16.51it/s]


[LightGBM] [Info] Number of positive: 64, number of negative: 255
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.200627 -> initscore=-1.382380
[LightGBM] [Info] Start training from score -1.382380
Processing Fold 6 ...


100%|██████████| 29/29 [00:02<00:00, 13.75it/s]


[LightGBM] [Info] Number of positive: 68, number of negative: 251
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.213166 -> initscore=-1.305945
[LightGBM] [Info] Start training from score -1.305945
Processing Fold 7 ...


100%|██████████| 29/29 [00:02<00:00, 10.10it/s]


[LightGBM] [Info] Number of positive: 73, number of negative: 246
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.228840 -> initscore=-1.214872
[LightGBM] [Info] Start training from score -1.214872
Processing Fold 8 ...


100%|██████████| 29/29 [00:02<00:00, 10.28it/s]


[LightGBM] [Info] Number of positive: 70, number of negative: 249
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.219436 -> initscore=-1.268958
[LightGBM] [Info] Start training from score -1.268958
Processing Fold 9 ...


100%|██████████| 29/29 [00:02<00:00, 12.94it/s]


[LightGBM] [Info] Number of positive: 70, number of negative: 249
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.219436 -> initscore=-1.268958
[LightGBM] [Info] Start training from score -1.268958
Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 16.03it/s]


[LightGBM] [Info] Number of positive: 68, number of negative: 251
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.213166 -> initscore=-1.305945
[LightGBM] [Info] Start training from score -1.305945
Processing Fold 1 ...


100%|██████████| 29/29 [00:01<00:00, 16.24it/s]


[LightGBM] [Info] Number of positive: 58, number of negative: 260
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.182390 -> initscore=-1.500239
[LightGBM] [Info] Start training from score -1.500239
Processing Fold 2 ...


100%|██████████| 29/29 [00:01<00:00, 16.04it/s]


[LightGBM] [Info] Number of positive: 60, number of negative: 258
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.188679 -> initscore=-1.458615
[LightGBM] [Info] Start training from score -1.458615
Processing Fold 3 ...


100%|██████████| 29/29 [00:01<00:00, 16.88it/s]


[LightGBM] [Info] Number of positive: 60, number of negative: 258
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.188679 -> initscore=-1.458615
[LightGBM] [Info] Start training from score -1.458615
Processing Fold 4 ...


100%|██████████| 29/29 [00:01<00:00, 15.86it/s]


[LightGBM] [Info] Number of positive: 62, number of negative: 256
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194969 -> initscore=-1.418043
[LightGBM] [Info] Start training from score -1.418043
Processing Fold 5 ...


100%|██████████| 29/29 [00:02<00:00, 10.80it/s]


[LightGBM] [Info] Number of positive: 62, number of negative: 257
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194357 -> initscore=-1.421942
[LightGBM] [Info] Start training from score -1.421942
Processing Fold 6 ...


100%|██████████| 29/29 [00:03<00:00,  9.57it/s]


[LightGBM] [Info] Number of positive: 63, number of negative: 256
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197492 -> initscore=-1.402043
[LightGBM] [Info] Start training from score -1.402043
Processing Fold 7 ...


100%|██████████| 29/29 [00:02<00:00, 13.39it/s]


[LightGBM] [Info] Number of positive: 56, number of negative: 263
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175549 -> initscore=-1.546802
[LightGBM] [Info] Start training from score -1.546802
Processing Fold 8 ...


100%|██████████| 29/29 [00:01<00:00, 16.36it/s]


[LightGBM] [Info] Number of positive: 60, number of negative: 259
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.188088 -> initscore=-1.462483
[LightGBM] [Info] Start training from score -1.462483
Processing Fold 9 ...


100%|██████████| 29/29 [00:01<00:00, 17.32it/s]


[LightGBM] [Info] Number of positive: 56, number of negative: 263
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175549 -> initscore=-1.546802
[LightGBM] [Info] Start training from score -1.546802
Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 16.80it/s]

[LightGBM] [Info] Number of positive: 57, number of negative: 262
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.178683 -> initscore=-1.525293
[LightGBM] [Info] Start training from score -1.525293





In [34]:
# `models` is whatever the most recent clf.fit call in the training loop returned,
# i.e. the leaderboard of the last fold of the last target (y_pe) only, not an aggregate.
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,custom_metrics,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NearestCentroid,0.97,0.98,0.98,0.97,"{'precision': 0.9, 'recall': 1.0}",0.03
BernoulliNB,0.94,0.93,0.93,0.94,"{'precision': 0.8888888888888888, 'recall': 0....",0.02
ExtraTreeClassifier,0.89,0.89,0.89,0.89,"{'precision': 0.7272727272727273, 'recall': 0....",0.02
RandomForestClassifier,0.91,0.87,0.87,0.91,"{'precision': 0.875, 'recall': 0.7777777777777...",0.24
BaggingClassifier,0.86,0.87,0.87,0.86,"{'precision': 0.6666666666666666, 'recall': 0....",0.06
DecisionTreeClassifier,0.86,0.87,0.87,0.86,"{'precision': 0.6666666666666666, 'recall': 0....",0.02
LGBMClassifier,0.91,0.83,0.83,0.91,"{'precision': 1.0, 'recall': 0.6666666666666666}",0.09
XGBClassifier,0.91,0.83,0.83,0.91,"{'precision': 1.0, 'recall': 0.6666666666666666}",0.06
ExtraTreesClassifier,0.83,0.81,0.81,0.83,"{'precision': 0.6363636363636364, 'recall': 0....",0.16
Perceptron,0.83,0.78,0.78,0.83,"{'precision': 0.6666666666666666, 'recall': 0....",0.02


# Security requirements

In [35]:
grouped = global_results[0].groupby("Model")
# Mean accuracy / F1 per model across folds, best accuracy first
fold_means = grouped[["Accuracy", "F1-Score"]].mean()
print(fold_means.sort_values(by="Accuracy", ascending=False), "\n\n")

# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
BaggingClassifier                  0.87      0.87
BernoulliNB                        0.87      0.87
AdaBoostClassifier                 0.87      0.87
ExtraTreesClassifier               0.87      0.87
RandomForestClassifier             0.86      0.86
NearestCentroid                    0.86      0.86
XGBClassifier                      0.86      0.85
SGDClassifier                      0.85      0.85
CategoricalNB                      0.85      0.84
LogisticRegression                 0.84      0.85
NuSVC                              0.84      0.84
SVC                                0.84      0.84
Perceptron                         0.84      0.84
RidgeClassifier                    0.84      0.83
LinearDiscriminantAnalysis         0.84      0.83
RidgeClassifierCV                  0.83      0.83
PassiveAggressiveClassifier        0.83      0.83
CalibratedClassifierCV             0.82      0.81


# Usability requirements

In [36]:
# Rank the models by their mean Accuracy (highest first) for the result table
# at index 1 of `global_results`, reporting the mean F1-Score alongside.
# NOTE(review): the section title suggests index 1 holds the Usability-requirements
# results -- confirm against the cell where `global_results` is built.
grouped = global_results[1].groupby("Model")
mean_scores = grouped[["Accuracy", "F1-Score"]].mean()
ranked_scores = mean_scores.sort_values(by="Accuracy", ascending=False)
print(ranked_scores, "\n\n")

# Debug aid (disabled): dump every row belonging to each model's group.
# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
NuSVC                              0.86      0.79
RandomForestClassifier             0.85      0.84
RidgeClassifierCV                  0.84      0.83
SVC                                0.83      0.80
KNeighborsClassifier               0.83      0.82
NearestCentroid                    0.83      0.83
BernoulliNB                        0.83      0.82
XGBClassifier                      0.83      0.82
ExtraTreesClassifier               0.83      0.82
LinearDiscriminantAnalysis         0.82      0.82
LogisticRegression                 0.82      0.82
RidgeClassifier                    0.82      0.82
AdaBoostClassifier                 0.81      0.81
SGDClassifier                      0.80      0.80
CategoricalNB                      0.80      0.74
BaggingClassifier                  0.80      0.79
Perceptron                         0.79      0.80
LGBMClassifier                     0.79      0.76


# Operational requirements

In [37]:
# Rank the models by their mean Accuracy (highest first) for the result table
# at index 2 of `global_results`, reporting the mean F1-Score alongside.
# NOTE(review): the section title suggests index 2 holds the Operational-requirements
# results -- confirm against the cell where `global_results` is built.
grouped = global_results[2].groupby("Model")
mean_scores = grouped[["Accuracy", "F1-Score"]].mean()
ranked_scores = mean_scores.sort_values(by="Accuracy", ascending=False)
print(ranked_scores, "\n\n")

# Debug aid (disabled): dump every row belonging to each model's group.
# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
BernoulliNB                        0.87      0.86
RidgeClassifier                    0.86      0.85
LogisticRegression                 0.86      0.86
LinearDiscriminantAnalysis         0.86      0.85
NearestCentroid                    0.86      0.86
SVC                                0.86      0.83
RidgeClassifierCV                  0.86      0.85
SGDClassifier                      0.85      0.84
Perceptron                         0.85      0.85
AdaBoostClassifier                 0.84      0.83
XGBClassifier                      0.84      0.82
RandomForestClassifier             0.84      0.83
CategoricalNB                      0.83      0.80
CalibratedClassifierCV             0.83      0.80
QuadraticDiscriminantAnalysis      0.83      0.83
LabelPropagation                   0.83      0.81
LabelSpreading                     0.83      0.81
PassiveAggressiveClassifier        0.83      0.83


# Performance requirements

In [38]:
# Rank the models by their mean Accuracy (highest first) for the result table
# at index 3 of `global_results`, reporting the mean F1-Score alongside.
# NOTE(review): the section title suggests index 3 holds the Performance-requirements
# results -- confirm against the cell where `global_results` is built.
grouped = global_results[3].groupby("Model")
mean_scores = grouped[["Accuracy", "F1-Score"]].mean()
ranked_scores = mean_scores.sort_values(by="Accuracy", ascending=False)
print(ranked_scores, "\n\n")

# Debug aid (disabled): dump every row belonging to each model's group.
# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
NearestCentroid                    0.92      0.92
BernoulliNB                        0.92      0.91
RidgeClassifierCV                  0.90      0.89
RidgeClassifier                    0.90      0.89
XGBClassifier                      0.90      0.89
LogisticRegression                 0.90      0.89
LGBMClassifier                     0.89      0.88
RandomForestClassifier             0.89      0.89
ExtraTreesClassifier               0.89      0.89
LinearDiscriminantAnalysis         0.89      0.88
SGDClassifier                      0.88      0.88
AdaBoostClassifier                 0.88      0.88
SVC                                0.88      0.86
QuadraticDiscriminantAnalysis      0.87      0.84
KNeighborsClassifier               0.87      0.85
CalibratedClassifierCV             0.87      0.84
CategoricalNB                      0.86      0.82
ExtraTreeClassifier                0.85      0.85
