# Mount drive

In [58]:
from google.colab import drive

# Mount point under which Google Drive becomes visible to the Colab runtime.
DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True re-attaches the drive even when it is already mounted.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


# Install dependencies

In [59]:
!pip install lazypredict



# Import libraries

In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import gensim.downloader
from lazypredict.Supervised import LazyClassifier

# Seed Python's and NumPy's global random generators with one shared value
# so that repeated runs draw the same random numbers.
import random

SEED = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

# Load data

In [64]:
# Column holding the requirement text and column holding its class label.
X_column = 'RequirementText'
y_column = 'Class'

# One balanced CSV per requirement category (SE, US, O, PE).
SE_DS_PATH = '/content/drive/MyDrive/Tesi/Master-Thesis/data/processed/balanced/quantum_SE.csv'
US_DS_PATH = '/content/drive/MyDrive/Tesi/Master-Thesis/data/processed/balanced/quantum_US.csv'
O_DS_PATH = '/content/drive/MyDrive/Tesi/Master-Thesis/data/processed/balanced/quantum_O.csv'
PE_DS_PATH = '/content/drive/MyDrive/Tesi/Master-Thesis/data/processed/balanced/quantum_PE.csv'

# NOTE(review): defined but not read anywhere in the visible cells.
CLASSICAL_PATH = '/content/drive/MyDrive/Tesi/Master-Thesis/data/processed/classical.csv'

# index_col=False: never use the first CSV column as the row index, so every
# frame keeps a default 0..n-1 index.
se_df = pd.read_csv(SE_DS_PATH, index_col=False)
us_df = pd.read_csv(US_DS_PATH, index_col=False)
o_df = pd.read_csv(O_DS_PATH, index_col=False)
pe_df = pd.read_csv(PE_DS_PATH, index_col=False)

In [65]:
# Binarise every dataset's label column: 1 for the dataset's own target class,
# 0 for anything else.
# An already-encoded 1 is also treated as positive. Without that, re-running
# this cell would compare the integers 0/1 against the class string and
# silently turn every label into 0.
for _frame, _positive_class in ((se_df, 'SE'), (us_df, 'US'), (o_df, 'O'), (pe_df, 'PE')):
    _frame[y_column] = _frame[y_column].map(
        lambda label, positive=_positive_class: 1 if label == positive or label == 1 else 0
    )

# Vectorize train set

In [66]:
# Pretrained embedding models already loaded in this kernel, keyed by gensim name.
# Loading is slow and memory hungry, so each model is loaded at most once.
_W2V_MODEL_CACHE = {}


def _load_pretrained_embeddings(name='word2vec-google-news-300'):
    """Return the gensim pretrained model `name`, loading it only on first use."""
    if name not in _W2V_MODEL_CACHE:
        _W2V_MODEL_CACHE[name] = gensim.downloader.load(name)
    return _W2V_MODEL_CACHE[name]


def getTrainSet(df, vectorization_technique='BoW'):
    """Turn the requirement texts of `df` into a numeric feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column named by the module-level `X_column`.
    vectorization_technique : {'BoW', 'TFIDF', 'Word2Vec'}, default 'BoW'
        * 'BoW'      - term counts over the 100 most frequent terms.
        * 'TFIDF'    - the same 100-term counts re-weighted with TF-IDF.
        * 'Word2Vec' - average of the pretrained 300-dimensional
          'word2vec-google-news-300' vectors of the whitespace-separated
          tokens found in the model; an all-zero vector when none is found.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df`, one column per feature. Column labels are
        always strings ('0', '1', ...), whatever the technique.

    Raises
    ------
    ValueError
        If `vectorization_technique` is not one of the supported names.

    Notes
    -----
    The vectorizers are fitted on all rows of `df`; any train/test split made
    afterwards therefore shares one vocabulary / IDF weighting.
    """
    if vectorization_technique == 'BoW':
        vectors = CountVectorizer(max_features=100).fit_transform(df[X_column]).toarray()
    elif vectorization_technique == 'TFIDF':
        # TfidfTransformer accepts the sparse count matrix directly.
        counts = CountVectorizer(max_features=100).fit_transform(df[X_column])
        vectors = TfidfTransformer().fit_transform(counts).toarray()
    elif vectorization_technique == 'Word2Vec':
        embedding_dim = 300
        w2v_pretrained = _load_pretrained_embeddings('word2vec-google-news-300')
        vectors = []
        for msg in df[X_column]:
            vecs = [w2v_pretrained[word] for word in msg.split() if word in w2v_pretrained]
            if vecs:
                vec_avg = sum(vecs) / len(vecs)
            else:
                # No known token: fall back to a zero vector of the same dtype
                # as the embeddings so the matrix stays homogeneous.
                vec_avg = np.zeros(embedding_dim, dtype=np.float32)
            vectors.append(vec_avg)
    else:
        raise ValueError("Invalid vectorization technique. Choose from 'BoW', 'TFIDF', or 'Word2Vec'.")

    features = pd.DataFrame(vectors)
    # Uniform string column labels for every technique.
    features.columns = features.columns.astype(str)
    return features

# Vectorization technique applied to all four datasets.
vect_tech = 'BoW'

# One (feature matrix, binary label vector) pair per requirement category.
X_se, y_se = getTrainSet(se_df, vect_tech), se_df[y_column]
X_us, y_us = getTrainSet(us_df, vect_tech), us_df[y_column]
X_o, y_o = getTrainSet(o_df, vect_tech), o_df[y_column]
X_pe, y_pe = getTrainSet(pe_df, vect_tech), pe_df[y_column]

# def getTrainSetBoW(df):
#     countvec = CountVectorizer(max_features=100)
#     bow = countvec.fit_transform(df[X_column]).toarray()
#     training_data = pd.DataFrame(bow)
#     training_data.columns = training_data.columns.astype(str)
#     return training_data

# def getTrainSetTFIDF(df):
#     countvec = CountVectorizer(max_features=100)
#     bow = countvec.fit_transform(df[X_column]).toarray()
#     tfidfconverter = TfidfTransformer()
#     X = tfidfconverter.fit_transform(bow).toarray()
#     training_data = pd.DataFrame(X)
#     training_data.columns = training_data.columns.astype(str)
#     return training_data

# def getTrainSetWord2Vec(df):
#     w2v_pretrained = gensim.downloader.load('word2vec-google-news-300')
#     training_data = []
#     for msg in df[X_column]:
#         words = msg.split()
#         vecs = []
#         for word in words:
#             if word in w2v_pretrained:
#                 vecs.append(w2v_pretrained[word][:300])
#         if vecs:
#             vec_avg = sum(vecs) / len(vecs)
#         else:
#             vec_avg = [0] * 300
#         training_data.append(vec_avg)

#     training_data = pd.DataFrame(training_data)
#     training_data.columns = training_data.columns.astype(str)
    # return training_data

# Training

In [69]:
# Feature matrices and label vectors, in the order SE, US, O, PE.
all_X = [X_se, X_us, X_o, X_pe]
all_y = [y_se, y_us, y_o, y_pe]

# One DataFrame per dataset (same order as all_X), holding one row per
# (fold, model) with the columns below.
RESULT_COLUMNS = ["Fold", "Model", "Accuracy", "F1-Score"]
global_results = []

for X, y in zip(all_X, all_y):
    # Collect plain records and build the frame once per dataset:
    # DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
    records = []
    fold = KFold(n_splits=10, random_state=SEED, shuffle=True)
    for foldcounter, (train_index, test_index) in enumerate(fold.split(X, y), start=1):
        print("Processing Fold "+ str(foldcounter) + " ...")
        # split() yields positional indices, so select rows by position (.iloc);
        # this stays correct even if X / y do not carry a default 0..n-1 index.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = LazyClassifier(verbose=-100, ignore_warnings=True, custom_metric=None)
        models, predictions = clf.fit(X_train, X_test, y_train, y_test)
        # `models` is indexed by model name; read the scores by column label
        # instead of by position in the row.
        for model_name, scores in models.iterrows():
            records.append({
                "Fold": foldcounter,
                "Model": model_name,
                "Accuracy": round(scores["Accuracy"], 3),
                "F1-Score": round(scores["F1 Score"], 3),
            })
    global_results.append(pd.DataFrame(records, columns=RESULT_COLUMNS))

Processing Fold 1 ...


100%|██████████| 29/29 [00:05<00:00,  5.34it/s]


[LightGBM] [Info] Number of positive: 111, number of negative: 115
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47
[LightGBM] [Info] Number of data points in the train set: 226, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491150 -> initscore=-0.035402
[LightGBM] [Info] Start training from score -0.035402
Processing Fold 2 ...


100%|██████████| 29/29 [00:02<00:00, 13.93it/s]


[LightGBM] [Info] Number of positive: 114, number of negative: 112
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 226, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504425 -> initscore=0.017700
[LightGBM] [Info] Start training from score 0.017700
Processing Fold 3 ...


100%|██████████| 29/29 [00:01<00:00, 19.07it/s]


[LightGBM] [Info] Number of positive: 115, number of negative: 112
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 54
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.506608 -> initscore=0.026433
[LightGBM] [Info] Start training from score 0.026433
Processing Fold 4 ...


100%|██████████| 29/29 [00:01<00:00, 19.20it/s]


[LightGBM] [Info] Number of positive: 112, number of negative: 115
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 44
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493392 -> initscore=-0.026433
[LightGBM] [Info] Start training from score -0.026433
Processing Fold 5 ...


100%|██████████| 29/29 [00:01<00:00, 18.89it/s]


[LightGBM] [Info] Number of positive: 113, number of negative: 114
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497797 -> initscore=-0.008811
[LightGBM] [Info] Start training from score -0.008811
Processing Fold 6 ...


100%|██████████| 29/29 [00:03<00:00,  9.25it/s]


[LightGBM] [Info] Number of positive: 115, number of negative: 112
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.506608 -> initscore=0.026433
[LightGBM] [Info] Start training from score 0.026433
Processing Fold 7 ...


100%|██████████| 29/29 [00:03<00:00,  8.05it/s]


[LightGBM] [Info] Number of positive: 113, number of negative: 114
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 51
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497797 -> initscore=-0.008811
[LightGBM] [Info] Start training from score -0.008811
Processing Fold 8 ...


100%|██████████| 29/29 [00:01<00:00, 16.14it/s]


[LightGBM] [Info] Number of positive: 113, number of negative: 114
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497797 -> initscore=-0.008811
[LightGBM] [Info] Start training from score -0.008811
Processing Fold 9 ...


100%|██████████| 29/29 [00:01<00:00, 18.56it/s]


[LightGBM] [Info] Number of positive: 116, number of negative: 111
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511013 -> initscore=0.044060
[LightGBM] [Info] Start training from score 0.044060
Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 18.56it/s]


[LightGBM] [Info] Number of positive: 112, number of negative: 115
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 51
[LightGBM] [Info] Number of data points in the train set: 227, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493392 -> initscore=-0.026433
[LightGBM] [Info] Start training from score -0.026433
Processing Fold 1 ...


100%|██████████| 29/29 [00:01<00:00, 19.74it/s]


[LightGBM] [Info] Number of positive: 77, number of negative: 76
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503268 -> initscore=0.013072
[LightGBM] [Info] Start training from score 0.013072
Processing Fold 2 ...


100%|██████████| 29/29 [00:01<00:00, 19.46it/s]


[LightGBM] [Info] Number of positive: 73, number of negative: 80
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477124 -> initscore=-0.091567
[LightGBM] [Info] Start training from score -0.091567
Processing Fold 3 ...


100%|██████████| 29/29 [00:01<00:00, 19.95it/s]


[LightGBM] [Info] Number of positive: 79, number of negative: 74
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516340 -> initscore=0.065383
[LightGBM] [Info] Start training from score 0.065383
Processing Fold 4 ...


100%|██████████| 29/29 [00:02<00:00, 12.80it/s]


[LightGBM] [Info] Number of positive: 75, number of negative: 78
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490196 -> initscore=-0.039221
[LightGBM] [Info] Start training from score -0.039221
Processing Fold 5 ...


100%|██████████| 29/29 [00:02<00:00, 10.04it/s]


[LightGBM] [Info] Number of positive: 75, number of negative: 78
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490196 -> initscore=-0.039221
[LightGBM] [Info] Start training from score -0.039221
Processing Fold 6 ...


100%|██████████| 29/29 [00:02<00:00, 11.65it/s]


[LightGBM] [Info] Number of positive: 78, number of negative: 75
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.509804 -> initscore=0.039221
[LightGBM] [Info] Start training from score 0.039221
Processing Fold 7 ...


100%|██████████| 29/29 [00:01<00:00, 17.51it/s]


[LightGBM] [Info] Number of positive: 73, number of negative: 80
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 27
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477124 -> initscore=-0.091567
[LightGBM] [Info] Start training from score -0.091567
Processing Fold 8 ...


100%|██████████| 29/29 [00:01<00:00, 19.60it/s]


[LightGBM] [Info] Number of positive: 79, number of negative: 74
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516340 -> initscore=0.065383
[LightGBM] [Info] Start training from score 0.065383
Processing Fold 9 ...


100%|██████████| 29/29 [00:01<00:00, 20.77it/s]


[LightGBM] [Info] Number of positive: 76, number of negative: 77
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496732 -> initscore=-0.013072
[LightGBM] [Info] Start training from score -0.013072
Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 20.27it/s]


[LightGBM] [Info] Number of positive: 80, number of negative: 73
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.522876 -> initscore=0.091567
[LightGBM] [Info] Start training from score 0.091567
Processing Fold 1 ...


100%|██████████| 29/29 [00:01<00:00, 20.55it/s]


[LightGBM] [Info] Number of positive: 70, number of negative: 68
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17
[LightGBM] [Info] Number of data points in the train set: 138, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.507246 -> initscore=0.028988
[LightGBM] [Info] Start training from score 0.028988
Processing Fold 2 ...


100%|██████████| 29/29 [00:01<00:00, 19.84it/s]


[LightGBM] [Info] Number of positive: 69, number of negative: 69
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17
[LightGBM] [Info] Number of data points in the train set: 138, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Processing Fold 3 ...


100%|██████████| 29/29 [00:01<00:00, 18.56it/s]


[LightGBM] [Info] Number of positive: 69, number of negative: 69
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17
[LightGBM] [Info] Number of data points in the train set: 138, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Processing Fold 4 ...


100%|██████████| 29/29 [00:02<00:00, 10.02it/s]


[LightGBM] [Info] Number of positive: 68, number of negative: 70
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17
[LightGBM] [Info] Number of data points in the train set: 138, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492754 -> initscore=-0.028988
[LightGBM] [Info] Start training from score -0.028988
Processing Fold 5 ...


100%|██████████| 29/29 [00:03<00:00,  9.26it/s]


[LightGBM] [Info] Number of positive: 68, number of negative: 71
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 139, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.489209 -> initscore=-0.043172
[LightGBM] [Info] Start training from score -0.043172
Processing Fold 6 ...


100%|██████████| 29/29 [00:01<00:00, 16.45it/s]


[LightGBM] [Info] Number of positive: 70, number of negative: 69
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17
[LightGBM] [Info] Number of data points in the train set: 139, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503597 -> initscore=0.014389
[LightGBM] [Info] Start training from score 0.014389
Processing Fold 7 ...


100%|██████████| 29/29 [00:01<00:00, 20.12it/s]


[LightGBM] [Info] Number of positive: 70, number of negative: 69
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17
[LightGBM] [Info] Number of data points in the train set: 139, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503597 -> initscore=0.014389
[LightGBM] [Info] Start training from score 0.014389
Processing Fold 8 ...


100%|██████████| 29/29 [00:01<00:00, 19.99it/s]


[LightGBM] [Info] Number of positive: 68, number of negative: 71
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 139, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.489209 -> initscore=-0.043172
[LightGBM] [Info] Start training from score -0.043172
Processing Fold 9 ...


100%|██████████| 29/29 [00:01<00:00, 19.88it/s]


[LightGBM] [Info] Number of positive: 69, number of negative: 70
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10
[LightGBM] [Info] Number of data points in the train set: 139, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496403 -> initscore=-0.014389
[LightGBM] [Info] Start training from score -0.014389
Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 19.45it/s]


[LightGBM] [Info] Number of positive: 72, number of negative: 67
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13
[LightGBM] [Info] Number of data points in the train set: 139, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.517986 -> initscore=0.071973
[LightGBM] [Info] Start training from score 0.071973
Processing Fold 1 ...


100%|██████████| 29/29 [00:01<00:00, 19.89it/s]


[LightGBM] [Info] Number of positive: 56, number of negative: 62
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19
[LightGBM] [Info] Number of data points in the train set: 118, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.474576 -> initscore=-0.101783
[LightGBM] [Info] Start training from score -0.101783
Processing Fold 2 ...


100%|██████████| 29/29 [00:01<00:00, 20.71it/s]


[LightGBM] [Info] Number of positive: 58, number of negative: 60
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19
[LightGBM] [Info] Number of data points in the train set: 118, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491525 -> initscore=-0.033902
[LightGBM] [Info] Start training from score -0.033902
Processing Fold 3 ...


100%|██████████| 29/29 [00:02<00:00, 11.45it/s]


[LightGBM] [Info] Number of positive: 60, number of negative: 59
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504202 -> initscore=0.016807
[LightGBM] [Info] Start training from score 0.016807
Processing Fold 4 ...


100%|██████████| 29/29 [00:03<00:00,  8.03it/s]


[LightGBM] [Info] Number of positive: 60, number of negative: 59
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504202 -> initscore=0.016807
[LightGBM] [Info] Start training from score 0.016807
Processing Fold 5 ...


100%|██████████| 29/29 [00:01<00:00, 16.56it/s]


[LightGBM] [Info] Number of positive: 58, number of negative: 61
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.487395 -> initscore=-0.050431
[LightGBM] [Info] Start training from score -0.050431
Processing Fold 6 ...


100%|██████████| 29/29 [00:01<00:00, 19.86it/s]


[LightGBM] [Info] Number of positive: 59, number of negative: 60
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000074 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495798 -> initscore=-0.016807
[LightGBM] [Info] Start training from score -0.016807
Processing Fold 7 ...


100%|██████████| 29/29 [00:01<00:00, 20.30it/s]


[LightGBM] [Info] Number of positive: 62, number of negative: 57
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.521008 -> initscore=0.084083
[LightGBM] [Info] Start training from score 0.084083
Processing Fold 8 ...


100%|██████████| 29/29 [00:01<00:00, 18.13it/s]


[LightGBM] [Info] Number of positive: 59, number of negative: 60
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495798 -> initscore=-0.016807
[LightGBM] [Info] Start training from score -0.016807
Processing Fold 9 ...


100%|██████████| 29/29 [00:01<00:00, 20.42it/s]


[LightGBM] [Info] Number of positive: 59, number of negative: 60
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495798 -> initscore=-0.016807
[LightGBM] [Info] Start training from score -0.016807
Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 20.51it/s]

[LightGBM] [Info] Number of positive: 63, number of negative: 56
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529412 -> initscore=0.117783
[LightGBM] [Info] Start training from score 0.117783





# Security requirements

In [74]:
# Per-model mean of the fold scores for dataset 0, best accuracy first.
grouped = global_results[0].groupby("Model")
mean_scores = grouped[["Accuracy", "F1-Score"]].mean()
ranked_scores = mean_scores.sort_values(by="Accuracy", ascending=False)
print(ranked_scores, "\n\n")

# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
LinearSVC                          0.87      0.87
ExtraTreesClassifier               0.87      0.87
CategoricalNB                      0.87      0.87
RandomForestClassifier             0.87      0.87
LogisticRegression                 0.87      0.87
BernoulliNB                        0.86      0.86
CalibratedClassifierCV             0.85      0.85
Perceptron                         0.85      0.85
PassiveAggressiveClassifier        0.85      0.85
AdaBoostClassifier                 0.85      0.85
RidgeClassifierCV                  0.85      0.85
RidgeClassifier                    0.84      0.84
LinearDiscriminantAnalysis         0.84      0.84
SGDClassifier                      0.84      0.84
NearestCentroid                    0.84      0.84
NuSVC                              0.83      0.83
XGBClassifier                      0.82      0.82
SVC                                0.81      0.81


# Usability requirements

In [75]:
# Per-model mean of the fold scores for dataset 1, best accuracy first.
grouped = global_results[1].groupby("Model")
mean_scores = grouped[["Accuracy", "F1-Score"]].mean()
ranked_scores = mean_scores.sort_values(by="Accuracy", ascending=False)
print(ranked_scores, "\n\n")

# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
CategoricalNB                      0.82      0.83
Perceptron                         0.82      0.82
SGDClassifier                      0.82      0.81
LogisticRegression                 0.81      0.81
NearestCentroid                    0.81      0.80
RidgeClassifierCV                  0.80      0.80
PassiveAggressiveClassifier        0.79      0.80
BernoulliNB                        0.79      0.79
SVC                                0.78      0.78
NuSVC                              0.78      0.78
CalibratedClassifierCV             0.78      0.78
RidgeClassifier                    0.77      0.77
RandomForestClassifier             0.77      0.77
LinearDiscriminantAnalysis         0.76      0.76
GaussianNB                         0.76      0.76
LinearSVC                          0.76      0.76
QuadraticDiscriminantAnalysis      0.74      0.74
ExtraTreesClassifier               0.74      0.74


# Operational requirements

In [76]:
# Per-model mean of the fold scores for dataset 2, best accuracy first.
grouped = global_results[2].groupby("Model")
mean_scores = grouped[["Accuracy", "F1-Score"]].mean()
ranked_scores = mean_scores.sort_values(by="Accuracy", ascending=False)
print(ranked_scores, "\n\n")

# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
CategoricalNB                      0.87      0.87
NearestCentroid                    0.81      0.81
BernoulliNB                        0.80      0.80
AdaBoostClassifier                 0.79      0.79
RidgeClassifierCV                  0.78      0.78
NuSVC                              0.78      0.78
LogisticRegression                 0.77      0.77
GaussianNB                         0.77      0.76
ExtraTreesClassifier               0.77      0.77
SVC                                0.75      0.75
RandomForestClassifier             0.74      0.74
RidgeClassifier                    0.74      0.73
PassiveAggressiveClassifier        0.73      0.73
CalibratedClassifierCV             0.73      0.73
SGDClassifier                      0.72      0.72
LinearSVC                          0.72      0.71
Perceptron                         0.72      0.71
LinearDiscriminantAnalysis         0.71      0.71


# Performance requirements

In [77]:
# Per-model mean of the fold scores for dataset 3, best accuracy first.
grouped = global_results[3].groupby("Model")
mean_scores = grouped[["Accuracy", "F1-Score"]].mean()
ranked_scores = mean_scores.sort_values(by="Accuracy", ascending=False)
print(ranked_scores, "\n\n")

# for key, item in grouped:
#     print(grouped.get_group(key), "\n\n")

                               Accuracy  F1-Score
Model                                            
ExtraTreesClassifier               0.84      0.84
CategoricalNB                      0.83      0.83
RandomForestClassifier             0.82      0.82
BernoulliNB                        0.82      0.82
NearestCentroid                    0.80      0.80
NuSVC                              0.79      0.79
XGBClassifier                      0.79      0.79
DecisionTreeClassifier             0.78      0.78
SVC                                0.77      0.77
LogisticRegression                 0.77      0.77
RidgeClassifierCV                  0.76      0.76
BaggingClassifier                  0.76      0.75
LGBMClassifier                     0.75      0.74
ExtraTreeClassifier                0.75      0.75
GaussianNB                         0.74      0.74
LinearSVC                          0.73      0.72
Perceptron                         0.73      0.72
PassiveAggressiveClassifier        0.72      0.73
