# Mount drive

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point. force_remount=True asks Colab to
# mount again even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Install dependencies

In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


# Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import gensim.downloader
from lazypredict.Supervised import LazyClassifier

# SEED for reproducibility: seeds Python's and NumPy's global random number
# generators. The same constant is passed as random_state to the KFold
# splitter in the training cell.
import random
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Column holding the requirement text (model input) and column holding the
# class label (prediction target).
X_column = 'RequirementText'
y_column = 'Class'

# NOTE(review): the constant is named CLASSICAL_PATH but points at
# quantum.csv -- confirm that this is the intended dataset.
CLASSICAL_PATH = '/content/drive/MyDrive/Tesi/Master-Thesis/data/processed/quantum.csv'

# index_col=False: do not use any CSV column as the DataFrame index, so the
# frame gets a default 0..n-1 index.
df = pd.read_csv(CLASSICAL_PATH, index_col=False)

In [None]:
# Encode every distinct class label as an integer code, numbered in order of
# first appearance in the label column.
y = df[y_column].map(
    {label: code for code, label in enumerate(pd.unique(df[y_column]))}
)

# Vectorize train set

In [None]:
def getTrainSet(df, vectorization_technique='BoW', max_features=100, text_column=None):
    """Turn the text column of ``df`` into a numeric feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text to vectorize.
    vectorization_technique : {'BoW', 'TFIDF', 'Word2Vec'}, default 'BoW'
        * ``'BoW'``      -- raw term counts from ``CountVectorizer``.
        * ``'TFIDF'``    -- the same counts re-weighted by ``TfidfTransformer``.
        * ``'Word2Vec'`` -- average of the pretrained
          ``word2vec-google-news-300`` vectors of the whitespace-separated
          tokens that the model knows; a text without any known token
          becomes an all-zero row.
    max_features : int, default 100
        Vocabulary size limit handed to ``CountVectorizer`` for ``'BoW'`` and
        ``'TFIDF'``. Not used by ``'Word2Vec'``.
    text_column : str, optional
        Column of ``df`` to read. When omitted, the notebook-level
        ``X_column`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with a default 0..n-1 index. Column labels
        are integers for ``'BoW'``/``'TFIDF'`` and strings for ``'Word2Vec'``.

    Raises
    ------
    ValueError
        If ``vectorization_technique`` is not one of the supported names.
    """
    # Reject unknown techniques before touching the data.
    if vectorization_technique not in ('BoW', 'TFIDF', 'Word2Vec'):
        raise ValueError("Invalid vectorization technique. Choose from 'BoW', 'TFIDF', or 'Word2Vec'.")

    texts = df[X_column if text_column is None else text_column]

    if vectorization_technique in ('BoW', 'TFIDF'):
        counts = CountVectorizer(max_features=max_features).fit_transform(texts)
        if vectorization_technique == 'TFIDF':
            # Keep the matrix sparse through the re-weighting; densify only once.
            counts = TfidfTransformer().fit_transform(counts)
        return pd.DataFrame(counts.toarray())

    # Word2Vec: average the embeddings of the in-vocabulary tokens of each text.
    w2v_pretrained = gensim.downloader.load('word2vec-google-news-300')
    embedding_dim = 300  # dimensionality of the model named above
    rows = []
    for msg in texts:
        vecs = [w2v_pretrained[word] for word in msg.split() if word in w2v_pretrained]
        rows.append(sum(vecs) / len(vecs) if vecs else [0] * embedding_dim)
    vectors = pd.DataFrame(rows)
    vectors.columns = vectors.columns.astype(str)
    return vectors

# Vectorization technique for this run: 'BoW', 'TFIDF' or 'Word2Vec'.
vect_tech = 'BoW'

# NOTE(review): the features are built from the whole of df before the
# cross-validation split in the training cell, so for 'BoW'/'TFIDF' the
# vocabulary (and IDF weights) are fitted on rows that later act as test
# folds -- consider fitting the vectorizer inside each fold.
X = getTrainSet(df, vect_tech)

# Training

In [None]:
# Evaluate every LazyClassifier model with 10-fold cross-validation and gather
# one row per (fold, model) pair in `results`.
cv = KFold(n_splits=10, random_state=SEED, shuffle=True)
foldcounter = 1
# Rows are accumulated in a plain list and turned into a DataFrame once:
# DataFrame.append no longer exists in pandas 2.x and re-copies the frame on
# every call in older versions.
fold_records = []

for train_index, test_index in cv.split(X, y):
    print("Processing Fold "+ str(foldcounter) + " ...")
    # The splitter yields positional indices, so select rows by position.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    # `models` is indexed by model name; read the scores by column label
    # rather than by position.
    for model_name, scores in models.iterrows():
        fold_records.append({
            "Fold": foldcounter,
            "Model": model_name,
            "Accuracy": round(scores["Accuracy"], 3),
            "F1-Score": round(scores["F1 Score"], 3),
        })
    foldcounter += 1

results = pd.DataFrame(fold_records, columns=["Fold","Model","Accuracy","F1-Score"])

Processing Fold 1 ...


100%|██████████| 29/29 [00:02<00:00, 10.37it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 21
[LightGBM] [Info] Start training from score -1.701608
[LightGBM] [Info] Start training from score -1.418246
[LightGBM] [Info] Start training from score -1.034664
[LightGBM] [Info] Start training from score -1.513556





Processing Fold 2 ...


100%|██████████| 29/29 [00:02<00:00, 12.23it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 20
[LightGBM] [Info] Start training from score -1.667707
[LightGBM] [Info] Start training from score -1.431318
[LightGBM] [Info] Start training from score -1.034664
[LightGBM] [Info] Start training from score -1.527945
Processing Fold 3 ...



100%|██████████| 29/29 [00:01<00:00, 17.72it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 23
[LightGBM] [Info] Start training from score -1.667707
[LightGBM] [Info] Start training from score -1.499372
[LightGBM] [Info] Start training from score -1.008461
[LightGBM] [Info] Start training from score -1.499372





Processing Fold 4 ...


100%|██████████| 29/29 [00:01<00:00, 17.86it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 318, number of used features: 22
[LightGBM] [Info] Start training from score -1.634917
[LightGBM] [Info] Start training from score -1.457986
[LightGBM] [Info] Start training from score -1.043553
[LightGBM] [Info] Start training from score -1.513556
Processing Fold 5 ...



100%|██████████| 29/29 [00:01<00:00, 17.05it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] Start training from score -1.638057
[LightGBM] [Info] Start training from score -1.383164
[LightGBM] [Info] Start training from score -1.037803
[LightGBM] [Info] Start training from score -1.606308
Processing Fold 6 ...



100%|██████████| 29/29 [00:01<00:00, 18.07it/s]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 22
[LightGBM] [Info] Start training from score -1.622056
[LightGBM] [Info] Start training from score -1.434458
[LightGBM] [Info] Start training from score -1.046692
[LightGBM] [Info] Start training from score -1.545683
Processing Fold 7 ...


100%|██████████| 29/29 [00:01<00:00, 17.96it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] Start training from score -1.739839
[LightGBM] [Info] Start training from score -1.395743
[LightGBM] [Info] Start training from score -1.055661
[LightGBM] [Info] Start training from score -1.474732





Processing Fold 8 ...


100%|██████████| 29/29 [00:02<00:00, 12.60it/s]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] Start training from score -1.670847
[LightGBM] [Info] Start training from score -1.408482
[LightGBM] [Info] Start training from score -1.055661
[LightGBM] [Info] Start training from score -1.516696
Processing Fold 9 ...


100%|██████████| 29/29 [00:02<00:00, 10.48it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 21
[LightGBM] [Info] Start training from score -1.739839
[LightGBM] [Info] Start training from score -1.395743
[LightGBM] [Info] Start training from score -1.028993
[LightGBM] [Info] Start training from score -1.516696





Processing Fold 10 ...


100%|██████████| 29/29 [00:01<00:00, 17.65it/s]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 319, number of used features: 20
[LightGBM] [Info] Start training from score -1.722140
[LightGBM] [Info] Start training from score -1.447703
[LightGBM] [Info] Start training from score -0.986068
[LightGBM] [Info] Start training from score -1.545683


# Results

In [None]:
# Mean accuracy and F1 of each model across all folds, best accuracy first.
grouped = results.groupby("Model")
print(grouped[["Accuracy", "F1-Score"]].mean().sort_values(by="Accuracy", ascending=False), "\n\n")

# Per-model breakdown: the fold-by-fold rows of every model. Iterating the
# groupby already yields each group's frame, so no second lookup is needed.
for key, item in grouped:
    print(item, "\n\n")

                               Accuracy  F1-Score
Model                                            
NearestCentroid                    0.73      0.73
BernoulliNB                        0.72      0.72
XGBClassifier                      0.72      0.71
SVC                                0.71      0.71
RidgeClassifierCV                  0.71      0.71
NuSVC                              0.71      0.70
ExtraTreesClassifier               0.70      0.70
LinearDiscriminantAnalysis         0.70      0.70
RandomForestClassifier             0.70      0.69
SGDClassifier                      0.69      0.70
RidgeClassifier                    0.69      0.69
Perceptron                         0.69      0.69
CalibratedClassifierCV             0.69      0.68
LogisticRegression                 0.68      0.69
DecisionTreeClassifier             0.67      0.67
CategoricalNB                      0.67      0.65
PassiveAggressiveClassifier        0.66      0.66
BaggingClassifier                  0.65      0.65
