# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
# To find the best hyperparameter settings for each model we run randomized / grid searches
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import random
import numpy as np

from sklearn import tree
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
import tensorflow as tf

from statistics import mean, stdev
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import classification_report, f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import loguniform as sp_loguniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

RANDOM_STATE = 42

2024-12-15 11:13:43.581418: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-15 11:13:43.582451: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-15 11:13:43.586216: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-15 11:13:43.597279: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734257623.611777  359477 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734257623.61

In [2]:
# Read the three splits stored in the `oversampling` dataset folder.
train_data = pd.read_csv('../../data/ml_datasets/oversampling/train_set.csv')
# Shuffle the training rows once, with a fixed seed, so their on-disk order cannot introduce bias.
train_data = train_data.sample(frac=1, random_state=RANDOM_STATE)
val_data = pd.read_csv('../../data/ml_datasets/oversampling/val_set.csv')
testing_data = pd.read_csv('../../data/ml_datasets/oversampling/test_set.csv')

In [3]:
# Separate the target from the features: pop() removes 'label' from each frame in place.
train_label, val_label, test_label = (
    frame.pop('label') for frame in (train_data, val_data, testing_data)
)

# The *_set names are aliases of the *_data frames (no copy is made).
train_set, val_set, test_set = train_data, val_data, testing_data

# Cast the four `race_season%*` columns to int in every split.
SEASON_COLUMNS = [
    'race_season%autumn',
    'race_season%spring',
    'race_season%summer',
    'race_season%winter',
]
for frame in (train_set, val_set, test_set):
    for column in SEASON_COLUMNS:
        frame[column] = frame[column].astype(int)

N_JOBS = 8       # number of parallel workers handed to the searches below
USER = 'Simone'  # prefix of the result files written by the cells below

In [4]:
# Number of input features (= number of columns). `.shape[1]` works both on the DataFrame and
# on the NumPy array `train_set` is later converted to, and does not fail on an empty frame
# the way `.iloc[0]` does.
N_FEATURES = train_set.shape[1]

In [5]:
# Convert features and labels to NumPy arrays for the estimators below.
# np.asarray (unlike the pandas-only .to_numpy() method) also accepts an ndarray,
# so re-running this cell after the conversion no longer raises AttributeError.
train_set = np.asarray(train_set)
train_label = np.asarray(train_label)

val_set = np.asarray(val_set)
val_label = np.asarray(val_label)

In [6]:
# Fold assignment for PredefinedSplit: -1 marks rows that are only ever used for training,
# 0 marks rows that belong to the single validation fold.
n_train, n_val = len(train_set), len(val_set)
split_index = np.concatenate((-np.ones(n_train), np.zeros(n_val)))

# Training rows first, validation rows second, in the same order as split_index.
X_combined = np.concatenate((train_set, val_set), axis=0)
y_combined = np.concatenate([train_label, val_label])

ps = PredefinedSplit(test_fold=split_index)

In [7]:
# Scorers shared by every model-selection search in this notebook.
def f1_class_scorer(class_index):
    """Build a scikit-learn scorer that reports the F1 score of one class.

    Parameters
    ----------
    class_index : int
        Label of the class to score. The classes in this notebook are labelled
        0 and 1, so the label coincides with the class position.

    Returns
    -------
    callable
        Scorer produced by ``make_scorer`` wrapping the per-class F1 function.
    """
    def score_function(y_true, y_pred):
        # Request the class explicitly through `labels` instead of indexing the
        # per-class array by position: the position depends on which labels
        # happen to occur in y_true / y_pred, so a missing class would make the
        # index point at the wrong class or raise IndexError.
        return f1_score(y_true, y_pred, labels=[class_index], average=None)[0]
    return make_scorer(score_function)

# Scorers for class 0 and class 1
f1_class_0 = f1_class_scorer(0)  # class 0
f1_class_1 = f1_class_scorer(1)  # class 1


scoring={
        'f1_macro': 'f1_macro',   # macro-averaged F1 over both classes
        'f1_0': f1_class_0,  # F1 of class 0 only
        'f1_1': f1_class_1   # F1 of class 1 only
    }

## Decision Tree

In [8]:
# Search space for the decision tree. Lists are sampled uniformly;
# sp_randint(3, N_FEATURES + 1) draws integers in [3, N_FEATURES].
param_dist = {"max_depth": [3, 5, 10, 15, 20, None],
              "max_features": sp_randint(3, N_FEATURES + 1),
              "min_samples_split": [20, 30, 50, 100],
              "min_samples_leaf": [10, 20, 30, 50, 100],
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}]} # class weights are related to over/undersampling chosen
# number of sampled configurations
n_iter_search = 200 # Total-Iteration: 400
# Seed the estimator: with max_features < N_FEATURES the tree picks random feature subsets.
clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
# Randomized search on the predefined train/validation split.
# random_state makes the sampled configurations reproducible;
# refit=False because only cv_results_ is used afterwards.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=ps,
                            random_state=RANDOM_STATE)
# run the search
rand_search.fit(X_combined, y_combined);

In [9]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_decision_tree_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)
# Show the five best-ranked configurations.
df[['mean_test_f1_macro', 'mean_test_f1_0', 'mean_test_f1_1', 'rank_test_f1_macro']].head()

Unnamed: 0,mean_test_f1_macro,mean_test_f1_0,mean_test_f1_1,rank_test_f1_macro
139,0.63391,0.870124,0.397696,1
61,0.632964,0.868205,0.397723,2
103,0.631756,0.867465,0.396046,3
176,0.631488,0.914229,0.348748,4
111,0.631374,0.872311,0.390438,5


## SVM

In [10]:
# Search space: regularisation strength C drawn log-uniformly from [1e-4, 1e2].
param_dist = {"C": sp_loguniform(1e-4, 1e2)}
# number of sampled configurations
n_iter_search = 50 # Total-Iteration: 100
# Linear SVM; seeded so that runs are repeatable.
clf = LinearSVC(random_state=RANDOM_STATE)
# Randomized search on the predefined train/validation split.
# random_state fixes the sampled C values; refit=False because only cv_results_ is used.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=ps,
                            random_state=RANDOM_STATE)
# run the search
rand_search.fit(X_combined, y_combined);

In [11]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_svm_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)

## Naive Bayes

In [12]:
# NOTE(review): this dict is defined here but the search below uses `scoring`, not `scoring_metrics`.
scoring_metrics = {
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

# GaussianNB is evaluated with its default settings: the search space is empty and a
# single iteration is requested. The search wrapper is used only so the model is scored
# on the same split and metrics as the others (comparability), not for model selection.
param_dist = {}
n_iter_search = 1
clf = GaussianNB()
rand_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=1,
    scoring=scoring,
    refit=False,
    cv=ps,
)
# evaluate on the predefined train/validation split
rand_search.fit(X_combined, y_combined);

In [13]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_naive_bayes_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)

## KNN

- Risk: the ordinal mapping of the categorical attributes (without one-hot encoding) may cause problems for K-NN.

In [21]:
# Grid for K-NN. The values below are marked as Simone's; the original cell also kept
# an alternative grid marked as Jacopo's: 'n_neighbors': [5, 15, 25].
param_dist = {
    'n_neighbors': [40, 50], # Simone
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
}

# Columns left out of the K-NN feature matrix.
knn_dropped_columns = [
    'cyclist_age_group',
    'race_season%autumn',
    'race_season%spring',
    'race_season%summer',
    'race_season%winter',
]
tmp_train_set = train_data.drop(columns=knn_dropped_columns).to_numpy()
tmp_val_set = val_data.drop(columns=knn_dropped_columns).to_numpy()

# -1 marks rows used only for training, 0 marks the single validation fold.
split_index_knn = np.concatenate(
    (-np.ones(len(tmp_train_set)), np.zeros(len(tmp_val_set)))
)

X_combined_knn = np.concatenate((tmp_train_set, tmp_val_set), axis=0)
y_combined_knn = np.concatenate([train_label, val_label])

ps_knn = PredefinedSplit(test_fold=split_index_knn)

clf = KNeighborsClassifier()
# Exhaustive grid search; refit=False because only cv_results_ is used afterwards.
rand_search = GridSearchCV(
    clf,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
    cv=ps_knn,
)
# run the grid search
rand_search.fit(X_combined_knn, y_combined_knn);

In [22]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_knn_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)

## Random Forest

In [23]:
# Seed the forest: its bootstrap samples and feature subsets are otherwise different on every run.
clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Search space. Lists are sampled uniformly;
# sp_randint(3, N_FEATURES + 1) draws integers in [3, N_FEATURES].
param_dist = {"max_depth": [5, 10, 20, None],
              "max_features": sp_randint(3, N_FEATURES + 1),
              "min_samples_split": [20, 50, 100],
              "min_samples_leaf": [10, 30, 50, 100],
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
              "n_estimators": [50, 100, 150]}

n_iter_search = 50 # Total-Iteration: 100
# Randomized search on the predefined train/validation split.
# random_state fixes the sampled configurations; refit=False because only cv_results_ is used.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=ps,
                            random_state=RANDOM_STATE)
rand_search.fit(X_combined, y_combined);

In [24]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_random_forest_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)

## XGBoost

In [25]:
# Grid for XGBoost. The n_estimators values below are marked as Simone's; the original cell
# also kept an alternative marked as Jacopo's: "n_estimators": [25, 50, 100].
param_dist = dict(
    n_estimators=[250, 500],  # Simone
    max_depth=[2, 3, 4, 5],
    learning_rate=[1, 0.1, 0.01, 0.001, 0.0001],
)
clf = XGBClassifier()

# Exhaustive grid search on the predefined train/validation split;
# refit=False because only cv_results_ is used afterwards.
rand_search = GridSearchCV(
    clf,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
    cv=ps,
)
rand_search.fit(X_combined, y_combined);

In [26]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_xgb_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)

## AdaBoost

In [27]:
# Seed the ensemble so that repeated runs of this cell give the same scores
# (the seed is propagated to the base estimators).
clf = AdaBoostClassifier(random_state=RANDOM_STATE)
# Grid for AdaBoost. The n_estimators values below are marked as Simone's; the original cell
# also kept an alternative marked as Jacopo's: "n_estimators": [25, 50, 100].
param_dist = {
    "n_estimators": [250, 500], # Simone
    "learning_rate": [1, 0.1, 0.01, 0.001, 0.0001],
    "algorithm": ['SAMME']
}

# Exhaustive grid search on the predefined train/validation split;
# refit=False because only cv_results_ is used afterwards.
rand_search = GridSearchCV(clf, param_grid=param_dist,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=ps)
rand_search.fit(X_combined, y_combined);

In [28]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_ada_boost_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)

## Neural Network

In [None]:
class MyHyperModel(keras_tuner.HyperModel):
    """Small feed-forward binary classifier whose hyper-parameters are passed in explicitly."""

    def build(self, hp, units, dropout_rate, learning_rate):
        """Create and compile the network.

        Args:
            hp: hyper-parameter container; accepted but not read by this method.
            units: width of the first hidden layer (the second one uses ``units // 2``).
            dropout_rate: rate of the dropout layer placed after the first hidden layer.
            learning_rate: learning rate given to the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential([
            keras.layers.Dense(units, activation='relu'),
            keras.layers.Dropout(rate=dropout_rate),
            keras.layers.Dense(units//2, activation='relu'),
            keras.layers.Dense(1, activation='sigmoid'),
        ])

        # Optimizer configured with the chosen learning rate.
        adam = keras.optimizers.Adam(learning_rate=learning_rate)
        # NOTE(review): with a single sigmoid output this metric presumably tracks the
        # positive-class F1 only, despite its "f1_macro" name - confirm.
        f1_metric = keras.metrics.F1Score(average='macro', threshold=0.5, name="f1_macro", dtype=None)
        model.compile(optimizer=adam, loss='binary_crossentropy', metrics=[f1_metric])

        return model

    def fit(self, hp, model, x, y, epochs, batch_size, **kwargs):
        """Train ``model`` silently on ``(x, y)`` and return what ``model.fit`` returns.

        ``hp`` is accepted but not read; extra keyword arguments are forwarded to ``model.fit``.
        """
        training_options = dict(
            x=x,
            y=y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=False,
        )
        return model.fit(**training_options, **kwargs)

In [None]:
# Manual random search over the network hyper-parameters: each round samples one
# configuration, trains on the training set and scores it on the validation set.
rounds = 50
config_results = []

# Seed Python's `random`, NumPy and the Keras backend in one call, so that both the
# sampled configurations and the weight initialisation are reproducible.
keras.utils.set_random_seed(RANDOM_STATE)

# Loop-invariant objects, built once.
hyper_ae = MyHyperModel()
y_train = train_label.reshape(-1, 1)  # column vector, matching the single sigmoid output
y_val = val_label.reshape(-1, 1)

for _ in range(rounds):
    hp = HyperParameters()
    batch_size = hp.Fixed("batch_size", random.choice([256, 512, 1024]))
    epochs = hp.Fixed("epochs", random.choice([10, 20, 30]))
    units_layer1 = hp.Fixed('units_layer1', random.choice([32, 64, 128, 256]))
    # Explicit literals instead of np.arange(0., 0.9, 0.2), which yields 0.6000000000000001
    # and leaks that float noise into the logs and the results file.
    drop_rate = hp.Fixed('rate', random.choice([0.0, 0.2, 0.4, 0.6, 0.8]))
    # Alternative range marked as Jacopo's in the original cell: np.logspace(-5, -3.5, num=10)
    learning_rate = hp.Fixed("learning_rate", float(random.choice(np.logspace(-3.5, -2, num=10)))) # Simone

    print(f"Training with batch_size={batch_size}, epochs={epochs}, units_layer1={units_layer1}, drop_rate={drop_rate}, learning_rate={learning_rate}")

    model = hyper_ae.build(hp, units_layer1, drop_rate, learning_rate)

    # Named `history`, not `metrics`, so the imported `sklearn.metrics` module is not shadowed.
    history = hyper_ae.fit(hp, model, train_set, y_train, epochs=epochs, batch_size=batch_size)

    # Threshold the sigmoid outputs at 0.5 to obtain hard class predictions.
    val_out = model.predict(val_set, verbose=False)
    val_out = (val_out >= 0.5).astype(int)
    # Per-class F1 computed directly with f1_score (no access to the scorers' private
    # `_score_func`); `labels` pins the order of the returned values to class 0, class 1.
    f1_0, f1_1 = f1_score(y_val.ravel(), val_out.ravel(), labels=[0, 1], average=None)
    f1_macro = (f1_0 + f1_1)/2

    config = {
        "batch_size": batch_size,
        "epochs": epochs,
        "units_layer1": units_layer1,
        "units_layer2": units_layer1//2,
        "drop_rate": drop_rate,
        "learning_rate": learning_rate,
        "f1_macro": f1_macro,
        "f1_0": f1_0,
        "f1_1": f1_1,
    }

    config_results.append(config)

# Best configuration first, then persist the table.
df = pd.DataFrame(config_results).sort_values(by='f1_macro', ascending=False)
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_nn_results.csv', index=False)
    

Training with batch_size=512, epochs=10, units_layer1=128, drop_rate=0.2, learning_rate=0.0006812920690579615


W0000 00:00:1734175418.308142     692 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Training with batch_size=256, epochs=10, units_layer1=256, drop_rate=0.6000000000000001, learning_rate=0.0006812920690579615
Training with batch_size=1024, epochs=20, units_layer1=256, drop_rate=0.0, learning_rate=0.001
Training with batch_size=1024, epochs=20, units_layer1=256, drop_rate=0.0, learning_rate=0.0031622776601683794
Training with batch_size=1024, epochs=10, units_layer1=64, drop_rate=0.6000000000000001, learning_rate=0.004641588833612777
Training with batch_size=256, epochs=10, units_layer1=64, drop_rate=0.6000000000000001, learning_rate=0.001
Training with batch_size=256, epochs=10, units_layer1=256, drop_rate=0.8, learning_rate=0.0031622776601683794
Training with batch_size=512, epochs=20, units_layer1=64, drop_rate=0.6000000000000001, learning_rate=0.0014677992676220691
Training with batch_size=512, epochs=10, units_layer1=128, drop_rate=0.6000000000000001, learning_rate=0.00046415888336127773
Training with batch_size=256, epochs=10, units_layer1=256, drop_rate=0.8, lea

## Rule-Based

In [None]:
# Search space for RIPPER.
param_dist = {
    'prune_size': sp_uniform(0.1, 0.4),  # uniform on [0.1, 0.5] (loc=0.1, scale=0.4)
    'k': sp_randint(1, 11)               # integers from 1 to 10
}

# number of sampled configurations
n_iter_search = 20
# Rule learner; seeded so that repeated runs are reproducible.
clf = lw.RIPPER(
    verbosity=2,         # Detailed logging for debugging
    max_rules=10,        # Moderate rule complexity
    max_rule_conds=7,    # Enough room for moderately complex conditions
    max_total_conds=35,  # Cap total conditions to avoid runaway complexity
    random_state=RANDOM_STATE
)
# Randomized search on the predefined train/validation split.
# random_state fixes the sampled configurations; refit=False because only cv_results_ is used.
rand_search = RandomizedSearchCV(estimator=clf, param_distributions=param_dist, n_iter=n_iter_search,
                                 scoring=scoring,
                                 refit=False,
                                 n_jobs=N_JOBS,
                                 cv=ps,
                                 random_state=RANDOM_STATE)
# run the search
rand_search.fit(X_combined, y_combined);

discretizing 9 features: [0, 1, 3, 4, 5, 6, 7, 8, 9]

growing ruleset...
initial model: []

pos_growset 125797 pos_pruneset 103093
neg_growset 125797 neg_pruneset 103093
grew rule: [6=<0.29^4=0.27-0.38^7=<165.85^3=0.02-0.024^0=160.0-168.6^1=<19.54^11=0.0]
pruned rule: [6=<0.29^4=0.27-0.38^7=<165.85^3=0.02-0.024^0=160.0-168.6^1=<19.54]
updated ruleset: [[6=<0.29^4=0.27-0.38^7=<165.85^3=0.02-0.024^0=160.0-168.6^1=<19.54]]

pos_growset 125726 pos_pruneset 103034
neg_growset 125795 neg_pruneset 103091
grew rule: [6=<0.29^4=>0.38^2=1.0^12=0.0^3=>0.024^7=<165.85]
pruned rule: [6=<0.29^4=>0.38^2=1.0^12=0.0^3=>0.024]
updated ruleset: [[6=<0.29^4=0.27-0.38^7=<165.85^3=0.02-0.024^0=160.0-168.6^1=<19.54] V [6=<0.29^4=>0.38^2=1.0^12=0.0^3=>0.024]]


GREW INITIAL RULESET:
[[6=<0.29 ^ 4=0.27-0.38 ^ 7=<165.85 ^ 3=0.02-0.024 ^ 0=160.0-168.6 ^ 1=<19.54] V
[6=<0.29 ^ 4=>0.38 ^ 2=1.0 ^ 12=0.0 ^ 3=>0.024]]

optimization run 1 of 3
optimizing ruleset...

grew rule: [6=<0.29^4=>0.38^2=1.0^5=0.151-0.244]
gre

In [11]:
# Order the evaluated configurations by their macro-F1 rank (rank 1 first) and persist the table.
results_path = f'../../data/ml_datasets/oversampling/model_selection/{USER}_rule_based_results.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(results_path, index=False)