# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [4]:
#to find the best set of parameter setting, we can run a grid search
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import random
import numpy as np

from sklearn import tree
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
import tensorflow as tf

import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from statistics import mean, stdev
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import loguniform as sp_loguniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

import os
import sys
sys.path.append(os.path.abspath('../../src'))
from utils import *

In [5]:
# Seed reused below for shuffling, undersampling and CV splitting.
RANDOM_STATE = 42

# Development split: rows are shuffled once (seeded) so that the on-disk
# ordering does not introduce bias.
dev_data = pd.read_csv('../../data/ml_datasets/undersampling/dev_set.csv')
dev_data = dev_data.sample(frac=1, random_state=RANDOM_STATE)

# Test split, kept in the order it was stored.
testing_data = pd.read_csv('../../data/ml_datasets/undersampling/test_set.csv')

In [6]:
# Detach the target column; `pop` removes it from the loaded frames in place.
dev_label = dev_data.pop('label')
test_label = testing_data.pop('label')

# The feature frames are aliases of the loaded frames (no copy is made).
dev_set = dev_data
test_set = testing_data

# Cast the season indicator columns to int, first in the development set and
# then in the test set.
for frame in (dev_set, test_set):
    for column in ('race_season%autumn',
                   'race_season%spring',
                   'race_season%summer',
                   'race_season%winter'):
        frame[column] = frame[column].astype(int)

# Run-wide settings used by every model-selection cell below.
N_JOBS = 8        # parallel workers handed to the search objects
USER = 'Jacopo'   # prefix of the result CSV file names
RUS = RandomUnderSampler(random_state=RANDOM_STATE)  # seeded undersampler used in each pipeline

In [7]:
# Scorers shared by every search in this notebook.
def f1_class_scorer(class_index):
    """Build a scorer that reports the F1 of a single class.

    Args:
        class_index: position of the wanted class in the per-class F1 array
            returned by ``f1_score(..., average=None)``.

    Returns:
        The object produced by ``make_scorer`` around the per-class F1 function.
    """
    def _f1_for_class(y_true, y_pred):
        # Compute the F1 of every class, then keep only the requested one.
        per_class = f1_score(y_true, y_pred, average=None)
        return per_class[class_index]

    return make_scorer(_f1_for_class)


# Scorers for class 0 and class 1
f1_class_0 = f1_class_scorer(0)
f1_class_1 = f1_class_scorer(1)

# Metrics recorded for every candidate configuration.
scoring = {
    'f1_macro': 'f1_macro',  # macro-averaged F1 over both classes
    'f1_0': f1_class_0,      # F1 of class 0 only
    'f1_1': f1_class_1,      # F1 of class 1 only
}

## Decision Tree

In [9]:
# Search space for the decision tree. Keys carry the `classifier__` prefix so
# they are routed to the 'classifier' step of the pipeline.
param_dist = {
    "classifier__max_depth": [3, 5, 10, 15, 20, None],
    # sp_randint excludes its upper bound, so this draws from 3 up to the
    # number of feature columns.
    "classifier__max_features": sp_randint(3, dev_set.shape[1] + 1),
    "classifier__min_samples_split": [20, 30, 50, 100],
    "classifier__min_samples_leaf": [10, 20, 30, 50, 100],
    "classifier__criterion": ["entropy", "gini"],
    # class weights are related to the over/undersampling chosen
    "classifier__class_weight": ['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
}

# Number of candidate configurations sampled by the randomized search.
# NOTE(review): the original comment reads "Total-Iteration: 400" while only a
# single candidate is drawn here — confirm the intended budget.
n_iter_search = 1

# Seed the tree: with `max_features` below the feature count the split search
# is randomized, so an unseeded tree gives different CV scores for the same
# configuration on every run. RUS and the CV splitter are already seeded.
clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)

pipeline = ImbPipeline([
    ('undersampler', RUS),  # undersampling step
    ('classifier', clf)     # the classifier
], verbose=False)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Randomized search; refit=False because only the CV scores are needed.
rand_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=skf)
# Run the search (trailing `;` suppresses the cell output).
rand_search.fit(dev_set, dev_label);

In [10]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_decision_tree_results.csv',
    index=False,
)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__class_weight,param_classifier__criterion,param_classifier__max_depth,param_classifier__max_features,param_classifier__min_samples_leaf,param_classifier__min_samples_split,...,std_test_f1_0,rank_test_f1_0,split0_test_f1_1,split1_test_f1_1,split2_test_f1_1,split3_test_f1_1,split4_test_f1_1,mean_test_f1_1,std_test_f1_1,rank_test_f1_1
0,0.724355,0.037454,0.05451,0.004465,balanced,entropy,20,4,10,100,...,0.003437,1,0.380516,0.387482,0.383635,0.386537,0.381024,0.383839,0.002813,1


## SVM

In [26]:
# Linear SVM: only the regularisation strength C is tuned, drawn log-uniformly.
param_dist = {
    "classifier__C": sp_loguniform(1e-4, 1e2),
}

# Candidates sampled in this run (original note: "Total-Iteration: 100").
n_iter_search = 50

clf = LinearSVC()

# Undersample first, then fit the classifier.
pipeline = ImbPipeline(
    steps=[
        ('undersampler', RUS),
        ('classifier', clf),
    ],
    verbose=False,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Randomized search; no refit, only the per-fold scores are kept.
rand_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
    cv=skf,
)
rand_search.fit(dev_set, dev_label);

In [None]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_svm_results.csv',
    index=False,
)

## Naive Bayes

In [12]:
# NOTE(review): `scoring_metrics` is not referenced by the search below, which
# uses the shared `scoring` dict — confirm whether a later cell needs it.
scoring_metrics = {
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

# GaussianNB has nothing to tune here: an empty space with a single iteration.
param_dist = {}
n_iter_search = 1

clf = GaussianNB()
pipeline = ImbPipeline(
    steps=[
        ('undersampler', RUS),  # undersampling step
        ('classifier', clf),    # the classifier
    ],
    verbose=False,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# The search object is used only to compare scores across the folds,
# not for model selection.
rand_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
    cv=skf,
)
rand_search.fit(dev_set, dev_label);

In [None]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_naive_bayes_results.csv',
    index=False,
)

## KNN

- Rimuoviamo gli attributi categorici anche se codificati (potrebbero peggiorare i risultati del K-NN)

In [28]:
# Exhaustive grid for K-NN. The neighbour counts are split between authors.
param_dist = {
    'classifier__n_neighbors': [5, 15, 25],  # Jacopo
    # 'classifier__n_neighbors': [40, 50],   # Simone
    'classifier__algorithm': ['ball_tree', 'kd_tree', 'brute'],
}

# K-NN is fitted without the categorical attributes, even though they are encoded.
categorical_columns = [
    'cyclist_age_group',
    'race_season%autumn',
    'race_season%spring',
    'race_season%summer',
    'race_season%winter',
]
tmp_dev_set = dev_set.drop(columns=categorical_columns)

clf = KNeighborsClassifier()
pipeline = ImbPipeline(
    steps=[
        ('undersampler', RUS),  # undersampling step
        ('classifier', clf),    # the classifier
    ],
    verbose=False,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# This is a full grid search; the `rand_search` name is the one the
# results cell reads.
rand_search = GridSearchCV(
    pipeline,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
    cv=skf,
)
rand_search.fit(tmp_dev_set, dev_label);

In [None]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_knn_results.csv',
    index=False,
)

## Random Forest

In [15]:
# Seed the forest: an unseeded RandomForestClassifier gives different CV
# scores for the same configuration on every run. RUS and the CV splitter
# are already seeded.
clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Search space; keys carry the `classifier__` prefix so they are routed to the
# 'classifier' step of the pipeline.
param_dist = {
    "classifier__max_depth": [5, 10, 20, None],
    # sp_randint excludes its upper bound, so this draws from 3 up to the
    # number of feature columns.
    "classifier__max_features": sp_randint(3, dev_set.shape[1] + 1),
    "classifier__min_samples_split": [20, 50, 100],
    "classifier__min_samples_leaf": [10, 30, 50, 100],
    "classifier__criterion": ["entropy", "gini"],
    "classifier__class_weight": ['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
    "classifier__n_estimators": [50, 100, 150],
}

# Candidates sampled in this run (original note: "Total-Iteration: 100").
n_iter_search = 50
pipeline = ImbPipeline([
    ('undersampler', RUS),  # undersampling step
    ('classifier', clf)     # the classifier
], verbose=False)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Randomized search; refit=False because only the CV scores are needed.
rand_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=skf)
rand_search.fit(dev_set, dev_label);

In [None]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_random_forest_results.csv',
    index=False,
)

## XGBoost

In [None]:
# Exhaustive grid for XGBoost. The estimator counts are split between authors.
param_dist = {
    "classifier__n_estimators": [25, 50, 100],  # Jacopo
    # "classifier__n_estimators": [250, 500],   # Simone
    "classifier__max_depth": [2, 3, 4, 5],
    "classifier__learning_rate": [1, 0.1, 0.01, 0.001, 0.0001],
}

clf = XGBClassifier()
pipeline = ImbPipeline(
    steps=[
        ('undersampler', RUS),  # undersampling step
        ('classifier', clf),    # the classifier
    ],
    verbose=False,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# This is a full grid search; the `rand_search` name is the one the
# results cell reads.
rand_search = GridSearchCV(
    pipeline,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
    cv=skf,
)
rand_search.fit(dev_set, dev_label);

In [None]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_xgb_results.csv',
    index=False,
)

## AdaBoost

In [18]:
# Exhaustive grid for AdaBoost. The estimator counts are split between authors.
param_dist = {
    "classifier__n_estimators": [25, 50, 100],  # Jacopo
    # "classifier__n_estimators": [250, 500],   # Simone
    "classifier__learning_rate": [1, 0.1, 0.01, 0.001, 0.0001],
    "classifier__algorithm": ['SAMME'],
}

clf = AdaBoostClassifier()
pipeline = ImbPipeline(
    steps=[
        ('undersampler', RUS),  # undersampling step
        ('classifier', clf),    # the classifier
    ],
    verbose=False,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# This is a full grid search; the `rand_search` name is the one the
# results cell reads.
rand_search = GridSearchCV(
    pipeline,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
    cv=skf,
)
rand_search.fit(dev_set, dev_label);

In [None]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_ada_boost_results.csv',
    index=False,
)

## Neural Network

In [14]:
class MyHyperModel(keras_tuner.HyperModel):
    """Small feed-forward binary classifier used by the manual random search.

    The network is Dense(units, relu) -> Dropout -> Dense(units // 2, relu)
    -> Dense(1, sigmoid), compiled with Adam and binary cross-entropy.
    """

    def build(self, hp, units, dropout_rate, learning_rate):
        """Create and compile a fresh, untrained model.

        Args:
            hp: hyperparameter container; accepted for interface
                compatibility but not read in this method.
            units: width of the first hidden layer; the second hidden layer
                uses ``units // 2``.
            dropout_rate: rate of the dropout layer placed after the first
                hidden layer.
            learning_rate: learning rate of the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential()

        model.add(keras.layers.Dense(
            units,
            activation='relu'))
        model.add(keras.layers.Dropout(rate=dropout_rate))
        model.add(keras.layers.Dense(
            units//2,
            activation='relu'))
        model.add(keras.layers.Dense(1, activation='sigmoid'))

        # Configure the optimizer with the chosen learning rate
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        #f1 = keras.metrics.F1Score(average='macro', threshold=0.5, name="f1_macro", dtype=None)
        model.compile(
            optimizer=optimizer,
            loss='binary_crossentropy')

        return model

    def fit(self, hp, model, x, y, epochs, batch_size, **kwargs):
        """Train ``model`` silently and return what ``model.fit`` returns.

        Args:
            hp: hyperparameter container; not read in this method.
            model: compiled model to train.
            x: training inputs.
            y: training targets.
            epochs: number of training epochs.
            batch_size: mini-batch size.
            **kwargs: forwarded unchanged to ``model.fit``.
        """
        return model.fit(
            x=x,
            y=y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=False,
            **kwargs,
        )

In [15]:
# Cross-validation inputs as NumPy arrays, plus the seeded stratified splitter
dev_x = dev_set.to_numpy()
dev_y = dev_label.to_numpy()
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


rounds = 1  # number of random configurations to evaluate
config_results = []

for _ in range(rounds):
    hp = HyperParameters()
    hyper_ae = MyHyperModel()
    # Draw one random configuration; hp.Fixed registers the value and returns it.
    batch_size = hp.Fixed("batch_size", random.choice([256, 512, 1024]))
    epochs = hp.Fixed("epochs", random.choice([1]))
    units_layer1 = hp.Fixed('units_layer1', random.choice([32, 64, 128, 256]))
    drop_rate = hp.Fixed('rate', random.choice(np.arange(0., 0.9, 0.2)))
    learning_rate = hp.Fixed("learning_rate", random.choice(np.logspace(-5, -3.5, num=10))) # Jacopo
    #learning_rate = hp.Fixed("learning_rate", random.choice(np.logspace(-3.5, -2, num=10))) Simone

    print(f"Training with batch_size={batch_size}, epochs={epochs}, units_layer1={units_layer1}, drop_rate={drop_rate}, learning_rate={learning_rate}")

    f1_macro_scores = []
    f1_0_scores = []
    f1_1_scores = []

    for train_index, val_index in skf.split(dev_x, dev_y):
        x_train, x_val = dev_x[train_index], dev_x[val_index]
        y_train, y_val = dev_y[train_index], dev_y[val_index]
        y_train = y_train.reshape(-1, 1)

        # Undersample the training fold only; the validation fold is left untouched.
        x_train, y_train = RUS.fit_resample(x_train, y_train)

        # Build a fresh model for every fold. Reusing one model across folds
        # keeps the weights learned on earlier folds, so each later fold
        # would be validated on samples the network has already trained on.
        model = hyper_ae.build(hp, units_layer1, drop_rate, learning_rate)

        # Named `history` so the imported `sklearn.metrics` module is not shadowed.
        history = hyper_ae.fit(hp, model, x_train, y_train, epochs=epochs, batch_size=batch_size)

        # Threshold the sigmoid outputs at 0.5 and flatten to 1-D labels.
        val_out = model.predict(x_val, verbose=False)
        val_out = (val_out >= 0.5).astype(int).ravel()

        # One call gives the F1 of each class; their mean is the macro F1.
        per_class_f1 = f1_score(y_val, val_out, average=None)
        f1_0_scores.append(per_class_f1[0])
        f1_1_scores.append(per_class_f1[1])
        f1_macro_scores.append((f1_0_scores[-1] + f1_1_scores[-1])/2)

    mean_f1_macro, std_f1_macro = mean(f1_macro_scores), stdev(f1_macro_scores)
    mean_f1_0, std_f1_0 = mean(f1_0_scores), stdev(f1_0_scores)
    mean_f1_1, std_f1_1 = mean(f1_1_scores), stdev(f1_1_scores)
    config = {
        "batch_size": batch_size,
        "epochs": epochs,
        "units_layer1": units_layer1,
        "units_layer2": units_layer1//2,
        "drop_rate": drop_rate,
        "learning_rate": learning_rate,
        "mean_f1_macro": mean_f1_macro,
        "std_f1_macro": std_f1_macro,
        "mean_f1_0": mean_f1_0,
        "std_f1_0": std_f1_0,
        "mean_f1_1": mean_f1_1,
        "std_f1_1": std_f1_1
    }

    config_results.append(config)

# One row per configuration, best mean macro-F1 first.
df = pd.DataFrame(config_results)
df = df.sort_values(by='mean_f1_macro', ascending=False)
df.to_csv(f'../../data/ml_datasets/undersampling/model_selection/{USER}_nn_results.csv', index=False)
    

Training with batch_size=256, epochs=1, units_layer1=64, drop_rate=0.8, learning_rate=0.00021544346900318823


I0000 00:00:1734276937.477647  143299 service.cc:148] XLA service 0x7f0150009800 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734276937.479768  143299 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2024-12-15 16:35:37.565502: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1734276937.821863  143299 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1734276939.066499  143299 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


## Rule-Based

In [None]:
# Search space for RIPPER; keys carry the `classifier__` prefix so they are
# routed to the 'classifier' step of the pipeline.
param_dist = {
    'classifier__prune_size': sp_uniform(0.1, 0.4),  # uniform on [0.1, 0.5] (loc=0.1, scale=0.4)
    'classifier__k': sp_randint(1, 11)               # integers from 1 to 10
}

# Number of candidate configurations sampled by the randomized search.
n_iter_search = 20

# Seed RIPPER so the same configuration yields the same rules and CV scores
# on every run; its grow/prune partition is random. RUS and the CV splitter
# are already seeded.
clf = lw.RIPPER(
    max_rules=10,        # Moderate rule complexity
    max_rule_conds=7,    # Enough room for moderately complex conditions
    max_total_conds=35,  # Cap total conditions to avoid runaway complexity
    random_state=RANDOM_STATE
)
pipeline = ImbPipeline([
    ('undersampler', RUS),  # undersampling step
    ('classifier', clf)     # the classifier
], verbose=False)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Randomized search; refit=False because only the CV scores are needed.
rand_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=skf)
# Run the search (trailing `;` suppresses the cell output).
rand_search.fit(dev_set, dev_label);

discretizing 9 features: ['length', 'cyclist_bmi', 'climb_percentage', 'race_physical_effort', 'race_prestige', 'previous_mean_position', 'previous_mean_delta', 'previous_mean_cp', 'cyclist_previous_experience']

discretizing 9 features: ['length', 'cyclist_bmi', 'climb_percentage', 'race_physical_effort', 'race_prestige', 'previous_mean_position', 'previous_mean_delta', 'previous_mean_cp', 'cyclist_previous_experience']

growing ruleset...
initial model: []

pos_growset 26889 pos_pruneset 14053
neg_growset 26889 neg_pruneset 14053
grew rule: [previous_mean_position=<0.28^race_physical_effort=>0.4^previous_mean_delta=<160.55^race_season%spring=1^previous_mean_cp=0.0072-0.012]
pruned rule: [previous_mean_position=<0.28^race_physical_effort=>0.4^previous_mean_delta=<160.55^race_season%spring=1]
updated ruleset: [[previous_mean_position=<0.28^race_physical_effort=>0.4^previous_mean_delta=<160.55^race_season%spring=1]]

growing ruleset...
initial model: []

pos_growset 26889 pos_pruneset 1

In [18]:
# Collect the CV results, best macro-F1 rank first, and persist them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/undersampling/model_selection/{USER}_rule_based_results.csv',
    index=False,
)