# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
#to find the best set of parameter setting, we can run a grid search
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn import tree
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
import tensorflow as tf

from statistics import mean, stdev
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import loguniform as sp_loguniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

RANDOM_STATE = 42

2024-12-12 17:01:25.867314: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-12 17:01:26.236284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734019286.377456   56517 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734019286.417072   56517 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-12 17:01:26.746509: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Load the oversampled development set, then shuffle its rows once with a fixed
# seed so that the stored row order cannot introduce bias.
dev_data = pd.read_csv('../../data/ml_datasets/oversampling/dev_set.csv')
dev_data = dev_data.sample(frac=1, random_state=RANDOM_STATE)

# The test set is read as-is (no shuffling).
testing_data = pd.read_csv('../../data/ml_datasets/oversampling/test_set.csv')

In [3]:
# Detach the target column; what remains in each frame is the feature matrix.
dev_label = dev_data.pop('label')
test_label = testing_data.pop('label')

# Aliases only: dev_set/test_set are the same objects as dev_data/testing_data.
dev_set = dev_data
test_set = testing_data

# Cast the one-hot season indicators to int in both feature frames.
season_columns = [
    'race_season%autumn',
    'race_season%spring',
    'race_season%summer',
    'race_season%winter',
]
for frame in (dev_set, test_set):
    for season_column in season_columns:
        frame[season_column] = frame[season_column].astype(int)

N_JOBS = 8       # passed as n_jobs to the hyper-parameter searches below
USER = 'Jacopo'  # prefix of the result files written by this notebook

In [4]:
# Print column names, non-null counts and dtypes of the development features.
dev_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 573950 entries, 453303 to 121958
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   length                       573950 non-null  float64
 1   cyclist_bmi                  573950 non-null  float64
 2   cyclist_age_group            573950 non-null  int64  
 3   climb_percentage             573950 non-null  float64
 4   race_physical_effort         573950 non-null  float64
 5   race_prestige                573950 non-null  float64
 6   previous_mean_position       573950 non-null  float64
 7   previous_mean_delta          573950 non-null  float64
 8   previous_mean_cp             573950 non-null  float64
 9   cyclist_previous_experience  573950 non-null  float64
 10  race_season%autumn           573950 non-null  int64  
 11  race_season%spring           573950 non-null  int64  
 12  race_season%summer           573950 non-null  int64  
 13 

## Decision Tree

In [24]:
# Randomized hyper-parameter search for a decision tree, ranked by macro-F1.

# NOTE(review): `scoring_metrics` is not passed to the search below, which
# scores on 'f1_macro' only; it is kept here in case other cells rely on it.
scoring_metrics = {
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

# Candidate values / distributions to sample from.
param_dist = {"max_depth": [3, 5, 10, 15, 20, None],
              # sp_randint's upper bound is exclusive: draws 3..n_features inclusive
              "max_features": sp_randint(3, dev_set.shape[1] + 1),
              "min_samples_split": [20, 30, 50, 100],
              "min_samples_leaf": [10, 20, 30, 50, 100],
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}]} # class weights are related to over/undersampling chosen

# Number of sampled configurations (the original note records "Total-Iteration: 400").
n_iter_search = 200

# Seed the tree as well: with max_features < n_features the candidate features
# are drawn at random at every split.
clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)

# Seed the sampler so the same configurations are drawn on every run.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# Run the randomized search.
rand_search.fit(dev_set, dev_label);

In [25]:
# Rank the sampled configurations (best first), persist them, preview the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_decision_tree_results.csv', index=False)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,4.718205,0.116322,0.066384,0.006037,,gini,20,6,20,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.769941,0.767211,0.770764,0.770186,0.769687,0.769558,0.001226,1
1,7.864221,0.368792,0.067114,0.007817,balanced,gini,15,9,10,30,"{'class_weight': 'balanced', 'criterion': 'gin...",0.76621,0.771548,0.769093,0.769469,0.762214,0.767707,0.003231,2
11,4.582958,0.379503,0.054124,0.012172,,gini,15,7,20,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.746586,0.747104,0.742633,0.750681,0.746374,0.746676,0.002556,3
10,8.212434,0.131978,0.041943,0.004535,balanced,entropy,10,13,10,30,"{'class_weight': 'balanced', 'criterion': 'ent...",0.710613,0.705177,0.710442,0.712007,0.708564,0.709361,0.002361,4
3,3.380584,0.145474,0.050397,0.004474,,gini,10,6,100,30,"{'class_weight': None, 'criterion': 'gini', 'm...",0.70435,0.701955,0.706879,0.699936,0.698219,0.702268,0.003083,5


## SVM

In [26]:
# Randomized search over the regularisation strength C of a linear SVM,
# ranked by macro-F1.

# C is drawn log-uniformly between 1e-4 and 1e2.
param_dist = {"C": sp_loguniform(1e-4, 1e2)}

# Number of sampled configurations (the original note records "Total-Iteration: 100").
n_iter_search = 50

# Seeded so repeated runs fit the same model for a given C.
clf = LinearSVC(random_state=RANDOM_STATE)

# Seed the sampler so the same C values are drawn on every run.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# Run the randomized search.
rand_search.fit(dev_set, dev_label);

In [27]:
# Rank the sampled configurations (best first), persist them, preview the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_svm_results.csv', index=False)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,10.546039,0.637382,0.045529,0.006342,0.002744,{'C': 0.002744023535579013},0.661214,0.656598,0.660915,0.659334,0.659234,0.659459,0.00164,1
7,11.289573,0.722306,0.049562,0.004089,0.169163,{'C': 0.16916283055859838},0.661302,0.656146,0.661021,0.659361,0.658747,0.659315,0.001856,2
9,8.172099,1.993295,0.029893,0.007193,0.032113,{'C': 0.032113036509577364},0.661198,0.65591,0.661125,0.659291,0.65893,0.659291,0.001927,3
3,10.063736,0.681037,0.048676,0.002149,0.041119,{'C': 0.041118676935830094},0.661154,0.65605,0.661099,0.659282,0.658695,0.659256,0.001876,4
4,9.871431,0.657744,0.043008,0.002558,1.11589,{'C': 1.115889685389457},0.660876,0.656085,0.661108,0.659422,0.658782,0.659255,0.001809,5


## Naive Bayes

In [12]:
# Gaussian Naive Bayes is evaluated with an empty search space: the single
# "candidate" is cross-validated for comparability with the other models,
# not for model selection.
scoring_metrics = dict(recall='recall', precision='precision', f1='f1')

param_dist = dict()   # nothing to tune
n_iter_search = 1     # one (default) configuration
clf = GaussianNB()

rand_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=1,
    scoring='f1_macro',
)
rand_search.fit(dev_set, dev_label);

In [13]:
# Persist the cross-validation scores and preview them.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_naive_bayes_results.csv', index=False)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.184915,0.028362,0.041469,0.016511,{},0.622823,0.625776,0.628748,0.62317,0.624071,0.624917,0.002171,1


## KNN

- Risk: the integer mapping of the ordinal categorical attributes (without one-hot encoding) may cause problems for K-NN.

In [28]:
# Exhaustive grid for K-NN. The n_neighbors values are Jacopo's share of the
# grid; Simone's share is [40, 50].
param_dist = {
    'n_neighbors': [5, 15, 25],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
}

# K-NN is fitted without the age-group and one-hot season columns.
excluded_columns = [
    'cyclist_age_group',
    'race_season%autumn',
    'race_season%spring',
    'race_season%summer',
    'race_season%winter',
]
tmp_dev_set = dev_set.drop(columns=excluded_columns)

clf = KNeighborsClassifier()

# A full grid search this time (the name rand_search is kept for the next cell).
rand_search = GridSearchCV(
    clf,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring='f1_macro',
)
rand_search.fit(tmp_dev_set, dev_label);

In [29]:
# Rank the grid points (best first), persist them, preview the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_knn_results.csv', index=False)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,1.452134,0.140745,27.782911,0.584419,ball_tree,22,"{'algorithm': 'ball_tree', 'n_neighbors': 22}",0.663209,0.658517,0.660607,0.662303,0.659689,0.660865,0.001704,1
4,1.793398,0.077293,33.511456,0.540421,ball_tree,26,"{'algorithm': 'ball_tree', 'n_neighbors': 26}",0.655777,0.651345,0.65492,0.656566,0.653741,0.65447,0.001822,2
6,0.08834,0.014553,189.716327,1.626455,brute,25,"{'algorithm': 'brute', 'n_neighbors': 25}",0.654567,0.652546,0.654936,0.656247,0.653374,0.654334,0.00128,3
2,0.081664,0.010948,214.730393,20.522446,brute,29,"{'algorithm': 'brute', 'n_neighbors': 29}",0.649895,0.647049,0.648713,0.650648,0.6478,0.648821,0.001318,4
5,2.196515,0.204794,11.631383,0.073614,kd_tree,31,"{'algorithm': 'kd_tree', 'n_neighbors': 31}",0.647298,0.644601,0.647479,0.648676,0.646566,0.646924,0.001345,5


## Rule-Based

In [None]:
# Randomized hyper-parameter search for the RIPPER rule learner, ranked by macro-F1.
param_dist = {
    'prune_size': sp_uniform(0.1, 0.4),  # uniform on [0.1, 0.5] (loc=0.1, scale=0.4)
    'k': sp_randint(1, 11)               # integers from 1 to 10 (upper bound exclusive)
}

# Number of sampled configurations.
n_iter_search = 10

# Seed the learner so repeated runs are comparable.
clf = lw.RIPPER(random_state=RANDOM_STATE)

# Seed the sampler so the same configurations are drawn on every run.
rand_search = RandomizedSearchCV(estimator=clf, param_distributions=param_dist, n_iter=n_iter_search,
                                 scoring='f1_macro', n_jobs=N_JOBS,
                                 random_state=RANDOM_STATE)
# Run the randomized search.
rand_search.fit(dev_set, dev_label);

In [None]:
# Rank the sampled configurations (best first), persist them, preview the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_rule_based_results.csv', index=False)
df.head()

## Random Forest

In [None]:
# Randomized hyper-parameter search for a random forest, ranked by macro-F1.
# The forest is seeded so bootstrap samples and feature draws repeat across runs.
clf = RandomForestClassifier(random_state=RANDOM_STATE)

param_dist = {"max_depth": [5, 10, 20, None],
              # sp_randint's upper bound is exclusive: draws 3..n_features inclusive
              "max_features": sp_randint(3, dev_set.shape[1] + 1),
              "min_samples_split": [20, 50, 100],
              "min_samples_leaf": [10, 30, 50, 100],
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
              "n_estimators": [50, 100, 150]}

n_iter_search = 10
# Seed the sampler so the same configurations are drawn on every run.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
rand_search.fit(dev_set, dev_label);

In [None]:
# Rank the sampled configurations (best first), persist them, preview the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_random_forest_results.csv', index=False)
df.head()

## XGBoost

In [None]:
# Randomized hyper-parameter search for XGBoost, ranked by macro-F1.
# Estimator and sampler are both seeded, consistently with the other searches.
clf = XGBClassifier(random_state=RANDOM_STATE)
param_dist = {
    "n_estimators": [25, 50, 100, 250, 500],
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [1, 0.1, 0.01, 0.001, 0.0001]
}
n_iter_search = 20
# Seed the sampler so the same configurations are drawn on every run.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
rand_search.fit(dev_set, dev_label);

In [None]:
# Rank the sampled configurations (best first), persist them, preview the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_xgb_results.csv', index=False)
df.head()

## AdaBoost

In [None]:
# Randomized hyper-parameter search for AdaBoost, ranked by macro-F1.
# Estimator and sampler are both seeded, consistently with the other searches.
clf = AdaBoostClassifier(random_state=RANDOM_STATE)
param_dist = {
    "n_estimators": [25, 50, 100, 250, 500],  # includes 50 and 500 for more flexibility
    "learning_rate": [1, 0.1, 0.01, 0.001, 0.0001],  # includes 0.01 as an intermediate value
    "algorithm": ['SAMME']  # only 'SAMME' is searched ('SAMME.R' is not in the list)
}
n_iter_search = 10
# Seed the sampler so the same configurations are drawn on every run.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
rand_search.fit(dev_set, dev_label);

In [None]:
# Rank the sampled configurations (best first), persist them, preview the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_ada_boost_results.csv', index=False)
df.head()

## Neural Network

In [None]:
class MyHyperModel(keras_tuner.HyperModel):
    """Binary classifier with two hidden layers, configured by explicit arguments.

    Both ``build`` and ``fit`` receive the concrete hyper-parameter values from
    the caller; the ``hp`` argument is accepted but not read by either method.
    """

    def build(self, hp, units, dropout_rate, learning_rate):
        """Create and compile the network.

        Args:
            hp: Hyper-parameter container; not used inside this method.
            units: Width of the first hidden layer; the second uses ``units // 2``.
            dropout_rate: Dropout rate applied after the first hidden layer.
            learning_rate: Learning rate of the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model, tracking macro F1
            (threshold 0.5) under the metric name ``"f1_score"``.
        """
        model = keras.Sequential()

        model.add(keras.layers.Dense(
            units,
            activation='relu'))
        model.add(keras.layers.Dropout(rate=dropout_rate))
        model.add(keras.layers.Dense(
            units//2,
            activation='relu'))
        # Single sigmoid unit: the output is used for binary classification.
        model.add(keras.layers.Dense(1, activation='sigmoid'))

        # Configure the optimizer with the chosen learning rate.
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        f1 = keras.metrics.F1Score(average='macro', threshold=0.5, name="f1_score", dtype=None)
        model.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=[f1])

        return model

    def fit(self, hp, model, x, y, validation_data, epochs, batch_size, **kwargs):
        """Train ``model`` and return what ``model.fit`` returns.

        Args:
            hp: Hyper-parameter container; not used inside this method.
            model: Compiled model, e.g. the result of ``build``.
            x: Training inputs.
            y: Training targets.
            validation_data: Forwarded to ``model.fit`` as ``validation_data``.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            **kwargs: Extra keyword arguments forwarded to ``model.fit``.
                ``verbose`` defaults to ``False`` but may be overridden here.
        """
        # Silent by default; setdefault lets a caller pass verbose explicitly
        # without triggering a duplicate-keyword TypeError.
        kwargs.setdefault("verbose", False)
        return model.fit(
            x=x,
            y=y,
            validation_data=validation_data,
            batch_size=batch_size,
            epochs=epochs,
            **kwargs,
        )

In [None]:
# Random search over the neural-network hyper-parameters; every sampled
# configuration is scored with 5-fold cross-validation on validation macro-F1.
dev_x = dev_set.to_numpy()
dev_y = dev_label.to_numpy().reshape(-1, 1)  # column vector, matching the single sigmoid output
kf = model_selection.KFold(n_splits=5, shuffle=False)  # rows were already shuffled when loaded
hyper_ae = MyHyperModel()
hp = HyperParameters()  # only forwarded to build()/fit(), which do not read it

rounds = 10

# Calling hp.Choice / hp.Float on a bare HyperParameters object returns the
# default value every time, so all rounds would evaluate the same configuration.
# ParameterSampler draws a different, seeded configuration per round instead.
search_space = {
    "batch_size": [256, 512, 1024],
    "epochs": [2, 3, 5, 10],
    "units_layer1": [32, 64, 128, 256],
    "drop_rate": [round(0.05 * step, 2) for step in range(11)],  # 0.0, 0.05, ..., 0.5
    "learning_rate": sp_loguniform(1e-5, 1e-2),
}
sampler = model_selection.ParameterSampler(search_space, n_iter=rounds, random_state=RANDOM_STATE)

# Seed weight initialisation, dropout masks and batch shuffling.
keras.utils.set_random_seed(RANDOM_STATE)

config_results = []

for params in sampler:
    batch_size = params["batch_size"]
    epochs = params["epochs"]
    units_layer1 = params["units_layer1"]
    drop_rate = params["drop_rate"]
    learning_rate = float(params["learning_rate"])

    f1_scores = []

    for train_index, val_index in kf.split(dev_x):
        x_train, x_val = dev_x[train_index], dev_x[val_index]
        y_train, y_val = dev_y[train_index], dev_y[val_index]

        # Build a fresh model for every fold: reusing one model would carry
        # weights trained on earlier folds (including this fold's validation
        # rows) into the current fold and inflate the validation score.
        model = hyper_ae.build(hp, units_layer1, drop_rate, learning_rate)

        # Fit on the training split and evaluate on the validation split.
        # Named `history` so the imported sklearn `metrics` module is not shadowed.
        history = hyper_ae.fit(hp, model, x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size)
        f1_scores.append(history.history['val_f1_score'][-1])

    mean_f1, std_f1 = mean(f1_scores), stdev(f1_scores)
    config = {
        "batch_size": batch_size,
        "epochs": epochs,
        "units_layer1": units_layer1,
        "units_layer2": units_layer1//2,
        "drop_rate": drop_rate,
        "learning_rate": learning_rate,
        "mean_f1": mean_f1,
        "std_f1": std_f1
    }

    config_results.append(config)

# Best configuration first, then persist the whole table.
df = pd.DataFrame(config_results).sort_values(by='mean_f1', ascending=False)
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_nn_results.csv', index=False)
    