# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [None]:
# To find the best hyper-parameter setting for each model, we run randomized and grid searches
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import random
import numpy as np

from sklearn import tree
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
import tensorflow as tf

from statistics import mean, stdev
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import loguniform as sp_loguniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# Seed passed as `random_state` when the training set is shuffled at load time.
RANDOM_STATE = 42

In [9]:
# Load the train / validation / test splits stored under the `oversampling` folder.
# Only the training rows are shuffled (with a fixed seed), so as not to introduce bias.
train_data = (
    pd.read_csv('../../data/ml_datasets/oversampling/train_set.csv')
    .sample(frac=1, random_state=RANDOM_STATE)
)
val_data = pd.read_csv('../../data/ml_datasets/oversampling/val_set.csv')
testing_data = pd.read_csv('../../data/ml_datasets/oversampling/test_set.csv')

In [10]:
# Detach the target column from the train and test frames (pop removes it in place).
train_label = train_data.pop('label')
test_label = testing_data.pop('label')
# NOTE(review): nothing is popped from val_data, so if val_set.csv contains a 'label'
# column it stays among the columns of val_set -- confirm this is intended.

# The *_set names are aliases of the loaded frames, not copies.
train_set = train_data
val_set = val_data
test_set = testing_data

# Cast the four `race_season%*` columns to int in every split.
for split in (train_set, val_set, test_set):
    for season_col in ('race_season%autumn', 'race_season%spring',
                       'race_season%summer', 'race_season%winter'):
        split[season_col] = split[season_col].astype(int)

N_JOBS = 8       # value passed as n_jobs to most of the searches below
USER = 'Jacopo'  # prefix of the result CSV files written by this notebook

In [None]:
# Print a summary of the training feature frame (after the label has been popped and the season columns cast).
train_set.info()

In [12]:
# Scoring setup used by every model search below: macro-F1 plus the F1 of each individual class
def f1_class_scorer(class_index):
    """Build a scikit-learn scorer that reports the F1 of a single class.

    Args:
        class_index: position of the wanted class in the array returned by
            ``f1_score(..., average=None)``.

    Returns:
        The scorer produced by ``make_scorer`` around the per-class F1 function.
    """
    def per_class_f1(y_true, y_pred):
        # One F1 value per class; keep only the requested one.
        all_class_scores = f1_score(y_true, y_pred, average=None)
        return all_class_scores[class_index]

    return make_scorer(per_class_f1)

# Scorers for class 0 and class 1
f1_class_0 = f1_class_scorer(0)
f1_class_1 = f1_class_scorer(1)


# Multi-metric scoring passed to every search:
#   f1_macro -> macro-averaged F1 over both classes
#   f1_0     -> F1 of class 0 only
#   f1_1     -> F1 of class 1 only
scoring = dict(
    f1_macro='f1_macro',
    f1_0=f1_class_0,
    f1_1=f1_class_1,
)

## Decision Tree

In [31]:
# Model under test
clf = tree.DecisionTreeClassifier()

# Search space. `max_features` is an integer drawn from [3, number of columns of train_set];
# the class weights are related to the over/undersampling chosen.
param_dist = dict(
    max_depth=[3, 5, 10, 15, 20, None],
    max_features=sp_randint(3, train_set.shape[1] + 1),
    min_samples_split=[20, 30, 50, 100],
    min_samples_leaf=[10, 20, 30, 50, 100],
    criterion=["entropy", "gini"],
    class_weight=['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
)

# Number of sampled configurations
n_iter_search = 200  # Total-Iteration: 400

# Randomized search; refit=False because only the cross-validation scores are collected.
rand_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
)
# Run the search
rand_search.fit(train_set, train_label);

In [None]:
# Order the evaluated configurations by their macro-F1 rank, save the whole table and preview the first rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_decision_tree_results.csv', index=False)
df[['mean_test_f1_macro', 'mean_test_f1_0', 'mean_test_f1_1', 'rank_test_f1_macro']].head()

## SVM

In [26]:
# Model under test: linear SVM
clf = LinearSVC()

# Search space: regularisation strength C drawn log-uniformly between 1e-4 and 1e2
param_dist = dict(C=sp_loguniform(1e-4, 1e2))

# Number of sampled configurations
n_iter_search = 50  # Total-Iteration: 100

# Randomized search; refit=False because only the cross-validation scores are collected.
rand_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
)
# Run the search
rand_search.fit(train_set, train_label);

In [None]:
# Order the evaluated configurations by their macro-F1 rank and save the table.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_svm_results.csv', index=False)

## Naive Bayes

In [12]:
# NOTE(review): `scoring_metrics` is defined here but the search below is scored with
# the shared `scoring` dict -- confirm whether this dict is still needed.
scoring_metrics = dict(
    recall='recall',
    precision='precision',
    f1='f1',
)

# Model under test: Gaussian Naive Bayes, with nothing to tune
clf = GaussianNB()
param_dist = {}
n_iter_search = 1

# Cross-validation for comparability with the other models, not model selection:
# a single "configuration" is evaluated through the same search API.
rand_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=1,
    scoring=scoring,
    refit=False,
)
# Run the cross-validation
rand_search.fit(train_set, train_label);

In [None]:
# Sort the cross-validation results by their macro-F1 rank and save the table.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_naive_bayes_results.csv', index=False)

## KNN

- Rischiamo che il mapping degli attributi categorici ordinali (senza one-hot) crei problemi nel K-NN

In [28]:
# Model under test
clf = KNeighborsClassifier()

# Exhaustive grid; the n_neighbors values are split between users.
param_dist = dict(
    n_neighbors=[5, 15, 25],  # Jacopo
    # n_neighbors=[40, 50],   # Simone
    algorithm=['ball_tree', 'kd_tree', 'brute'],
)

# K-NN is fitted without the age-group column and the four season columns.
tmp_train_set = train_set.drop(columns=[
    'cyclist_age_group',
    'race_season%autumn',
    'race_season%spring',
    'race_season%summer',
    'race_season%winter',
])

# Grid search (kept under the name `rand_search`, which the following cell reads);
# refit=False because only the cross-validation scores are collected.
rand_search = GridSearchCV(
    clf,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
)
# Run the search
rand_search.fit(tmp_train_set, train_label);

In [None]:
# Order the evaluated configurations by their macro-F1 rank and save the table.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_knn_results.csv', index=False)

## Random Forest

In [15]:
# Model under test
clf = RandomForestClassifier()

# Search space. `max_features` is an integer drawn from [3, number of columns of train_set].
param_dist = dict(
    max_depth=[5, 10, 20, None],
    max_features=sp_randint(3, train_set.shape[1] + 1),
    min_samples_split=[20, 50, 100],
    min_samples_leaf=[10, 30, 50, 100],
    criterion=["entropy", "gini"],
    class_weight=['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
    n_estimators=[50, 100, 150],
)

# Number of sampled configurations
n_iter_search = 50  # Total-Iteration: 100

# Randomized search; refit=False because only the cross-validation scores are collected.
rand_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
)
rand_search.fit(train_set, train_label);

In [None]:
# Order the evaluated configurations by their macro-F1 rank and save the table.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_random_forest_results.csv', index=False)

## XGBoost

In [None]:
# Model under test
clf = XGBClassifier()

# Exhaustive grid; the n_estimators values are split between users.
param_dist = dict(
    n_estimators=[25, 50, 100],  # Jacopo
    # n_estimators=[250, 500],   # Simone
    max_depth=[2, 3, 4, 5],
    learning_rate=[1, 0.1, 0.01, 0.001, 0.0001],
)

# Grid search (kept under the name `rand_search`, which the following cell reads);
# refit=False because only the cross-validation scores are collected.
rand_search = GridSearchCV(
    clf,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
)
rand_search.fit(train_set, train_label);

In [None]:
# Order the evaluated configurations by their macro-F1 rank and save the table.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_xgb_results.csv', index=False)

## AdaBoost

In [18]:
# Model under test
clf = AdaBoostClassifier()

# Exhaustive grid; the n_estimators values are split between users.
param_dist = dict(
    n_estimators=[25, 50, 100],  # Jacopo
    # n_estimators=[250, 500],   # Simone
    learning_rate=[1, 0.1, 0.01, 0.001, 0.0001],
    algorithm=['SAMME'],
)

# Grid search (kept under the name `rand_search`, which the following cell reads);
# refit=False because only the cross-validation scores are collected.
rand_search = GridSearchCV(
    clf,
    param_grid=param_dist,
    n_jobs=N_JOBS,
    scoring=scoring,
    refit=False,
)
rand_search.fit(train_set, train_label);

In [None]:
# Order the evaluated configurations by their macro-F1 rank and save the table.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_ada_boost_results.csv', index=False)

## Neural Network

In [15]:
class MyHyperModel(keras_tuner.HyperModel):
    """Small feed-forward binary classifier exposed through the Keras Tuner HyperModel API.

    Both methods receive the hyper-parameter values as explicit arguments; the
    ``hp`` object is accepted but not read inside this class.
    """

    def build(self, hp, units, dropout_rate, learning_rate):
        """Create and compile the network.

        Args:
            hp: hyper-parameter container (not used in the body).
            units: width of the first hidden layer; the second hidden layer
                has ``units // 2`` neurons.
            dropout_rate: rate of the dropout layer placed after the first hidden layer.
            learning_rate: learning rate given to the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        network = keras.Sequential([
            keras.layers.Dense(units, activation='relu'),
            keras.layers.Dropout(rate=dropout_rate),
            keras.layers.Dense(units // 2, activation='relu'),
            keras.layers.Dense(1, activation='sigmoid'),
        ])

        # NOTE(review): with a single sigmoid output this metric presumably averages over
        # one column only (i.e. the positive class) -- confirm before comparing it with
        # scikit-learn's macro-F1.
        f1 = keras.metrics.F1Score(average='macro', threshold=0.5, name="f1_macro", dtype=None)

        # Configure the optimizer with the chosen learning rate
        adam = keras.optimizers.Adam(learning_rate=learning_rate)
        network.compile(
            optimizer=adam,
            loss='binary_crossentropy',
            metrics=[f1],
        )
        return network

    def fit(self, hp, model, x, y, validation_data, epochs, batch_size, **kwargs):
        """Train ``model`` silently and return whatever ``model.fit`` returns.

        Args:
            hp: hyper-parameter container (not used in the body).
            model: compiled model to train.
            x, y: training inputs and targets.
            validation_data: forwarded unchanged to ``model.fit``.
            epochs: number of training epochs.
            batch_size: mini-batch size.
            **kwargs: any extra keyword argument, forwarded to ``model.fit``.
        """
        fit_arguments = dict(
            x=x,
            y=y,
            validation_data=validation_data,
            batch_size=batch_size,
            epochs=epochs,
            verbose=False,
        )
        return model.fit(**fit_arguments, **kwargs)

In [None]:
# Random search over the neural-network hyper-parameters; every sampled
# configuration is scored with K-fold cross-validation on the training set.
dev_x = train_set.to_numpy()
dev_y = train_label.to_numpy()
# KFold parameters (no shuffling here: the training frame is already shuffled when it is loaded)
kf = model_selection.KFold(n_splits=5, shuffle=False)


rounds = 50  # number of sampled configurations
config_results = []

for _ in range(rounds):
    hp = HyperParameters()
    hyper_ae = MyHyperModel()
    # Draw one random configuration.
    batch_size = hp.Fixed("batch_size", random.choice([256, 512, 1024])) 
    epochs = hp.Fixed("epochs", random.choice([5, 10])) 
    units_layer1 = hp.Fixed('units_layer1', random.choice([32, 64, 128, 256]))  
    drop_rate = hp.Fixed('rate', random.choice(np.arange(0., 0.9, 0.2))) 
    learning_rate = hp.Fixed("learning_rate", random.choice(np.logspace(-5, -3.5, num=10))) # Jacopo
    #learning_rate = hp.Fixed("learning_rate", random.choice(np.logspace(-3.5, -2, num=10))) Simone

    print(f"Training with batch_size={batch_size}, epochs={epochs}, units_layer1={units_layer1}, drop_rate={drop_rate}, learning_rate={learning_rate}")

    f1_macro_scores = []
    f1_0_scores = []
    f1_1_scores = []

    for train_index, val_index in kf.split(dev_x, dev_y):
        x_train, x_val = dev_x[train_index], dev_x[val_index]
        # Targets are reshaped to column vectors before being handed to Keras.
        y_train = dev_y[train_index].reshape(-1, 1)
        y_val = dev_y[val_index].reshape(-1, 1)

        # Build a FRESH model for every fold. Reusing one model across folds keeps the
        # weights learned on the previous folds, whose training data includes the rows
        # of the current validation fold, so the scores would be optimistically biased.
        keras.backend.clear_session()  # release the previous fold's model state
        model = hyper_ae.build(hp, units_layer1, drop_rate, learning_rate)

        # Fit the model on this fold's training data. The returned history is not kept:
        # the scores are computed from the predictions below.
        hyper_ae.fit(hp, model, x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size)

        val_out = model.predict(x_val, verbose=False)
        val_out = (val_out >= 0.5).astype(int).ravel()

        # Per-class F1 from scikit-learn, the same function behind the scorers used for
        # the other models. The macro score is the mean of the two per-class scores;
        # the Keras F1Score metric on a single sigmoid output only covers the positive class.
        per_class_f1 = f1_score(y_val.ravel(), val_out, labels=[0, 1], average=None)
        f1_0_scores.append(float(per_class_f1[0]))
        f1_1_scores.append(float(per_class_f1[1]))
        f1_macro_scores.append(float(per_class_f1.mean()))

    mean_f1_macro, std_f1_macro = mean(f1_macro_scores), stdev(f1_macro_scores)
    mean_f1_0, std_f1_0 = mean(f1_0_scores), stdev(f1_0_scores)
    mean_f1_1, std_f1_1 = mean(f1_1_scores), stdev(f1_1_scores)
    config = {
        "batch_size": batch_size,
        "epochs": epochs,
        "units_layer1": units_layer1,
        "units_layer2": units_layer1//2,
        "drop_rate": drop_rate,
        "learning_rate": learning_rate,
        "mean_f1_macro": mean_f1_macro,
        "std_f1_macro": std_f1_macro,
        "mean_f1_0": mean_f1_0,
        "std_f1_0": std_f1_0,
        "mean_f1_1": mean_f1_1,
        "std_f1_1": std_f1_1
    }

    config_results.append(config)

# Best configuration first, then persist the whole table.
df = pd.DataFrame(config_results).sort_values(by='mean_f1_macro', ascending=False)
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_nn_results.csv', index=False)
    

## Rule-Based

In [None]:
# Model under test: RIPPER rule learner
clf = lw.RIPPER()

# Search space
param_dist = dict(
    prune_size=sp_uniform(0.1, 0.4),  # uniform distribution between 0.1 and 0.5
    k=sp_randint(1, 11),              # integers between 1 and 10
)

# Number of sampled configurations
n_iter_search = 10

# Randomized search; refit=False because only the cross-validation scores are collected.
rand_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=scoring,
    refit=False,
    n_jobs=2,
)
# Run the search
rand_search.fit(train_set, train_label);

In [None]:
# Order the evaluated configurations by their macro-F1 rank and save the table.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_rule_based_results.csv', index=False)