# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [None]:
# To find the best hyperparameter settings, we run randomized and grid searches
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn import tree
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
import tensorflow as tf

from statistics import mean, stdev
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

RANDOM_STATE = 42

In [None]:
# Development set: read the CSV, then shuffle every row once with a fixed seed
# so that the row order of the file does not introduce bias.
dev_data = pd.read_csv('../data/ml_datasets/oversampling/dev_set.csv')
dev_data = dev_data.sample(frac=1, random_state=RANDOM_STATE)

# Test set: loaded as-is, in file order.
testing_data = pd.read_csv('../data/ml_datasets/oversampling/test_set.csv')

In [None]:
# Split the target column off each frame; `pop` removes it from the frame in
# place, so what is left in `dev_data` / `testing_data` is the feature matrix.
dev_label = dev_data.pop('label')
test_label = testing_data.pop('label')

# `dev_set` / `test_set` are aliases of the loaded frames (no copy is made).
dev_set = dev_data
test_set = testing_data

# Cast the four season indicator columns to int in both splits.
_season_columns = [
    'race_season%autumn',
    'race_season%spring',
    'race_season%summer',
    'race_season%winter',
]
for _frame in (dev_set, test_set):
    for _column in _season_columns:
        _frame[_column] = _frame[_column].astype(int)

# Number of parallel workers handed to the hyperparameter searches.
N_JOBS = 4

## Decision Tree

In [None]:
# Scorer names for the binary metrics of interest.
# NOTE(review): this dict is not passed to the search below, which optimizes
# 'f1_macro' only.
scoring_metrics = {
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

# Search space: lists are sampled uniformly, scipy distributions are drawn
# from (sp_randint's upper bound is exclusive).
param_dist = {"max_depth": [2,3,5,6,7,10,12,None],
              # between 1 and the number of feature columns
              "max_features": sp_randint(1, dev_set.shape[1] + 1),
              "min_samples_split": sp_randint(10, 51),
              "min_samples_leaf": sp_randint(10, 51),
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.3, 1: 0.7}]}
# number of configurations to sample
n_iter_search = 200
# Seed the tree as well: with max_features < n_features the candidate features
# at each split are drawn at random.
clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
# Randomized search; seeding it makes the sampled configurations (and therefore
# the ranking in cv_results_) reproducible across runs.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# run the search (trailing ';' suppresses the notebook output)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## SVM

In [None]:
# Search space: sp_uniform(loc, scale) draws C uniformly from [0.1, 10.1].
param_dist = {"C": sp_uniform(0.1, 10.0)}
# Number of configurations to sample.
# NOTE(review): with a single iteration only one value of C is evaluated.
n_iter_search = 1
# Linear SVM; seeded so that repeated runs fit the same model.
clf = LinearSVC(random_state=RANDOM_STATE)
# Randomized search; the seed fixes which C value is drawn.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# run the search (trailing ';' suppresses the notebook output)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## Naive Bayes

In [None]:
# Scorer names for the binary metrics of interest.
scoring_metrics = dict(recall='recall', precision='precision', f1='f1')

# Nothing is tuned for Gaussian Naive Bayes: the search space is empty and a
# single candidate is evaluated. The search object is used only to obtain
# cross-validated scores comparable with the other models, not for model
# selection.
param_dist = {}
n_iter_search = 1
clf = GaussianNB()
rand_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    n_jobs=1,
    scoring='f1_macro',
)
# run the cross-validation (trailing ';' suppresses the notebook output)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## KNN

- Risk: the ordinal mapping of the categorical attributes (without one-hot encoding) may cause problems for K-NN

In [None]:
# Search space: k is drawn from 10..19 (sp_randint's upper bound is exclusive).
param_dist = {'n_neighbors': sp_randint(10, 20),
              'algorithm': ['ball_tree', 'kd_tree', 'brute'],}

# K-NN is fitted on a copy of the development set without the age-group and
# season columns.
tmp_dev_set = dev_set.drop(columns=['cyclist_age_group_num', 'race_season%autumn', 'race_season%spring', 'race_season%summer', 'race_season%winter'])
# number of configurations to sample
n_iter_search = 10
# define the model
clf = KNeighborsClassifier()
# Randomized search; seeding it makes the sampled configurations reproducible.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# run the search (trailing ';' suppresses the notebook output)
rand_search.fit(tmp_dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## Rule-Based

In [None]:
# Exhaustive grid: 2 prune sizes x 3 values of k = 6 candidates.
param_dist = {"prune_size": [0.5, 0.6], "k": [1, 3, 5]}

# NOTE: GridSearchCV evaluates every combination, so this iteration count is
# not used by the search below; the assignment is kept unchanged.
n_iter_search = 10
# RIPPER rule learner; seeded so that repeated runs are reproducible.
clf = lw.RIPPER(random_state=RANDOM_STATE)
# The variable keeps the name `rand_search` used by the other sections, but
# this is a full grid search.
rand_search = GridSearchCV(estimator=clf, param_grid=param_dist, scoring='f1_macro', n_jobs=N_JOBS)
# run the grid search (trailing ';' suppresses the notebook output)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## Random Forest

In [None]:
# Seed the forest: bootstrap samples and per-split feature subsets are random.
clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Search space: lists are sampled uniformly, scipy distributions are drawn
# from (sp_randint's upper bound is exclusive).
param_dist = {"max_depth": [2,3,5,6,7,10,12,None],
              # between 1 and the number of feature columns
              "max_features": sp_randint(1, dev_set.shape[1] + 1),
              "min_samples_split": sp_randint(10, 51),
              "min_samples_leaf": sp_randint(10, 51),
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.3, 1: 0.7}],
              "n_estimators": [33, 100, 250]}

# number of configurations to sample
n_iter_search = 10
# Randomized search; seeding it makes the sampled configurations reproducible.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# run the search (trailing ';' suppresses the notebook output)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## XGBoost

In [None]:
# Gradient-boosted trees; seeded so that repeated runs are reproducible.
clf = XGBClassifier(random_state=RANDOM_STATE)
# Discrete search space: 3 x 2 x 4 = 24 combinations in total.
param_dist = {
    "n_estimators": [25, 100, 250],
    "max_depth": [2, 3],
    "learning_rate": [1, 0.1, 0.001, 0.0001]
}
# number of configurations to sample (20 of the 24 combinations)
n_iter_search = 20
# Randomized search; the seed fixes which combinations are evaluated.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# run the search (trailing ';' suppresses the notebook output)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## AdaBoost

In [None]:
# AdaBoost ensemble; seeded so that repeated runs are reproducible.
clf = AdaBoostClassifier(random_state=RANDOM_STATE)
# Discrete search space: 3 x 4 x 1 = 12 combinations in total.
param_dist = {
    "n_estimators": [25, 100, 250],
    "learning_rate": [1, 0.1, 0.001, 0.0001],
    "algorithm": ['SAMME']
}
# number of configurations to sample (10 of the 12 combinations)
n_iter_search = 10
# Randomized search; the seed fixes which combinations are evaluated.
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# run the search (trailing ';' suppresses the notebook output)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results, best-ranked configuration first,
# and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## Neural Network

In [None]:
class MyHyperModel(keras_tuner.HyperModel):
    """Small feed-forward binary classifier with explicitly supplied hyperparameters.

    ``build`` does not read any value from ``hp``: the caller chooses the
    hyperparameters and passes them as arguments.
    """

    def build(self, hp, units, dropout_rate, learning_rate):
        """Create and compile the network.

        Architecture: Dense(units, relu) -> Dropout(dropout_rate)
        -> Dense(units // 2, relu) -> Dense(1, sigmoid).

        Args:
            hp: hyperparameter container; accepted but not read here.
            units: width of the first hidden layer (the second uses
                ``units // 2``).
            dropout_rate: dropout rate applied after the first hidden layer.
            learning_rate: learning rate of the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential()

        model.add(keras.layers.Dense(
            units,
            activation='relu'))
        model.add(keras.layers.Dropout(rate=dropout_rate))
        model.add(keras.layers.Dense(
            units//2,
            activation='relu'))
        model.add(keras.layers.Dense(1, activation='sigmoid'))

        # Configure the optimizer with the chosen learning rate
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        # NOTE(review): with a single sigmoid output the 'macro' average is
        # presumably taken over one column only (the positive class), which is
        # not the same quantity as scikit-learn's 'f1_macro' - confirm before
        # comparing scores across models.
        f1 = keras.metrics.F1Score(average='macro', threshold=0.5, name="f1_score", dtype=None)
        model.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=[f1])

        return model

    def fit(self, hp, model, x, y, validation_data, epochs, batch_size, **kwargs):
        """Train ``model`` silently and return the Keras training history.

        Args:
            hp: hyperparameter container; accepted but not read here.
            model: compiled Keras model to train.
            x: training inputs.
            y: training targets.
            validation_data: data evaluated at the end of every epoch,
                forwarded to ``model.fit``.
            epochs: number of training epochs.
            batch_size: mini-batch size.
            **kwargs: forwarded unchanged to ``model.fit``.

        Returns:
            Whatever ``model.fit`` returns (the training history).
        """
        return model.fit(
            x=x,
            y=y,
            validation_data=validation_data,
            batch_size=batch_size,
            epochs=epochs,
            verbose=False,
            **kwargs,
        )

In [None]:
import random

# K-fold setup
dev_x = dev_set.to_numpy()
dev_y = dev_label.to_numpy()
kf = model_selection.KFold(n_splits=5, shuffle=False)
hyper_ae = MyHyperModel()
# Only forwarded to MyHyperModel, which does not read it. Configurations are
# NOT drawn through `hp`: outside a tuner, hp.Choice / hp.Float always return
# the default value, so every round would evaluate the same configuration.
hp = HyperParameters()

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# dropout are repeatable.
keras.utils.set_random_seed(RANDOM_STATE)
# Dedicated, seeded generator for drawing the configurations.
config_rng = random.Random(RANDOM_STATE)

rounds = 5
config_results = []

for _ in range(rounds):
    # Draw one random configuration from the search space.
    batch_size = config_rng.choice([512, 1024])
    epochs = config_rng.choice([2, 3])
    units_layer1 = config_rng.choice([32, 64, 128])
    drop_rate = config_rng.choice([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
    # log-uniform over [1e-5, 1e-2]
    learning_rate = 10 ** config_rng.uniform(-5, -2)

    f1_scores = []

    for train_index, val_index in kf.split(dev_x, dev_y):
        x_train, x_val = dev_x[train_index], dev_x[val_index]
        # Targets as column vectors, matching the single-unit output layer.
        y_train = dev_y[train_index].reshape(-1, 1)
        y_val = dev_y[val_index].reshape(-1, 1)

        # Build a fresh model for every fold: reusing one model would carry
        # over weights already trained on rows of the current validation fold.
        model = hyper_ae.build(hp, units_layer1, drop_rate, learning_rate)

        # Fit on the training folds and evaluate on the validation fold.
        # (Named `history` so the imported `sklearn.metrics` is not shadowed.)
        history = hyper_ae.fit(hp, model, x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size)
        # Keep the validation F1 of the last epoch.
        f1_scores.append(history.history['val_f1_score'][-1])

    mean_f1, std_f1 = mean(f1_scores), stdev(f1_scores)
    config = {
        "batch_size": batch_size,
        "epochs": epochs,
        "units_layer1": units_layer1,
        "units_layer2": units_layer1//2,
        "drop_rate": drop_rate,
        "learning_rate": learning_rate,
        "mean_f1": mean_f1,
        "std_f1": std_f1
    }

    config_results.append(config)

# One row per evaluated configuration, best mean F1 first.
df = pd.DataFrame(config_results)
df.sort_values(by='mean_f1', inplace=True, ascending=False)
    