# Classification: Bayesian Search

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [None]:
# To find the best hyperparameter configuration, we run a Bayesian search
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import random
import numpy as np

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn import tree
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
import tensorflow as tf
from sklearn.model_selection import PredefinedSplit

import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from statistics import mean, stdev
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import loguniform as sp_loguniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score


## Parameter Selection

In [12]:
# Merge the random-search result files produced separately by the two users
# into a single CSV per model.
USER_1 = 'Jacopo'
USER_2 = 'Simone'

models = ['ada_boost', 'nn', 'xgb', 'naive_bayes', 'random_forest', 'decision_tree', 'svm', 'rule_based', 'knn']

for model in models:
    path_1 = f'../../data/ml_datasets/oversampling/model_selection/{USER_1}_{model}_results.csv'
    path_2 = f'../../data/ml_datasets/oversampling/model_selection/{USER_2}_{model}_results.csv'

    concatenate_path = f'../../data/ml_datasets/oversampling/model_selection/{model}_results.csv'

    df1 = pd.read_csv(path_1)
    df2 = pd.read_csv(path_2)

    # Derive the macro columns of the second file from its per-class columns.
    # The mean goes into 'mean_test_f1_macro' and the std into
    # 'std_test_f1_macro'; previously the std overwrote the mean column.
    df2['mean_test_f1_macro'] = (df2['mean_test_f1_1'] + df2['mean_test_f1_0']) / 2
    # NOTE: averaging the per-class stds only approximates the std of the macro F1.
    df2['std_test_f1_macro'] = (df2['std_test_f1_1'] + df2['std_test_f1_0']) / 2

    # Stack the rows of the two files
    df_concatenato = pd.concat([df1, df2], ignore_index=True)

    # Save the merged result to a new CSV
    df_concatenato.to_csv(concatenate_path, index=False)

In [2]:
# Score columns displayed when comparing models: mean/std of the micro F1,
# of the per-class F1 (classes 1 and 0) and of the macro F1.
columns_to_see = [
    'mean_test_f1_micro', 'std_test_f1_micro',
    'mean_test_f1_1', 'std_test_f1_1',
    'mean_test_f1_0', 'std_test_f1_0',
    'mean_test_f1_macro', 'std_test_f1_macro',
]

In [11]:
# df = pd.read_csv('../../data/ml_datasets/oversampling/model_selection/ada_boost_results.csv')
# df.sort_values(by='mean_test_f1_micro', ascending=False, inplace=True)
# params= [col for col in df.columns if col.startswith("param_classifier__")]
# df.head(n=10)[columns_to_see+params]

In [None]:
# Build one summary table: every row of the neural-network results plus the
# ten best configurations (by macro F1) of each other model.
models = ['ada_boost', 'nn', 'xgb', 'naive_bayes', 'random_forest', 'decision_tree', 'svm', 'rule_based', 'knn']

# Score columns only. 'model' is filtered out so that the cell can be re-run
# after columns_to_see has been extended at the bottom of this cell.
metric_columns = [col for col in columns_to_see if col != 'model']
summary_columns = ['model'] + metric_columns

# The nn file uses column names without the 'test_' infix; align them with the
# naming used by the other result files.
nn_results = pd.read_csv('../../data/ml_datasets/oversampling/model_selection/nn_results.csv')
nn_results = nn_results.rename(columns={'mean_f1_micro': 'mean_test_f1_micro', 
                                        'std_f1_micro': 'std_test_f1_micro',
                                        'mean_f1_1': 'mean_test_f1_1',
                                        'std_f1_1': 'std_test_f1_1',
                                        'mean_f1_0': 'mean_test_f1_0',
                                        'std_f1_0': 'std_test_f1_0',
                                        'mean_f1_macro': 'mean_test_f1_macro', 
                                        'std_f1_macro': 'std_test_f1_macro',})
print(nn_results.columns)
# assign() returns a new frame, so no value is written into a slice
top_results = [nn_results[metric_columns].assign(model='nn')[summary_columns]]
models.remove('nn')

for model in models:
    path = f'../../data/ml_datasets/oversampling/model_selection/{model}_results.csv'

    df = pd.read_csv(path)
    df['model'] = model
    df['mean_test_f1_macro'] = (df['mean_test_f1_1'] + df['mean_test_f1_0']) / 2
    df = df.sort_values(by='mean_test_f1_macro', ascending=False).head(10)
    # 'mean_test_f1_macro' is already part of the selection; listing it a
    # second time would create a duplicated column label and break the concat.
    df = df[summary_columns]

    top_results.append(df)

# Concatenate once instead of growing the frame inside the loop
df_results = pd.concat(top_results, axis=0)

# Keep the side effect of the original cell: 'model' becomes the first
# displayed column (idempotent across re-runs).
columns_to_see = summary_columns

In [None]:
# Rank all collected configurations by macro F1 (best first) and show the top 40
df_results = df_results.sort_values(by='mean_test_f1_macro', ascending=False)
df_results.head(40)

Winning models for the oversampling dataset:
- Random Forests
- XGB
- Decision Tree
- Rule-Based

## Bayesian Search

In [2]:
RANDOM_STATE = 42

# Load the training split and shuffle its rows (seeded) so that the row order
# does not introduce bias.
train_data = pd.read_csv('../../data/ml_datasets/oversampling/train_set.csv')
train_data = train_data.sample(frac=1, random_state=RANDOM_STATE)

# Validation and test splits are loaded as they are.
val_data = pd.read_csv('../../data/ml_datasets/oversampling/val_set.csv')
testing_data = pd.read_csv('../../data/ml_datasets/oversampling/test_set.csv')

In [3]:
# Detach the target column from each split; pop() removes it from the frame.
train_label = train_data.pop('label')
val_label = val_data.pop('label')
test_label = testing_data.pop('label')

# The feature sets are aliases of the loaded frames (no copy is made).
train_set = train_data
val_set = val_data
test_set = testing_data

# Cast the four season columns to int in every split.
for split_frame in (train_set, val_set, test_set):
    for season_column in ('race_season%autumn',
                          'race_season%spring',
                          'race_season%summer',
                          'race_season%winter'):
        split_frame[season_column] = split_frame[season_column].astype(int)

N_JOBS = 4
USER = 'Jacopo'

In [4]:
# Scoring helpers used by the hyperparameter search
def f1_class_scorer(class_index):
    """Build a scorer that reports the F1 score of a single class.

    Parameters
    ----------
    class_index : int
        Position of the wanted class in the per-class array returned by
        ``f1_score(..., average=None)``.

    Returns
    -------
    The object produced by ``make_scorer`` for the single-class F1 function.
    """
    def score_function(y_true, y_pred):
        # Compute the F1 of every class, then keep only the requested one
        per_class_f1 = f1_score(y_true, y_pred, average=None)
        return per_class_f1[class_index]

    return make_scorer(score_function)

# Scorers for class 0 and class 1
f1_class_0 = f1_class_scorer(0)
f1_class_1 = f1_class_scorer(1)

# Metrics evaluated for every candidate configuration:
#   f1_macro -> macro F1 over both classes
#   f1_0     -> F1 of class 0 only
#   f1_1     -> F1 of class 1 only
scoring = dict(
    f1_macro='f1_macro',
    f1_0=f1_class_0,
    f1_1=f1_class_1,
)

In [5]:
# Global counter of the configurations reported so far (starts at 1)
i = 1


def func(*args):
    """Progress callback: print the running configuration number.

    Any positional arguments are accepted and ignored. Each call prints the
    current value of the global counter ``i`` and then increments it.
    """
    global i
    current = i
    print(f'Configurazione: {current}')
    i = current + 1

In [None]:
# Number of feature columns. .shape works for DataFrames and ndarrays alike,
# so this line also succeeds when the cell is executed a second time.
N_FEATURES = train_set.shape[1]

# Convert features and labels to NumPy arrays. np.asarray returns an existing
# ndarray unchanged, which makes the cell safe to re-run (calling .to_numpy()
# on an already converted array would raise AttributeError).
train_set = np.asarray(train_set)
train_label = np.asarray(train_label)

val_set = np.asarray(val_set)
val_label = np.asarray(val_label)

# Fold index for PredefinedSplit: -1 marks the training rows, 0 marks the
# validation rows. Integer dtype is explicit because these are fold indices.
split_index = np.concatenate([
    np.full(len(train_set), -1, dtype=int),  # -1 for training
    np.zeros(len(val_set), dtype=int)        # 0 for validation
])

# Stack training and validation data in the same order as split_index
X_combined = np.vstack((train_set, val_set))
y_combined = np.concatenate((train_label, val_label))

ps = PredefinedSplit(test_fold=split_index)

### Decision Tree

Decision Tree:
- Class Weight: NaN
- criterion: entropy, gini
- max_depth: 8-12
- max_features: 11 and above
- min_samples_leaf: 5-70
- min_samples_split: 10-50

In [None]:
# Search space for the decision tree. The "classifier__" prefix addresses the
# pipeline step named 'classifier' defined below.
param_dist = {"classifier__max_depth": Integer(8, 20),
              "classifier__max_features": Integer(11, N_FEATURES),
              "classifier__min_samples_split": Integer(10, 50),
              "classifier__min_samples_leaf": Integer(5, 70),
              "classifier__criterion": Categorical(['gini', 'entropy'])}
# number of parameter settings evaluated by the Bayesian search
n_iter_search = 100
# Define the model. The tree is wrapped in a one-step pipeline so that the
# "classifier__*" parameter names resolve: calling set_params with such names
# on a bare DecisionTreeClassifier raises "Invalid parameter 'classifier'".
# It also keeps the 'param_classifier__*' column naming in cv_results_.
clf = Pipeline([('classifier', tree.DecisionTreeClassifier(random_state=RANDOM_STATE))])
# Define the Bayesian search; seeded so that repeated runs are reproducible
rand_search = BayesSearchCV(clf, search_spaces=param_dist, 
                            n_iter=n_iter_search, 
                            n_jobs=N_JOBS, 
                            scoring=scoring,
                            refit='f1_macro',
                            cv=ps,
                            random_state=RANDOM_STATE)
# Run the search on the predefined train/validation split; func prints the progress
rand_search.fit(X_combined, y_combined, callback=func);

In [None]:
# Collect the search results, best macro-F1 rank first
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')

# Persist the full result table for this user
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_decision_tree_results_bayes.csv', index=False)

# Show the scores of the ten best configurations
score_columns = ['mean_test_f1_macro', 'std_test_f1_macro', 'mean_test_f1_1', 'std_test_f1_1', 'mean_test_f1_0', 'std_test_f1_0']
df.head(n=10)[score_columns]