# Classification: Bayesian Search

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [16]:
# Imports for reviewing the previous random-search results and running the Bayesian hyper-parameter search
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import random
import numpy as np

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn import tree
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
import tensorflow as tf

import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from statistics import mean, stdev
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import loguniform as sp_loguniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score


## Parameter Selection

In [12]:
# Merge the random-search results produced separately by the two users
# into a single CSV per model.
USER_1 = 'Jacopo'
USER_2 = 'Simone'

models = ['ada_boost', 'nn', 'xgb', 'naive_bayes', 'random_forest', 'decision_tree', 'svm', 'rule_based', 'knn']

for model in models:
    # One results file per user for the current model.
    source_paths = (
        f'../../data/ml_datasets/undersampling/model_selection/{USER_1}_{model}_results.csv',
        f'../../data/ml_datasets/undersampling/model_selection/{USER_2}_{model}_results.csv',
    )
    merged_path = f'../../data/ml_datasets/undersampling/model_selection/{model}_results.csv'

    # Stack the rows of both files (fresh 0..n-1 index) and save them as a new CSV.
    merged_results = pd.concat([pd.read_csv(path) for path in source_paths], ignore_index=True)
    merged_results.to_csv(merged_path, index=False)

In [2]:
# Metric columns shown when comparing models: mean/std across the CV folds
# of the micro-averaged F1 and of the F1 of class 1 and class 0.
columns_to_see = [
    'mean_test_f1_micro', 'std_test_f1_micro',
    'mean_test_f1_1', 'std_test_f1_1',
    'mean_test_f1_0', 'std_test_f1_0',
]

In [11]:
# df = pd.read_csv('../../data/ml_datasets/undersampling/model_selection/ada_boost_results.csv')
# df.sort_values(by='mean_test_f1_micro', ascending=False, inplace=True)
# params= [col for col in df.columns if col.startswith("param_classifier__")]
# df.head(n=10)[columns_to_see+params]

In [3]:
models = ['ada_boost', 'nn', 'xgb', 'naive_bayes', 'random_forest', 'decision_tree', 'svm', 'rule_based', 'knn']

df_results = pd.read_csv('../../data/ml_datasets/undersampling/model_selection/nn_results.csv')
df_results = df_results.rename(columns={'mean_f1_micro': 'mean_test_f1_micro', 
                                        'std_f1_micro': 'std_test_f1_micro',
                                        'mean_f1_1': 'mean_test_f1_1',
                                        'std_f1_1': 'std_test_f1_1',
                                        'mean_f1_0': 'mean_test_f1_0',
                                        'std_f1_0': 'std_test_f1_0'})
print(df_results.columns)
df_results = df_results[columns_to_see]
df_results['model'] = 'nn'
models.remove('nn')

columns_to_see = ['model'] + columns_to_see
for model in models:
    path = f'../../data/ml_datasets/undersampling/model_selection/{model}_results.csv'

    df = pd.read_csv(path)
    df['model'] = model
    df['mean_test_f1_macro'] = (df['mean_test_f1_1'] + df['mean_test_f1_0']) / 2
    df.sort_values(by='mean_test_f1_macro', ascending=False, inplace=True)
    df = df.head(10)
    df = df[columns_to_see + ['mean_test_f1_macro']]

    df_results = pd.concat([df_results, df], axis=0)

Index(['batch_size', 'epochs', 'units_layer1', 'units_layer2', 'drop_rate',
       'learning_rate', 'mean_test_f1_micro', 'std_test_f1_micro',
       'mean_test_f1_0', 'std_test_f1_0', 'mean_test_f1_1', 'std_test_f1_1'],
      dtype='object')


In [7]:
# Rank every collected configuration by macro F1 (best first) and show the first 40.
df_results = df_results.sort_values(by='mean_test_f1_macro', ascending=False)
df_results.head(40)

Unnamed: 0,mean_test_f1_micro,std_test_f1_micro,mean_test_f1_1,std_test_f1_1,mean_test_f1_0,std_test_f1_0,model,mean_test_f1_macro
0,0.725322,0.002026,0.419911,0.003151,0.820059,0.001453,random_forest,0.619985
50,0.72499,0.002114,0.419359,0.003605,0.819827,0.001481,random_forest,0.619593
51,0.722791,0.002282,0.417092,0.003846,0.818156,0.001593,random_forest,0.617624
1,0.722686,0.002104,0.417035,0.003778,0.818071,0.001451,random_forest,0.617553
52,0.722207,0.002004,0.415785,0.003599,0.817781,0.00138,random_forest,0.616783
2,0.722276,0.002406,0.41565,0.003703,0.817853,0.001732,random_forest,0.616752
53,0.721899,0.001767,0.414652,0.003378,0.817626,0.00122,random_forest,0.616139
3,0.719406,0.001844,0.411937,0.003243,0.815744,0.001277,random_forest,0.613841
4,0.718787,0.001393,0.411841,0.002804,0.815219,0.000973,random_forest,0.61353
56,0.718341,0.001925,0.41194,0.003375,0.814824,0.001332,random_forest,0.613382


Winner models for UnderSampling:
- Random Forests
- XGB
- Decision Tree
- Rule-Based

In [10]:
# Rule-based random-search results: add the mean of the two per-class F1 means
# and list the ten best configurations.
df = (
    pd.read_csv('../../data/ml_datasets/undersampling/model_selection/rule_based_results.csv')
    .assign(mean_test_f1_macro=lambda res: (res['mean_test_f1_1'] + res['mean_test_f1_0']) / 2)
    .sort_values(by='mean_test_f1_macro', ascending=False)
)
df.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__k,param_classifier__prune_size,params,split0_test_f1_micro,split1_test_f1_micro,split2_test_f1_micro,...,rank_test_f1_0,split0_test_f1_1,split1_test_f1_1,split2_test_f1_1,split3_test_f1_1,split4_test_f1_1,mean_test_f1_1,std_test_f1_1,rank_test_f1_1,mean_test_f1_macro
4,29.65058,0.959798,0.535664,0.009432,2,0.215088,"{'classifier__k': 2, 'classifier__prune_size':...",0.776664,0.817019,0.764813,...,5,0.326869,0.308588,0.331831,0.305433,0.306434,0.315831,0.011196,16,0.598528
5,29.549836,1.029161,0.539264,0.007292,2,0.417199,"{'classifier__k': 2, 'classifier__prune_size':...",0.814475,0.817019,0.764813,...,6,0.297786,0.308588,0.331831,0.336566,0.306434,0.316241,0.015176,15,0.59779
21,23.981485,1.940627,0.447366,0.011247,2,0.323858,"{'classifier__k': 2, 'classifier__prune_size':...",0.776664,0.767297,0.764813,...,2,0.326869,0.33945,0.331831,0.336566,0.306434,0.32823,0.011707,16,0.597506
3,22.038901,0.575556,0.534869,0.014071,1,0.313911,"{'classifier__k': 1, 'classifier__prune_size':...",0.814475,0.817019,0.846032,...,4,0.297786,0.308588,0.205528,0.305433,0.306434,0.284754,0.03978,17,0.591436
6,33.157996,2.206756,0.531289,0.029043,3,0.27926,"{'classifier__k': 3, 'classifier__prune_size':...",0.776664,0.817019,0.815044,...,7,0.326869,0.308588,0.298507,0.333602,0.334503,0.320414,0.014379,14,0.587756
22,27.272839,2.364907,0.448736,0.027963,3,0.18867,"{'classifier__k': 3, 'classifier__prune_size':...",0.703211,0.817019,0.764813,...,3,0.328924,0.308588,0.331831,0.336566,0.334503,0.328082,0.010079,17,0.587054
23,32.916554,3.984933,0.430428,0.022475,9,0.123937,"{'classifier__k': 9, 'classifier__prune_size':...",0.776664,0.698962,0.764813,...,4,0.326869,0.34116,0.331831,0.335825,0.33075,0.333287,0.004861,10,0.585346
1,22.768717,0.640888,0.536343,0.008657,1,0.294978,"{'classifier__k': 1, 'classifier__prune_size':...",0.814475,0.817019,0.815044,...,2,0.297786,0.308588,0.298507,0.217252,0.20925,0.266277,0.043537,18,0.584265
20,18.636901,1.071154,0.459237,0.015773,1,0.260647,"{'classifier__k': 1, 'classifier__prune_size':...",0.814475,0.847154,0.815044,...,1,0.297786,0.221358,0.298507,0.305433,0.20925,0.266467,0.042034,20,0.584189
2,22.040209,2.100988,0.518756,0.013818,1,0.184002,"{'classifier__k': 1, 'classifier__prune_size':...",0.814475,0.84696,0.815044,...,3,0.297786,0.212625,0.298507,0.305433,0.20925,0.26472,0.044007,19,0.583314


Rule Based:
- k: 1,2,3
- prune_size: 0.2-0.5

In [14]:
# Decision Tree random-search results: add the mean of the two per-class F1 means,
# hide the timing columns and list the ten best configurations.
timing_columns = ['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time']
df = (
    pd.read_csv('../../data/ml_datasets/undersampling/model_selection/decision_tree_results.csv')
    .assign(mean_test_f1_macro=lambda res: (res['mean_test_f1_1'] + res['mean_test_f1_0']) / 2)
    .sort_values(by='mean_test_f1_macro', ascending=False)
    .drop(columns=timing_columns)
)
df.head(10)

Unnamed: 0,param_classifier__class_weight,param_classifier__criterion,param_classifier__max_depth,param_classifier__max_features,param_classifier__min_samples_leaf,param_classifier__min_samples_split,params,split0_test_f1_micro,split1_test_f1_micro,split2_test_f1_micro,...,rank_test_f1_0,split0_test_f1_1,split1_test_f1_1,split2_test_f1_1,split3_test_f1_1,split4_test_f1_1,mean_test_f1_1,std_test_f1_1,rank_test_f1_1,mean_test_f1_macro
152,,entropy,10.0,14,10,20,"{'classifier__class_weight': None, 'classifier...",0.681814,0.732374,0.694996,...,3,0.372028,0.392995,0.374839,0.372937,0.378581,0.378276,0.007695,1,0.590726
5,,entropy,10.0,15,10,20,"{'classifier__class_weight': None, 'classifier...",0.694592,0.704019,0.694293,...,7,0.375256,0.382018,0.374874,0.373493,0.374102,0.375949,0.003095,6,0.5872
7,,gini,10.0,12,30,100,"{'classifier__class_weight': None, 'classifier...",0.693006,0.681694,0.704647,...,10,0.370509,0.374232,0.379959,0.378871,0.37813,0.37634,0.003497,4,0.587063
153,,entropy,10.0,12,50,100,"{'classifier__class_weight': None, 'classifier...",0.698333,0.710842,0.690941,...,5,0.37611,0.383749,0.373882,0.372105,0.374731,0.376116,0.004031,3,0.58703
6,,entropy,10.0,11,10,30,"{'classifier__class_weight': None, 'classifier...",0.696972,0.693246,0.681186,...,8,0.374514,0.377569,0.368213,0.384972,0.373377,0.375729,0.005519,7,0.586801
156,,entropy,10.0,11,50,30,"{'classifier__class_weight': None, 'classifier...",0.690388,0.69745,0.682847,...,7,0.373843,0.380818,0.37059,0.373712,0.378422,0.375477,0.003657,5,0.585963
154,,gini,10.0,6,100,30,"{'classifier__class_weight': None, 'classifier...",0.70969,0.693859,0.688338,...,4,0.376342,0.375839,0.369907,0.367353,0.378875,0.373663,0.004313,13,0.585916
19,,gini,15.0,8,100,50,"{'classifier__class_weight': None, 'classifier...",0.683834,0.685555,0.682353,...,22,0.374556,0.376724,0.37479,0.381558,0.382572,0.37804,0.003387,1,0.58562
10,,entropy,10.0,11,50,20,"{'classifier__class_weight': None, 'classifier...",0.706338,0.694637,0.681216,...,11,0.376873,0.381388,0.370289,0.377881,0.369842,0.375255,0.004496,10,0.585538
25,,gini,,10,100,20,"{'classifier__class_weight': None, 'classifier...",0.679929,0.68768,0.690523,...,34,0.371352,0.379721,0.374664,0.378293,0.382666,0.377339,0.003949,2,0.584799


Decision Tree:
- class_weight: None (shown as NaN in the table)
- criterion: entropy, gini
- max_depth: 8-12
- max_features: 11 and above
- min_samples_leaf: 5-70
- min_samples_split: 10-50

In [12]:
# XGBoost random-search results: add the mean of the two per-class F1 means
# and list the ten best configurations.
df = (
    pd.read_csv('../../data/ml_datasets/undersampling/model_selection/xgb_results.csv')
    .assign(mean_test_f1_macro=lambda res: (res['mean_test_f1_1'] + res['mean_test_f1_0']) / 2)
    .sort_values(by='mean_test_f1_macro', ascending=False)
)
df.head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_f1_micro,split1_test_f1_micro,...,rank_test_f1_0,split0_test_f1_1,split1_test_f1_1,split2_test_f1_1,split3_test_f1_1,split4_test_f1_1,mean_test_f1_1,std_test_f1_1,rank_test_f1_1,mean_test_f1_macro
1,0.548884,0.045819,0.095229,0.010362,0.1,5,100,"{'classifier__learning_rate': 0.1, 'classifier...",0.700952,0.70562,...,2,0.394914,0.402297,0.39242,0.394662,0.4015,0.397159,0.003974,1,0.600317
61,0.424271,0.014148,0.063366,0.001371,0.1,5,100,"{'classifier__learning_rate': 0.1, 'classifier...",0.700952,0.70562,...,2,0.394914,0.402297,0.39242,0.394662,0.4015,0.397159,0.003974,1,0.600317
60,0.315308,0.015292,0.044142,0.004323,1.0,3,100,"{'classifier__learning_rate': 1, 'classifier__...",0.702702,0.706877,...,1,0.393257,0.399963,0.392393,0.393311,0.397178,0.395221,0.002892,2,0.600007
0,0.405435,0.023237,0.07011,0.006422,1.0,3,100,"{'classifier__learning_rate': 1, 'classifier__...",0.702702,0.706877,...,1,0.393257,0.399963,0.392393,0.393311,0.397178,0.395221,0.002892,2,0.600007
3,0.48148,0.016178,0.081959,0.01272,1.0,4,100,"{'classifier__learning_rate': 1, 'classifier__...",0.70176,0.704199,...,10,0.392391,0.399575,0.391826,0.393311,0.396355,0.394692,0.002899,3,0.598546
63,0.349926,0.013094,0.055526,0.003796,1.0,4,100,"{'classifier__learning_rate': 1, 'classifier__...",0.70176,0.704199,...,10,0.392391,0.399575,0.391826,0.393311,0.396355,0.394692,0.002899,3,0.598546
62,0.232823,0.010379,0.034679,0.003098,1.0,3,50,"{'classifier__learning_rate': 1, 'classifier__...",0.697106,0.707431,...,8,0.389296,0.399349,0.390053,0.387777,0.395708,0.392437,0.004376,7,0.597755
2,0.288855,0.012391,0.057673,0.0053,1.0,3,50,"{'classifier__learning_rate': 1, 'classifier__...",0.697106,0.707431,...,8,0.389296,0.399349,0.390053,0.387777,0.395708,0.392437,0.004376,7,0.597755
4,0.554791,0.021151,0.089254,0.009682,1.0,5,100,"{'classifier__learning_rate': 1, 'classifier__...",0.700937,0.704019,...,9,0.390101,0.394354,0.390263,0.393883,0.394702,0.392661,0.002041,6,0.597543
64,0.416122,0.008778,0.070775,0.011573,1.0,5,100,"{'classifier__learning_rate': 1, 'classifier__...",0.700937,0.704019,...,9,0.390101,0.394354,0.390263,0.393883,0.394702,0.392661,0.002041,6,0.597543


xgb:
- lr: 0.1-1
- max_depth: 3-8
- n_estimators: 75, 100, 150, 200

In [15]:
# Random Forest random-search results: add the mean of the two per-class F1 means,
# hide the timing columns and list the ten best configurations.
timing_columns = ['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time']
df = (
    pd.read_csv('../../data/ml_datasets/undersampling/model_selection/random_forest_results.csv')
    .assign(mean_test_f1_macro=lambda res: (res['mean_test_f1_1'] + res['mean_test_f1_0']) / 2)
    .sort_values(by='mean_test_f1_macro', ascending=False)
    .drop(columns=timing_columns)
)
df.head(10)

Unnamed: 0,param_classifier__class_weight,param_classifier__criterion,param_classifier__max_depth,param_classifier__max_features,param_classifier__min_samples_leaf,param_classifier__min_samples_split,param_classifier__n_estimators,params,split0_test_f1_micro,split1_test_f1_micro,...,rank_test_f1_0,split0_test_f1_1,split1_test_f1_1,split2_test_f1_1,split3_test_f1_1,split4_test_f1_1,mean_test_f1_1,std_test_f1_1,rank_test_f1_1,mean_test_f1_macro
0,,gini,20.0,9,10,20,50,"{'classifier__class_weight': None, 'classifier...",0.722214,0.727451,...,1,0.41647,0.423594,0.415902,0.421063,0.422525,0.419911,0.003151,1,0.619985
50,,gini,,15,10,20,50,"{'classifier__class_weight': None, 'classifier...",0.722723,0.728379,...,1,0.417026,0.42512,0.414789,0.41838,0.421482,0.419359,0.003605,1,0.619593
51,,entropy,20.0,13,10,50,100,"{'classifier__class_weight': None, 'classifier...",0.720359,0.726314,...,2,0.412942,0.422578,0.414019,0.415162,0.420761,0.417092,0.003846,2,0.617624
1,,entropy,20.0,14,10,50,100,"{'classifier__class_weight': None, 'classifier...",0.720373,0.726329,...,2,0.413434,0.423647,0.414025,0.415358,0.41871,0.417035,0.003778,2,0.617553
52,,gini,,15,10,50,100,"{'classifier__class_weight': None, 'classifier...",0.719999,0.725685,...,3,0.411892,0.421508,0.412524,0.414891,0.418112,0.415785,0.003599,3,0.616783
2,,gini,20.0,4,10,50,100,"{'classifier__class_weight': None, 'classifier...",0.719027,0.72588,...,3,0.412012,0.421023,0.411833,0.414488,0.418897,0.41565,0.003703,3,0.616752
53,,gini,20.0,4,10,50,50,"{'classifier__class_weight': None, 'classifier...",0.720418,0.724922,...,4,0.41244,0.420356,0.411584,0.412188,0.41669,0.414652,0.003378,4,0.616139
3,,entropy,,5,10,100,150,"{'classifier__class_weight': None, 'classifier...",0.718219,0.722693,...,4,0.410431,0.41733,0.407887,0.410466,0.413573,0.411937,0.003243,5,0.613841
4,,entropy,,6,30,20,150,"{'classifier__class_weight': None, 'classifier...",0.716947,0.720822,...,5,0.410189,0.415952,0.408287,0.41059,0.414185,0.411841,0.002804,6,0.61353
56,,entropy,20.0,12,30,50,100,"{'classifier__class_weight': None, 'classifier...",0.716274,0.721226,...,7,0.408768,0.417144,0.408451,0.410856,0.414481,0.41194,0.003375,5,0.613382


Random Forest:
- class_weight: None (shown as NaN in the table)
- criterion: entropy, gini
- max_depth: None
- max_features: 4 and above
- min_samples_leaf: 5-30
- min_samples_split: 10-50
- n_estimators: 50, 75, 100, 150

## Bayesian Search

In [17]:
RANDOM_STATE = 42

# Load the development set and shuffle its rows once, with a fixed seed,
# so that the row order does not introduce bias.
dev_data = pd.read_csv('../../data/ml_datasets/undersampling/dev_set.csv')
dev_data = dev_data.sample(frac=1, random_state=RANDOM_STATE)

# The test set is loaded as-is.
testing_data = pd.read_csv('../../data/ml_datasets/undersampling/test_set.csv')

In [18]:
# Split the target column from the features (pop removes it from the frames).
dev_label = dev_data.pop('label')
test_label = testing_data.pop('label')

# `dev_set` / `test_set` are aliases of the loaded frames, not copies.
dev_set = dev_data
test_set = testing_data

# Cast the race-season indicator columns to int in both sets.
season_columns = ['race_season%autumn', 'race_season%spring', 'race_season%summer', 'race_season%winter']
for feature_frame in (dev_set, test_set):
    for season_column in season_columns:
        feature_frame[season_column] = feature_frame[season_column].astype(int)

# Shared configuration for the searches below.
N_JOBS = 8
USER = 'Jacopo'
RUS = RandomUnderSampler(random_state=RANDOM_STATE)
SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [19]:
# Scorers reporting the F1 of one single class.
def f1_class_scorer(class_index):
    """Build a scorer returning the F1 of one class.

    Parameters
    ----------
    class_index : int
        Position of the wanted class in the array returned by
        ``f1_score(y_true, y_pred, average=None)``.

    Returns
    -------
    The scorer created by ``make_scorer`` around the per-class F1 function.
    """
    def _single_class_f1(y_true, y_pred):
        # F1 of every class, then pick the requested one.
        per_class_f1 = f1_score(y_true, y_pred, average=None)
        return per_class_f1[class_index]

    return make_scorer(_single_class_f1)


# One scorer for class 0 and one for class 1.
f1_class_0 = f1_class_scorer(0)
f1_class_1 = f1_class_scorer(1)

# Multi-metric scoring dictionary. The BayesSearchCV calls in this notebook pass
# scoring='f1_macro' directly and do not use this dictionary.
scoring = {
    'f1_micro': 'f1_micro',  # micro-averaged F1 over both classes
    'f1_0': f1_class_0,      # F1 of class 0 only
    'f1_1': f1_class_1,      # F1 of class 1 only
}

### Decision Tree

Decision Tree:
- class_weight: None (shown as NaN in the table)
- criterion: entropy, gini
- max_depth: 8-12
- max_features: 11 and above
- min_samples_leaf: 5-70
- min_samples_split: 10-50

In [23]:
# Bayesian search for the Decision Tree, restricted to the ranges selected
# from the random-search results.
n_features = dev_set.shape[1]

# skopt's Integer bounds are inclusive on both ends: the previous upper bound
# `n_features + 1` allowed max_features to exceed the number of available features.
param_dist = {"classifier__max_depth": Integer(8, 12),
              "classifier__max_features": Integer(11, n_features),
              "classifier__min_samples_split": Integer(10, 50),
              "classifier__min_samples_leaf": Integer(5, 70),
              "classifier__criterion": Categorical(['gini', 'entropy'])}

# Number of configurations evaluated by the search.
# NOTE(review): the original comment here read "Total-Iteration: 400" while the value is 1000 - confirm the intended budget.
n_iter_search = 1000

# Seeded classifier so that the random feature sub-sampling is reproducible.
clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)

pipeline = ImbPipeline([
    ('undersampler', RUS),  # under-sampling step
    ('classifier', clf)     # the classifier being tuned
], verbose=False)

# Define the Bayesian search (seeded for reproducibility). The variable keeps the
# name `rand_search` because the export cell below reads `rand_search.cv_results_`.
rand_search = BayesSearchCV(pipeline, search_spaces=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            refit=False,
                            cv=SKF,
                            random_state=RANDOM_STATE)
# Run the search
rand_search.fit(dev_set, dev_label);

In [27]:
# Store the Decision Tree search results, best-ranked configuration first.
decision_tree_results_path = f'../../data/ml_datasets/undersampling/model_selection/{USER}_decision_tree_results_bayes.csv'
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.to_csv(decision_tree_results_path, index=False)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__max_depth,param_classifier__max_features,param_classifier__min_samples_leaf,param_classifier__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.997583,0.013231,0.022258,0.001298,gini,10,13,38,25,"{'classifier__criterion': 'gini', 'classifier_...",0.574252,0.590915,0.59292,0.592638,0.588377,0.58782,0.006974,1
1,0.909312,0.042417,0.022073,0.001366,gini,8,16,30,50,"{'classifier__criterion': 'gini', 'classifier_...",0.575936,0.57841,0.595357,0.590289,0.592046,0.586408,0.007753,2
6,0.826132,0.020258,0.018859,0.000698,gini,8,13,59,12,"{'classifier__criterion': 'gini', 'classifier_...",0.581152,0.57145,0.587806,0.596687,0.592311,0.585881,0.008859,3
3,0.980756,0.010286,0.025006,0.002709,gini,9,13,24,48,"{'classifier__criterion': 'gini', 'classifier_...",0.589508,0.605919,0.586973,0.565227,0.580786,0.585683,0.013177,4
2,0.994999,0.029426,0.021668,0.001112,gini,11,12,13,29,"{'classifier__criterion': 'gini', 'classifier_...",0.583213,0.593568,0.577694,0.582599,0.585532,0.584521,0.005194,5


### XGBoost

In [None]:
# This section used to contain only the export below, with no XGBoost search before it:
# `rand_search` then still held the results of the previously fitted search, which
# would have been written to the xgb results file. The search is now defined here,
# using the ranges chosen from the random-search results
# (learning rate 0.1-1, max_depth 3-8, n_estimators 75/100/150/200).
param_dist = {
    'classifier__learning_rate': Real(0.1, 1.0),
    'classifier__max_depth': Integer(3, 8),
    'classifier__n_estimators': Categorical([75, 100, 150, 200])
}

# Number of configurations evaluated by the search.
# NOTE(review): assumed budget, the notebook does not state one for XGBoost - confirm.
n_iter_search = 50

clf = XGBClassifier(random_state=RANDOM_STATE)

pipeline = ImbPipeline([
    ('undersampler', RUS),  # under-sampling step
    ('classifier', clf)     # the classifier being tuned
], verbose=False)

# Define the Bayesian search (seeded for reproducibility)
rand_search = BayesSearchCV(pipeline, search_spaces=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            refit=False,
                            cv=SKF,
                            random_state=RANDOM_STATE)
# Run the search
rand_search.fit(dev_set, dev_label);

# Store the results, best-ranked configuration first.
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.to_csv(f'../../data/ml_datasets/undersampling/model_selection/{USER}_xgb_results_bayes.csv', index=False)
df.head()

### Random Forest

In [None]:
# This section used to contain only the export below, with no Random Forest search before
# it: `rand_search` then still held the results of the previously fitted search, which
# would have been written to the random_forest results file. The search is now defined
# here, using the ranges chosen from the random-search results (class_weight and
# max_depth are left at None, their scikit-learn defaults).
param_dist = {
    'classifier__criterion': Categorical(['gini', 'entropy']),
    # skopt's Integer bounds are inclusive: the upper bound is the number of features.
    'classifier__max_features': Integer(4, dev_set.shape[1]),
    'classifier__min_samples_leaf': Integer(5, 30),
    'classifier__min_samples_split': Integer(10, 50),
    'classifier__n_estimators': Categorical([50, 75, 100, 150])
}

# Number of configurations evaluated by the search.
# NOTE(review): assumed budget, the notebook does not state one for Random Forest - confirm.
n_iter_search = 50

clf = RandomForestClassifier(random_state=RANDOM_STATE)

pipeline = ImbPipeline([
    ('undersampler', RUS),  # under-sampling step
    ('classifier', clf)     # the classifier being tuned
], verbose=False)

# Define the Bayesian search (seeded for reproducibility)
rand_search = BayesSearchCV(pipeline, search_spaces=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            refit=False,
                            cv=SKF,
                            random_state=RANDOM_STATE)
# Run the search
rand_search.fit(dev_set, dev_label);

# Store the results, best-ranked configuration first.
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.to_csv(f'../../data/ml_datasets/undersampling/model_selection/{USER}_random_forest_results_bayes.csv', index=False)
df.head()

### Rule-Based

Rule Based:
- k: 1,2,3
- prune_size: 0.2-0.5

In [None]:
# Bayesian search for the rule-based classifier (RIPPER).
param_dist = {
    # Upper bound raised from 0.4 to 0.5 to match the range selected from the
    # random-search results (0.2-0.5), whose second-best configuration used ~0.417.
    'classifier__prune_size': Real(0.2, 0.5),
    'classifier__k': Categorical([1, 2, 3])
}

# Number of configurations evaluated by the search
n_iter_search = 20

# Model definition (seeded for reproducibility)
clf = lw.RIPPER(
    max_rules=10,        # Moderate rule complexity
    max_rule_conds=7,    # Enough room for moderately complex conditions
    max_total_conds=35,  # Cap total conditions to avoid runaway complexity
    random_state=RANDOM_STATE
)

pipeline = ImbPipeline([
    ('undersampler', RUS),  # under-sampling step
    ('classifier', clf)     # the classifier being tuned
], verbose=False)

# Define the Bayesian search (seeded for reproducibility). The variable keeps the
# name `rand_search` because the export cell below reads `rand_search.cv_results_`.
rand_search = BayesSearchCV(pipeline, search_spaces=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=N_JOBS,
                            scoring='f1_macro',
                            refit=False,
                            cv=SKF,
                            random_state=RANDOM_STATE)
# Run the search
rand_search.fit(dev_set, dev_label);

In [None]:
# Store the rule-based search results, best-ranked configuration first.
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
# File name aligned with the `{USER}_<model>_results_bayes.csv` pattern used by the other
# export cells (it was `{USER}_rule_based_bayes.csv`), so that a per-model loop over the
# Bayesian results can find it.
df.to_csv(f'../../data/ml_datasets/undersampling/model_selection/{USER}_rule_based_results_bayes.csv', index=False)
df.head()