In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_val_predict,cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, make_scorer, confusion_matrix, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import optuna


In [82]:
filepath = r"C:\Users\lucap\OneDrive\Desktop\PhD\Racheal\Dati\dataset_clean.csv"
df = pd.read_csv(filepath, sep = ";" )

In [83]:
df.shape

(4504, 161)

In [84]:
diz_aliases_int_voto = {"Partito Democratico-PD":'Sx/CSx',
                "Partito Democratico":'Sx/CSx',
                "Lega con Salvini":'Dx/CDx',
                "Lega Nord":'Dx/CDx',
                "Lega":'Dx/CDx',
                "Forza Italia":'Dx/CDx',
                "Fratelli d'Italia":"Dx/CDx",
                'MoVimento 5 Stelle':'M5S',
                'Movimento 5 stelle':'M5S',
                'voterei  scheda bianca / annullerei la scheda':'astensione/bianca/nulla',
                'voterei scheda bianca / scheda nulla':'astensione/bianca/nulla',
                "+Europa":'Sx/CSx',
                "piu' Europa con Emma Bonino": "Sx/CSx",
                'Sinistra italiana (SEL + altri)':'Sx/CSx',
                'Potere al Popolo':'Sx/CSx',
                'Rifondazione Comunista':'Sx/CSx',
                "Fratelli d'Italia-Alleanza Nazionale&nbsp;":"Dx/CDx",
                'La Sinistra':'Sx/CSx',
                'Verdi':'Sx/CSx',
                'non andrei a votare':'astensione/bianca/nulla',
                'sono indeciso' : 'indecisi',
                'Liberi e Uguali' : 'Sx/CSx',
                'Italia dei Valori' : 'Sx/CSx',
                'Scelta Civica&nbsp;' : 'Sx/CSx',
                'Italia Unica di Corrado Passera' : 'Sx/CSx',
                "Noi con l'Italia UDC&nbsp;" : 'Dx/CDx',
                "altro partito di area di governo (SVP, Centro Democratico....)" : 'Altro partito',
                "Nuovo Centro Destra con UDC e PPI" : 'Dx/CDx'
                }

In [85]:
df['m_p_int_voto'] = df['m_p_int_voto'].replace(diz_aliases_int_voto)
df['m_p_int_voto'].value_counts()

m_p_int_voto
Sx/CSx                       1348
Dx/CDx                       1102
M5S                           869
indecisi                      695
astensione/bianca/nulla       274
preferisco non rispondere     150
Altro partito                  66
Name: count, dtype: int64

In [86]:
df = df[~df['m_p_int_voto'].isin(['Altro partito', 'preferisco non rispondere', 'indecisi','astensione/bianca/nulla' ])]



In [87]:
df['m_p_int_voto'].value_counts()


m_p_int_voto
Sx/CSx    1348
Dx/CDx    1102
M5S        869
Name: count, dtype: int64

In [88]:
diz_aliases_ac = {"del tutto d'accordo":2,
            "del tutto<BR>d'accordo":2,
		     "d'accordo":1,
		     "ne' d'accordo ne' in disaccordo (NON STIMOLARE)":0,
             "ne d'accordo ne' in disaccordo":0,
             "ne' d'accordo ne' in disaccordo":0,
             "preferisco non rispondere":0,
             "preferisco<BR>non<BR> rispondere":0,
            "non saprei":0,
             "in disaccordo":-1,
             'del tutto in disaccordo':-2,
             "del tutto disaccordo":-2,
             "del tutto<BR>in disaccordo":-2,
             
            
}

nomi_var = df.columns.tolist()
for var in nomi_var:
    if '_ac_' in var:
        df[var] = df[var].replace(diz_aliases_ac)
        
        
diz_aliases_op = {"molto":2,
            "poco":-1,
            "pocao":-1,
		     "abbastanza":1,
		     "non saprei":0,
            "nessuno":-2,
            "per niente":-2}

nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'op' in var:
        df[var] = df[var].replace(diz_aliases_op)
        

diz_aliases_trend = {"in trend":1,
                 "slight in trend":1,
                 "slight off trend":0,
                 "off trend":0,
                 'non classificati':0}
 
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'TREND' in var:
        df[var] = df[var].replace(diz_aliases_trend)
        
        
diz_aliases_scol = {"elementare/privo di titolo":1,
                 "media inferiore":1,
                 "diploma di maturita` (5 anni)":2,
                 "superiori in corso":2,
                 'diploma di istituto professionale (3 anni)':2,
                 'universita` in corso/nessuna laurea conseguita':4,
                 'laurea triennale di I livello':4,
                 'diploma universitario/laurea breve':4,
                 'laurea specialistica di II livello o laurea 4-5 anni':4,
                 'master/scuola di specializzazione post laurea':4,
                 'dottorato di ricerca':4
                 }
 
df['m_p_scolarita'] = df['m_p_scolarita'].replace(diz_aliases_scol)



diz_aliases_geo = {'meno di 5.000':1,
                 'da 5.001 a 10.000':1,
                 'da 10.001 a 30.000':2,
                 'da 30.001 a 100.000':2,
                 'da 100.001 a 250.000':3,
                 'piu` di 250.001':3
                 }
 
df['m_p_r_ampiezza6'] = df['m_p_r_ampiezza6'].replace(diz_aliases_geo)

diz_aliases_eco = {
                 "mi sento povero e non arrivo mai a fine mese":-1,
                 "avverto difficolta'":-1,
                 "arrivo a fine mese con molte difficolta'":-1,
                 "con tranquillita'":1,
                 'agiatamente':1
                 }
 
df['m_op_come_reddito_consenteDiVivere'] = df['m_op_come_reddito_consenteDiVivere'].replace(diz_aliases_eco)


df = pd.get_dummies(df, columns=['m_op_preferenza_lavorativa'], prefix='prf_lav',dtype=int)

#df = pd.get_dummies(df, columns=['m_p_zona_5istat'], prefix='zona')
df = pd.get_dummies(df, columns=['m_istat_reg'], prefix='reg',dtype=int)

diz_aliases_quot = {
                 0:0,
                 'preferisco non rispondere':0,
                 "si alcune volte alla settimana":1,
                 "si ogni giorno":1
                 }
 
df['m_p_lettura_quotidiani'] = df['m_p_lettura_quotidiani'].replace(diz_aliases_quot)

diz_aliases_freq_y = {'mai':0,
                     "piu' raramente":0,
                 "alcune volte l'anno":1,
                  "piu' volte alla settimana":2,
                 'mensile':1,
                 'settimanale':2}
 
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_p_' in var:
        df[var] = df[var].replace(diz_aliases_freq_y)
        
        
diz_aliases_freq = {'non saprei':0,
                     "non ascolto la radio":0,
                 "da piu' di un'ora fino a due ore":1,
                  "da piu' di tre ore fino a quattro ore":2,
                 'oltre quattro':2,
                 "da piu' di due ore fino a tre ore":2,
                 "meno di un'ora":1}
 
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_p_' in var:
        df[var] = df[var].replace(diz_aliases_freq)
        

diz_aliases_comp = {
                 0:0,
                 'inferiore':-1,
                 'uguale':0,
                 "maggiore":1,
                 'nan':0
                 }
 
df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'] = df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'].replace(diz_aliases_comp)


diz_aliases_situazEconomic = {
    'rimasta la stessa': 0,
    'peggiorata': -1,
    'non sa/non risponde': 0,
    'nan': 0,
    'migliorata': 1
}
df['m_op_situazEconomic_propria_ultimi10anni'] = df['m_op_situazEconomic_propria_ultimi10anni'].replace(diz_aliases_situazEconomic)


diz_aliases_futEconomic = {
    'peggiorare': -1,
    'rimanere la stessa': 0,
    'non sa/non risponde': 0,
    'nan': 0,
    'migliorare': 1
}
df['m_op_situazEconomic_futura'] = df['m_op_situazEconomic_futura'].replace(diz_aliases_futEconomic)


diz_aliases_abo = {
    'una cattiva legge': -1,
    'una legge buona ma che va cambiata': 1,
    'una buona legge': 1,
    'non saprei/preferisco non rispondere': 0,
    'nan': 0
}

df['m_op_legge_aborto_1987'] = df['m_op_legge_aborto_1987'].replace(diz_aliases_abo)


diz_aliases_eut = {
    'no': -1,
    'si': 2,
    'non sa/non risponde': 0,
    'dipende dalle condizioni':1
}

df['m_op_favore_eutanasia_a_determinate_condizioni'] = df['m_op_favore_eutanasia_a_determinate_condizioni'].replace(diz_aliases_eut)


diz_aliases_identita = {
    "piu' europeo che italiano": 2,
    'sia italiano che europeo': 1,
    'solo italiano': -2,
    "piu' italiano che europeo": -1,
    'non sa - non risponde': 0,
    'solo europeo': 2,
    'nan': 0
}
df['m_op_sentimento_italianoVSeuropeo'] = df['m_op_sentimento_italianoVSeuropeo'].replace(diz_aliases_identita)


diz_aliases_modernizzazione = {
    'si sta modernizzando': 1,
    'sta regredendo': -1,
    'non sa/non risponde': 0,
    'nan': 0
}
df['m_modernizz_vs_regress_Paese'] = df['m_modernizz_vs_regress_Paese'].replace(diz_aliases_modernizzazione)


diz_aliases_vantaggi = {
    "piu' vantaggi": 1,
    'vantaggi e svantaggi in egual misura': 0,
    "piu' svantaggi": -1,
    "ne' vantaggi ne' svantaggi": 0,
    'non sa/non risponde': 0,
    'nan': None
}
df['m_op_partecipazione_in_UE'] = df['m_op_partecipazione_in_UE'].replace(diz_aliases_vantaggi)


diz_aliases_livello = {
    'bassa': -1,
    'media': 1,
    'alta': 2,
    0: 0,
    'nan': None
}
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_op_attenzione_' in var:
        df[var] = df[var].replace(diz_aliases_livello)
        
        
diz_aliases_andamento = {
    'rimasta uguale': 0,
    'peggiorata': -1,
    'migliorata': 1,
    0: 0,
    'nan': None
}
df['m_op_scuolaPubblica_ultimiAnni'] = df['m_op_scuolaPubblica_ultimiAnni'].replace(diz_aliases_andamento)


diz_aliases_quantita = {
    1: 1,
    'poca&nbsp;': -1,
    'molta': 2,
    'nessuna': -2,
    0: 0,
    'nan': None
}
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_op_fiducia_' in var:
        df[var] = df[var].replace(diz_aliases_quantita)
        
        
diz_aliases_postoStabile = {
    "d'accordo": 1,
    'in disaccordo': -1,
    "del tutto d'accordo": 2,
    'del tutto in disaccordo': -2,
    'preferisco non rispondere': 0,
    'nan': None
}
df['m_op_rinuncie_per_postoStabile'] = df['m_op_rinuncie_per_postoStabile'].replace(diz_aliases_postoStabile)


diz_aliases_pop = {
    'molti danni': 2,
    'nessun danno': 0,
    'qualche danno': 1,
    0: 0,
    'nan': None
}
df['m_op_danni_populismo_in_italia'] = df['m_op_danni_populismo_in_italia'].replace(diz_aliases_pop)

  df[var] = df[var].replace(diz_aliases_ac)
  df[var] = df[var].replace(diz_aliases_op)
  df[var] = df[var].replace(diz_aliases_trend)
  df['m_p_scolarita'] = df['m_p_scolarita'].replace(diz_aliases_scol)
  df['m_p_r_ampiezza6'] = df['m_p_r_ampiezza6'].replace(diz_aliases_geo)
  df['m_op_come_reddito_consenteDiVivere'] = df['m_op_come_reddito_consenteDiVivere'].replace(diz_aliases_eco)
  df[var] = df[var].replace(diz_aliases_freq_y)
  df[var] = df[var].replace(diz_aliases_freq)
  df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'] = df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'].replace(diz_aliases_comp)
  df['m_op_situazEconomic_propria_ultimi10anni'] = df['m_op_situazEconomic_propria_ultimi10anni'].replace(diz_aliases_situazEconomic)
  df['m_op_situazEconomic_futura'] = df['m_op_situazEconomic_futura'].replace(diz_aliases_futEconomic)
  df['m_op_legge_aborto_1987'] = df['m_op_legge_aborto_1987'].replace(diz_aliases_abo)
  df['m_op_favore_eutanasia_a_determinate_condizion

In [89]:
df.shape

(3319, 189)

In [90]:
df = df.fillna(0)

In [92]:
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace('à', 'a')
df.columns = df.columns.str.replace('[^a-zA-Z0-9_]', '').str.replace(' ', '_')
df.columns = df.columns.str.replace('[()\']+|,\s*', '', regex=True)

  df.columns = df.columns.str.replace('[()\']+|,\s*', '', regex=True)


# Classifiers

In [93]:
# provo a rimuovere le righe con i valori Nan poiché solo 285
df = df.dropna()

In [94]:
X = df.drop(columns = ['m_p_int_voto'])
y = df.m_p_int_voto

In [95]:
X_numerico = X.select_dtypes(include=['float64', 'int64'])

In [96]:
# 1) Creiamo le maschere booleane
mask_train = X["m_anno_indagine"].isin([2017, 2018])   # True per 2017–18
mask_test  = X["m_anno_indagine"] == 2019              # True per 2019

# 2) Applichiamo le maschere
X_train = X.loc[mask_train].copy()
X_test  = X.loc[mask_test].copy()
y_train = y.loc[mask_train].copy()
y_test  = y.loc[mask_test].copy()

# 3) (Opzionale) Rimuovere 'm_anno_indagine' dalle feature
X_train = X_train.drop(columns=["m_anno_indagine"])
X_test  = X_test.drop(columns=["m_anno_indagine"])

# 4) (Opzionale) Reindicizzare
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [97]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)     # usa SOLO il train!
y_test_enc  = le.transform(y_test)          # da usare nel test finale

N_CLASSES = len(le.classes_)                # servirà a XGBoost / LightGBM


In [98]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2164, 187), (1155, 187), (2164,), (1155,))

In [102]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scorer = make_scorer(f1_score, average='macro')   # macro-F1 adatto al multiclasse

In [103]:
def objective(trial):
    model_name = trial.suggest_categorical('model', ['lgbm', 'xgb', 'dt', 'rf'])
    
    # ---------------- LIGHTGBM ----------------
    if model_name == 'lgbm':
        model = lgb.LGBMClassifier(
            objective='multiclass',
            num_class=N_CLASSES,
            n_jobs=4,
            random_state=42,
            learning_rate=trial.suggest_float('lgb_lr', 0.01, 0.2, log=True),
            n_estimators=trial.suggest_int('lgb_est', 100, 1000),
            num_leaves=trial.suggest_int('lgb_leaves', 31, 255),
            feature_fraction=trial.suggest_float('lgb_ff', 0.6, 1.0),
            bagging_fraction=trial.suggest_float('lgb_bf', 0.5, 1.0),
            bagging_freq=trial.suggest_int('lgb_freq', 0, 5),
            reg_alpha=trial.suggest_float('lgb_alpha', 1e-8, 10.0, log=True),
            reg_lambda=trial.suggest_float('lgb_lambda', 1e-8, 10.0, log=True)
        )

    # ---------------- XGBOOST ------------------
    elif model_name == 'xgb':
        model = XGBClassifier(
            objective='multi:softprob',
            num_class=N_CLASSES,
            tree_method='hist',
            n_jobs=4,
            random_state=42,
            eval_metric='mlogloss',
            learning_rate=trial.suggest_float('xgb_lr', 0.01, 0.2, log=True),
            n_estimators=trial.suggest_int('xgb_est', 100, 1000),
            max_depth=trial.suggest_int('xgb_depth', 3, 12),
            min_child_weight=trial.suggest_float('xgb_mc', 1e-3, 10.0, log=True),
            subsample=trial.suggest_float('xgb_sub', 0.5, 1.0),
            colsample_bytree=trial.suggest_float('xgb_col', 0.5, 1.0),
            gamma=trial.suggest_float('xgb_gamma', 0.0, 5.0),
            reg_alpha=trial.suggest_float('xgb_alpha', 1e-8, 10.0, log=True),
            reg_lambda=trial.suggest_float('xgb_lambda', 1e-8, 10.0, log=True)
        )

    # ---------------- DECISION TREE -----------
    elif model_name == 'dt':
        model = DecisionTreeClassifier(
            random_state=42,
            criterion=trial.suggest_categorical('dt_crit', ['gini', 'entropy']),
            max_depth=trial.suggest_int('dt_depth', 2, 20),
            min_samples_split=trial.suggest_int('dt_split', 2, 20),
            min_samples_leaf=trial.suggest_int('dt_leaf', 1, 20)
        )

    # ---------------- RANDOM FOREST -----------
    else:  # 'rf'
        model = RandomForestClassifier(
            random_state=42,
            n_jobs=-1,
            n_estimators=trial.suggest_int('rf_estimators', 100, 1000),
            max_depth=trial.suggest_int('rf_depth', 5, 30),
            min_samples_split=trial.suggest_int('rf_split', 2, 20),
            min_samples_leaf=trial.suggest_int('rf_leaf', 1, 20),
            max_features=trial.suggest_categorical('rf_maxfeat',
                                                   ['sqrt', 'log2', None]),
            bootstrap=trial.suggest_categorical('rf_bootstrap',
                                                [True, False])
        )

    # ---- CV evaluation (macro-F1) ------------
    score = cross_val_score(model, X_train, y_train_enc,
                            cv=cv, scoring=scorer,
                            n_jobs=-1).mean()
    return score


In [106]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200, timeout=7200)   # ↑ trials se vuoi più esplorazione

print("Best macro-F1:", study.best_value)
print("Best params :", study.best_params)

[I 2025-07-24 16:48:42,458] A new study created in memory with name: no-name-0e7df849-aa5d-47e5-8ad2-1b76854d494c
[I 2025-07-24 16:49:11,336] Trial 0 finished with value: 0.6339363079138487 and parameters: {'model': 'lgbm', 'lgb_lr': 0.15447251587669514, 'lgb_est': 498, 'lgb_leaves': 131, 'lgb_ff': 0.9967782388148909, 'lgb_bf': 0.9844678729759267, 'lgb_freq': 3, 'lgb_alpha': 0.00017483369415221257, 'lgb_lambda': 5.707067700136496e-07}. Best is trial 0 with value: 0.6339363079138487.
[I 2025-07-24 16:49:55,607] Trial 1 finished with value: 0.6363449077875044 and parameters: {'model': 'lgbm', 'lgb_lr': 0.11332049602919164, 'lgb_est': 982, 'lgb_leaves': 251, 'lgb_ff': 0.6066160042906404, 'lgb_bf': 0.7850683607406344, 'lgb_freq': 5, 'lgb_alpha': 1.2807499533035894e-06, 'lgb_lambda': 0.010546114955456405}. Best is trial 1 with value: 0.6363449077875044.
[I 2025-07-24 16:49:57,089] Trial 2 finished with value: 0.6447342440105562 and parameters: {'model': 'xgb', 'xgb_lr': 0.08959006199962875,

Best macro-F1: 0.65869973165744
Best params : {'model': 'xgb', 'xgb_lr': 0.041861619819958854, 'xgb_est': 943, 'xgb_depth': 3, 'xgb_mc': 0.5056820696137888, 'xgb_sub': 0.5584747753574701, 'xgb_col': 0.7260036546135277, 'xgb_gamma': 2.6265768358914037, 'xgb_alpha': 0.0009304154943237876, 'xgb_lambda': 0.0015743977220746758}


In [107]:
best_trials = {}
for t in study.trials:                       # study è l'oggetto Optuna già ottimizzato
    mdl = t.params['model']
    if mdl not in best_trials or t.value > best_trials[mdl].value:
        best_trials[mdl] = t

print("Ho trovato:", list(best_trials.keys()))   # ['lgbm','xgb','rf','dt'] (in base ai tuoi risultati)


Ho trovato: ['lgbm', 'xgb', 'dt', 'rf']


In [108]:
best_trials

{'lgbm': FrozenTrial(number=42, state=1, values=[0.6492904390085987], datetime_start=datetime.datetime(2025, 7, 24, 17, 9, 45, 243990), datetime_complete=datetime.datetime(2025, 7, 24, 17, 9, 56, 963208), params={'model': 'lgbm', 'lgb_lr': 0.066954198416256, 'lgb_est': 250, 'lgb_leaves': 111, 'lgb_ff': 0.9409680053431105, 'lgb_bf': 0.5653057718207538, 'lgb_freq': 0, 'lgb_alpha': 0.8223034525325068, 'lgb_lambda': 1.4004064153182945}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'model': CategoricalDistribution(choices=('lgbm', 'xgb', 'dt', 'rf')), 'lgb_lr': FloatDistribution(high=0.2, log=True, low=0.01, step=None), 'lgb_est': IntDistribution(high=1000, log=False, low=100, step=1), 'lgb_leaves': IntDistribution(high=255, log=False, low=31, step=1), 'lgb_ff': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'lgb_bf': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'lgb_freq': IntDistribution(high=5, log=False, low=0, step=1), 'lgb_alpha': 

In [109]:
def build_estimator(mname, params, n_classes):
    if mname == 'lgbm':
        return lgb.LGBMClassifier(
            objective='multiclass', num_class=n_classes, n_jobs=4, random_state=42,
            learning_rate=params['lgb_lr'],
            n_estimators=params['lgb_est'],
            num_leaves=params['lgb_leaves'],
            feature_fraction=params['lgb_ff'],
            bagging_fraction=params['lgb_bf'],
            bagging_freq=params['lgb_freq'],
            reg_alpha=params['lgb_alpha'],
            reg_lambda=params['lgb_lambda']
        )
    if mname == 'xgb':
        return XGBClassifier(
            objective='multi:softprob', num_class=n_classes, n_jobs=4, random_state=42,
            eval_metric='mlogloss',
            learning_rate=params['xgb_lr'],
            n_estimators=params['xgb_est'],
            max_depth=params['xgb_depth'],
            min_child_weight=params['xgb_mc'],
            subsample=params['xgb_sub'],
            colsample_bytree=params['xgb_col'],
            gamma=params['xgb_gamma'],
            reg_alpha=params['xgb_alpha'],
            reg_lambda=params['xgb_lambda'],
            tree_method='hist'
        )
    if mname == 'rf':
        return RandomForestClassifier(
            random_state=42, n_jobs=-1,
            n_estimators=params['rf_estimators'],
            max_depth=params['rf_depth'],
            min_samples_split=params['rf_split'],
            min_samples_leaf=params['rf_leaf'],
            max_features=params['rf_maxfeat'],
            bootstrap=params['rf_bootstrap']
        )
    if mname == 'dt':
        return DecisionTreeClassifier(
            random_state=42,
            criterion=params['dt_crit'],
            max_depth=params['dt_depth'],
            min_samples_split=params['dt_split'],
            min_samples_leaf=params['dt_leaf']
        )



In [110]:
# ---------- 3. definisci lo scoring ----------
scoring = {
    'accuracy':  make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro', zero_division=0),
    'recall':    make_scorer(recall_score,    average='macro', zero_division=0),
    'f1':        make_scorer(f1_score,        average='macro', zero_division=0)
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# ---------- 4. loop e cross-validate ----------
cv_results = {}  
records = []
for mdl_name, trial in best_trials.items():
    est = build_estimator(mdl_name, trial.params, n_classes=len(le.classes_))
    cv_res = cross_validate(est, X_train, y_train_enc, cv=cv,
                            scoring=scoring, n_jobs=-1, return_train_score=False)
    
    cv_results[mdl_name.upper()] = cv_res   # <── salviamo!

    records.append({
        'Model': mdl_name.upper(),
        'Accuracy mean':  cv_res['test_accuracy'].mean(),
        'Accuracy SD':    cv_res['test_accuracy'].std(),
        'Precision mean': cv_res['test_precision'].mean(),
        'Precision SD':   cv_res['test_precision'].std(),
        'Recall mean':    cv_res['test_recall'].mean(),
        'Recall SD':      cv_res['test_recall'].std(),
        'F1 mean':        cv_res['test_f1'].mean(),
        'F1 SD':          cv_res['test_f1'].std()
    })

results_df = pd.DataFrame(records).set_index('Model').round(3)
display(results_df)   # in Jupyter / VS Code mostra la tabella


Unnamed: 0_level_0,Accuracy mean,Accuracy SD,Precision mean,Precision SD,Recall mean,Recall SD,F1 mean,F1 SD
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LGBM,0.669,0.04,0.656,0.041,0.651,0.042,0.649,0.042
XGB,0.678,0.028,0.664,0.029,0.659,0.028,0.659,0.028
DT,0.573,0.019,0.554,0.028,0.552,0.023,0.548,0.027
RF,0.647,0.022,0.632,0.023,0.623,0.022,0.617,0.024


In [111]:
# ---------- 1. prepara il contenitore risultati ----------
test_records = []

# ---------- 2. cicla sui modelli "vincitori" ----------
for mdl_name, trial in best_trials.items():
    
    # 2a. ricostruisci il modello con i parametri ottimali
    est = build_estimator(
        mdl_name,
        trial.params,
        n_classes=len(le.classes_)        # stesso LabelEncoder di prima
    )
    
    # 2b. allena su TUTTO il train (2017-2018)
    est.fit(X_train, y_train_enc)
    
    # 2c. predici sul test (2019)
    y_pred = est.predict(X_test)
    
    # 2d. calcola le metriche
    acc  = accuracy_score(y_test_enc, y_pred)
    prec = precision_score(y_test_enc, y_pred, average='macro', zero_division=0)
    rec  = recall_score(y_test_enc,  y_pred, average='macro', zero_division=0)
    f1   = f1_score(y_test_enc,      y_pred, average='macro', zero_division=0)
    
    test_records.append({
        'Model': mdl_name.upper(),
        'Accuracy':  acc,
        'Precision': prec,
        'Recall':    rec,
        'F1':        f1
    })
    
    # 2e. (opz.) stampa un report dettagliato
    print(f"\n=== {mdl_name.upper()} — Test 2019 ===")
    print(classification_report(
        y_test_enc, y_pred,
        target_names=le.classes_,
        zero_division=0
    ))
    # (opz.) cm = confusion_matrix(y_test_enc, y_pred)

# ---------- 3. tabella riassuntiva ----------
test_df = (
    pd.DataFrame(test_records)
      .set_index('Model')
      .round(3)
      .sort_values('F1', ascending=False)
)

display(test_df)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 779
[LightGBM] [Info] Number of data points in the train set: 2164, number of used features: 184
[LightGBM] [Info] Start training from score -1.096304
[LightGBM] [Info] Start training from score -1.316686
[LightGBM] [Info] Start training from score -0.921619

=== LGBM — Test 2019 ===
              precision    recall  f1-score   support

      Dx/CDx       0.71      0.70      0.71       379
         M5S       0.54      0.37      0.44       289
      Sx/CSx       0.72      0.86      0.78       487

    accuracy                           0.68      1155
   macro avg       0.66      0.64      0.64      1155
weighted avg       0.67      0.68      0.67      1155


=== XGB — Test 2019 ===
              precision    recall  f1-score   support


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGB,0.689,0.661,0.647,0.647
LGBM,0.685,0.655,0.643,0.642
RF,0.694,0.661,0.645,0.634
DT,0.663,0.623,0.624,0.619


In [112]:
# ---------------------------------------------------------
# 0.  Prerequisiti (già esistenti nel tuo notebook)
#     - best_trials        : dict con i parametri ottimali per ogni modello
#     - build_estimator()  : funzione che restituisce il modello dato il nome
#     - X_train, y_train_enc, X_test   : dati già pronti
# ---------------------------------------------------------

# Dizionario dove salveremo le predizioni sul test-set 2019
models_pred = {}

for mdl_name, trial in best_trials.items():
    # 1. ricostruisci il modello con i suoi parametri
    est = build_estimator(
        mdl_name, 
        trial.params, 
        n_classes=len(le.classes_)      # stesso LabelEncoder usato prima
    )

    # 2. allena sul 100 % del train (2017–2018)
    est.fit(X_train, y_train_enc)
    
    # 3. predici le etichette del test (2019) e salva
    models_pred[mdl_name.upper()] = est.predict(X_test)

# Ora hai:
y_pred_RF   = models_pred['RF']
y_pred_LGBM = models_pred['LGBM']
y_pred_XGB  = models_pred['XGB']
y_pred_DT   = models_pred['DT']

print({k: v.shape for k, v in models_pred.items()})
# ➜ {'RF': (1155,), 'LGBM': (1155,), 'XGB': (1155,), 'DT': (1155,)}


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001918 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 779
[LightGBM] [Info] Number of data points in the train set: 2164, number of used features: 184
[LightGBM] [Info] Start training from score -1.096304
[LightGBM] [Info] Start training from score -1.316686
[LightGBM] [Info] Start training from score -0.921619
{'LGBM': (1155,), 'XGB': (1155,), 'DT': (1155,), 'RF': (1155,)}


In [113]:
from scipy.stats import ttest_rel, wilcoxon

In [114]:
cv_res_LGBM = cv_results['LGBM']           # dict con tutte le metriche per fold
cv_res_RF = cv_results['RF']  
cv_res_XGB = cv_results['XGB']  
cv_res_DT = cv_results['DT']  

In [115]:
import itertools, numpy as np, pandas as pd
from scipy.stats import ttest_rel, wilcoxon
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)

# ──────────────────────────────────────────────────────────────
# 1.  F1 sulle 10 fold  →  t-test & Wilcoxon (tutte le coppie)
# ──────────────────────────────────────────────────────────────
cv_f1 = {
    'RF'  : cv_res_RF['test_f1'],
    'LGBM': cv_res_LGBM['test_f1'],
    'XGB' : cv_res_XGB['test_f1'],
    'DT'  : cv_res_DT['test_f1'],
}

rows_cv = []
for m1, m2 in itertools.combinations(cv_f1, 2):
    t_stat, p_t  = ttest_rel(cv_f1[m1], cv_f1[m2])
    w_stat, p_w  = wilcoxon(cv_f1[m1], cv_f1[m2])
    rows_cv.append({
        'Model 1': m1, 'Model 2': m2,
        'ΔF1 (mean)': np.mean(cv_f1[m1] - cv_f1[m2]),
        't-test p':   p_t,
        'Wilcoxon p': p_w,
    })

cv_tests_df = (pd.DataFrame(rows_cv)
               .sort_values('Wilcoxon p')
               .round(4))
display(cv_tests_df)

# ──────────────────────────────────────────────────────────────
# 2.  Bootstrap accoppiato 10 000× sul test 2019
#     → CI e p-value ΔF1   +   CI per tutte le metriche
# ──────────────────────────────────────────────────────────────
models_pred = {
    'RF'  : y_pred_RF,
    'LGBM': y_pred_LGBM,
    'XGB' : y_pred_XGB,
    'DT'  : y_pred_DT,
}

def metrics(y_true, y_hat):
    return dict(
        Accuracy  = accuracy_score(y_true, y_hat),
        Precision = precision_score(y_true, y_hat, average='macro', zero_division=0),
        Recall    = recall_score(   y_true, y_hat, average='macro', zero_division=0),
        F1        = f1_score(       y_true, y_hat, average='macro', zero_division=0)
    )

B   = 10_000
rng = np.random.default_rng(42)
n   = len(y_test_enc)

# container bootstrap
boot = {m: {k: [] for k in ['Accuracy','Precision','Recall','F1']}
        for m in models_pred}

for _ in range(B):
    idx = rng.integers(0, n, n)        # campione con reinserimento
    y_bs = y_test_enc[idx]
    for m, y_hat in models_pred.items():
        mtr = metrics(y_bs, y_hat[idx])
        for k, v in mtr.items():
            boot[m][k].append(v)

# ── tabella per modello: media + CI95
rows_summary = []
for m, d in boot.items():
    row = {'Model': m}
    for k in d:
        ci_low, ci_high = np.percentile(d[k], [2.5, 97.5])
        row[f'{k} mean'] = np.mean(d[k])
        row[f'{k} 95% CI'] = f"[{ci_low:.3f}, {ci_high:.3f}]"
    rows_summary.append(row)

summary_df = (pd.DataFrame(rows_summary)
              .set_index('Model')
              .round(3)
              .sort_values('F1 mean', ascending=False))
display(summary_df)

# ── ΔF1 paired-bootstrap per ogni coppia (come prima)
rows_bs = []
for m1, m2 in itertools.combinations(models_pred, 2):
    diff = np.array(boot[m1]['F1']) - np.array(boot[m2]['F1'])
    ci   = np.percentile(diff, [2.5, 97.5])
    p    = 2 * min((diff <= 0).mean(), (diff >= 0).mean())
    rows_bs.append({
        'Model 1': m1, 'Model 2': m2,
        'ΔF1 orig': metrics(y_test_enc, models_pred[m1])['F1']
                    - metrics(y_test_enc, models_pred[m2])['F1'],
        'ΔF1 95% CI': f"[{ci[0]:.3f}, {ci[1]:.3f}]",
        'Bootstrap p': p,
    })

bs_tests_df = (pd.DataFrame(rows_bs)
               .sort_values('Bootstrap p')
               .round(4))
display(bs_tests_df)


Unnamed: 0,Model 1,Model 2,ΔF1 (mean),t-test p,Wilcoxon p
1,RF,XGB,-0.0412,0.0008,0.002
2,RF,DT,0.0693,0.0001,0.002
4,LGBM,DT,0.1011,0.0,0.002
5,XGB,DT,0.1105,0.0,0.002
0,RF,LGBM,-0.0318,0.0083,0.0137
3,LGBM,XGB,-0.0094,0.4011,0.6953


Unnamed: 0_level_0,Accuracy mean,Accuracy 95% CI,Precision mean,Precision 95% CI,Recall mean,Recall 95% CI,F1 mean,F1 95% CI
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
XGB,0.689,"[0.661, 0.715]",0.661,"[0.630, 0.691]",0.647,"[0.621, 0.673]",0.646,"[0.617, 0.674]"
LGBM,0.685,"[0.658, 0.711]",0.656,"[0.625, 0.685]",0.643,"[0.617, 0.669]",0.641,"[0.613, 0.669]"
RF,0.693,"[0.667, 0.719]",0.661,"[0.628, 0.693]",0.645,"[0.621, 0.670]",0.634,"[0.606, 0.663]"
DT,0.663,"[0.635, 0.690]",0.623,"[0.592, 0.653]",0.624,"[0.597, 0.650]",0.618,"[0.590, 0.646]"


Unnamed: 0,Model 1,Model 2,ΔF1 orig,ΔF1 95% CI,Bootstrap p
5,XGB,DT,0.0279,"[-0.003, 0.060]",0.0788
4,LGBM,DT,0.023,"[-0.009, 0.055]",0.1458
2,RF,DT,0.0158,"[-0.010, 0.043]",0.2428
1,RF,XGB,-0.0121,"[-0.037, 0.012]",0.3268
0,RF,LGBM,-0.0072,"[-0.033, 0.019]",0.5828
3,LGBM,XGB,-0.0049,"[-0.024, 0.014]",0.6464


In [57]:
df.shape

(3117, 121)