In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_val_predict,cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, make_scorer, confusion_matrix, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import optuna


In [2]:
filepath = r"C:\Users\lucap\OneDrive\Desktop\PhD\Racheal\Dati\dataset_clean.csv"
df = pd.read_csv(filepath, sep = ";" )

In [3]:
diz_aliases_int_voto = {"Partito Democratico-PD":'Sx/CSx',
                "Partito Democratico":'Sx/CSx',
                "Lega con Salvini":'Dx/CDx',
                "Lega Nord":'Dx/CDx',
                "Lega":'Dx/CDx',
                "Forza Italia":'Dx/CDx',
                "Fratelli d'Italia":"Dx/CDx",
                'MoVimento 5 Stelle':'M5S',
                'Movimento 5 stelle':'M5S',
                'voterei  scheda bianca / annullerei la scheda':'astensione/bianca/nulla',
                'voterei scheda bianca / scheda nulla':'astensione/bianca/nulla',
                "+Europa":'Sx/CSx',
                "piu' Europa con Emma Bonino": "Sx/CSx",
                'Sinistra italiana (SEL + altri)':'Sx/CSx',
                'Potere al Popolo':'Sx/CSx',
                'Rifondazione Comunista':'Sx/CSx',
                "Fratelli d'Italia-Alleanza Nazionale&nbsp;":"Dx/CDx",
                'La Sinistra':'Sx/CSx',
                'Verdi':'Sx/CSx',
                'non andrei a votare':'astensione/bianca/nulla',
                'sono indeciso' : 'indecisi',
                'Liberi e Uguali' : 'Sx/CSx',
                'Italia dei Valori' : 'Sx/CSx',
                'Scelta Civica&nbsp;' : 'Sx/CSx',
                'Italia Unica di Corrado Passera' : 'Sx/CSx',
                "Noi con l'Italia UDC&nbsp;" : 'Dx/CDx',
                "altro partito di area di governo (SVP, Centro Democratico....)" : 'Altro partito',
                "Nuovo Centro Destra con UDC e PPI" : 'Dx/CDx'
                }

df['m_p_int_voto'] = df['m_p_int_voto'].replace(diz_aliases_int_voto)
df = df[~df['m_p_int_voto'].isin(['Altro partito', 'preferisco non rispondere', 'indecisi','astensione/bianca/nulla' ])]
df['m_p_int_voto'].value_counts()

m_p_int_voto
Sx/CSx    1348
Dx/CDx    1102
M5S        869
Name: count, dtype: int64

In [4]:
diz_aliases_ac = {"del tutto d'accordo":2,
            "del tutto<BR>d'accordo":2,
		     "d'accordo":1,
		     "ne' d'accordo ne' in disaccordo (NON STIMOLARE)":0,
             "ne d'accordo ne' in disaccordo":0,
             "ne' d'accordo ne' in disaccordo":0,
             "preferisco non rispondere":0,
             "preferisco<BR>non<BR> rispondere":0,
            "non saprei":0,
             "in disaccordo":-1,
             'del tutto in disaccordo':-2,
             "del tutto disaccordo":-2,
             "del tutto<BR>in disaccordo":-2,
             
            
}

nomi_var = df.columns.tolist()
for var in nomi_var:
    if '_ac_' in var:
        df[var] = df[var].replace(diz_aliases_ac)
        
        
diz_aliases_op = {"molto":2,
            "poco":-1,
            "pocao":-1,
		     "abbastanza":1,
		     "non saprei":0,
            "nessuno":-2,
            "per niente":-2}

nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'op' in var:
        df[var] = df[var].replace(diz_aliases_op)
        

diz_aliases_trend = {"in trend":1,
                 "slight in trend":1,
                 "slight off trend":0,
                 "off trend":0,
                 'non classificati':0}
 
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'TREND' in var:
        df[var] = df[var].replace(diz_aliases_trend)
        
        
diz_aliases_scol = {"elementare/privo di titolo":1,
                 "media inferiore":1,
                 "diploma di maturita` (5 anni)":2,
                 "superiori in corso":2,
                 'diploma di istituto professionale (3 anni)':2,
                 'universita` in corso/nessuna laurea conseguita':4,
                 'laurea triennale di I livello':4,
                 'diploma universitario/laurea breve':4,
                 'laurea specialistica di II livello o laurea 4-5 anni':4,
                 'master/scuola di specializzazione post laurea':4,
                 'dottorato di ricerca':4
                 }
 
df['m_p_scolarita'] = df['m_p_scolarita'].replace(diz_aliases_scol)



diz_aliases_geo = {'meno di 5.000':1,
                 'da 5.001 a 10.000':1,
                 'da 10.001 a 30.000':2,
                 'da 30.001 a 100.000':2,
                 'da 100.001 a 250.000':3,
                 'piu` di 250.001':3
                 }
 
df['m_p_r_ampiezza6'] = df['m_p_r_ampiezza6'].replace(diz_aliases_geo)

diz_aliases_eco = {
                 "mi sento povero e non arrivo mai a fine mese":-1,
                 "avverto difficolta'":-1,
                 "arrivo a fine mese con molte difficolta'":-1,
                 "con tranquillita'":1,
                 'agiatamente':1
                 }
 
df['m_op_come_reddito_consenteDiVivere'] = df['m_op_come_reddito_consenteDiVivere'].replace(diz_aliases_eco)


df = pd.get_dummies(df, columns=['m_op_preferenza_lavorativa'], prefix='prf_lav',dtype=int)

#df = pd.get_dummies(df, columns=['m_p_zona_5istat'], prefix='zona')
df = pd.get_dummies(df, columns=['m_istat_reg'], prefix='reg',dtype=int)

diz_aliases_quot = {
                 0:0,
                 'preferisco non rispondere':0,
                 "si alcune volte alla settimana":1,
                 "si ogni giorno":1
                 }
 
df['m_p_lettura_quotidiani'] = df['m_p_lettura_quotidiani'].replace(diz_aliases_quot)

diz_aliases_freq_y = {'mai':0,
                     "piu' raramente":0,
                 "alcune volte l'anno":1,
                  "piu' volte alla settimana":2,
                 'mensile':1,
                 'settimanale':2}
 
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_p_' in var:
        df[var] = df[var].replace(diz_aliases_freq_y)
        
        
diz_aliases_freq = {'non saprei':0,
                     "non ascolto la radio":0,
                 "da piu' di un'ora fino a due ore":1,
                  "da piu' di tre ore fino a quattro ore":2,
                 'oltre quattro':2,
                 "da piu' di due ore fino a tre ore":2,
                 "meno di un'ora":1}
 
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_p_' in var:
        df[var] = df[var].replace(diz_aliases_freq)
        

diz_aliases_comp = {
                 0:0,
                 'inferiore':-1,
                 'uguale':0,
                 "maggiore":1,
                 'nan':0
                 }
 
df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'] = df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'].replace(diz_aliases_comp)


diz_aliases_situazEconomic = {
    'rimasta la stessa': 0,
    'peggiorata': -1,
    'non sa/non risponde': 0,
    'nan': 0,
    'migliorata': 1
}
df['m_op_situazEconomic_propria_ultimi10anni'] = df['m_op_situazEconomic_propria_ultimi10anni'].replace(diz_aliases_situazEconomic)


diz_aliases_futEconomic = {
    'peggiorare': -1,
    'rimanere la stessa': 0,
    'non sa/non risponde': 0,
    'nan': 0,
    'migliorare': 1
}
df['m_op_situazEconomic_futura'] = df['m_op_situazEconomic_futura'].replace(diz_aliases_futEconomic)


diz_aliases_abo = {
    'una cattiva legge': -1,
    'una legge buona ma che va cambiata': 1,
    'una buona legge': 1,
    'non saprei/preferisco non rispondere': 0,
    'nan': 0
}

df['m_op_legge_aborto_1987'] = df['m_op_legge_aborto_1987'].replace(diz_aliases_abo)


diz_aliases_eut = {
    'no': -1,
    'si': 2,
    'non sa/non risponde': 0,
    'dipende dalle condizioni':1
}

df['m_op_favore_eutanasia_a_determinate_condizioni'] = df['m_op_favore_eutanasia_a_determinate_condizioni'].replace(diz_aliases_eut)


diz_aliases_identita = {
    "piu' europeo che italiano": 2,
    'sia italiano che europeo': 1,
    'solo italiano': -2,
    "piu' italiano che europeo": -1,
    'non sa - non risponde': 0,
    'solo europeo': 2,
    'nan': 0
}
df['m_op_sentimento_italianoVSeuropeo'] = df['m_op_sentimento_italianoVSeuropeo'].replace(diz_aliases_identita)


diz_aliases_modernizzazione = {
    'si sta modernizzando': 1,
    'sta regredendo': -1,
    'non sa/non risponde': 0,
    'nan': 0
}
df['m_modernizz_vs_regress_Paese'] = df['m_modernizz_vs_regress_Paese'].replace(diz_aliases_modernizzazione)


diz_aliases_vantaggi = {
    "piu' vantaggi": 1,
    'vantaggi e svantaggi in egual misura': 0,
    "piu' svantaggi": -1,
    "ne' vantaggi ne' svantaggi": 0,
    'non sa/non risponde': 0,
    'nan': None
}
df['m_op_partecipazione_in_UE'] = df['m_op_partecipazione_in_UE'].replace(diz_aliases_vantaggi)


diz_aliases_livello = {
    'bassa': -1,
    'media': 1,
    'alta': 2,
    0: 0,
    'nan': None
}
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_op_attenzione_' in var:
        df[var] = df[var].replace(diz_aliases_livello)
        
        
diz_aliases_andamento = {
    'rimasta uguale': 0,
    'peggiorata': -1,
    'migliorata': 1,
    0: 0,
    'nan': None
}
df['m_op_scuolaPubblica_ultimiAnni'] = df['m_op_scuolaPubblica_ultimiAnni'].replace(diz_aliases_andamento)


diz_aliases_quantita = {
    1: 1,
    'poca&nbsp;': -1,
    'molta': 2,
    'nessuna': -2,
    0: 0,
    'nan': None
}
nomi_var = df.columns.tolist()
for var in nomi_var:
    if 'm_op_fiducia_' in var:
        df[var] = df[var].replace(diz_aliases_quantita)
        
        
diz_aliases_postoStabile = {
    "d'accordo": 1,
    'in disaccordo': -1,
    "del tutto d'accordo": 2,
    'del tutto in disaccordo': -2,
    'preferisco non rispondere': 0,
    'nan': None
}
df['m_op_rinuncie_per_postoStabile'] = df['m_op_rinuncie_per_postoStabile'].replace(diz_aliases_postoStabile)


diz_aliases_pop = {
    'molti danni': 2,
    'nessun danno': 0,
    'qualche danno': 1,
    0: 0,
    'nan': None
}
df['m_op_danni_populismo_in_italia'] = df['m_op_danni_populismo_in_italia'].replace(diz_aliases_pop)

  df[var] = df[var].replace(diz_aliases_ac)
  df[var] = df[var].replace(diz_aliases_op)
  df[var] = df[var].replace(diz_aliases_trend)
  df['m_p_scolarita'] = df['m_p_scolarita'].replace(diz_aliases_scol)
  df['m_p_r_ampiezza6'] = df['m_p_r_ampiezza6'].replace(diz_aliases_geo)
  df['m_op_come_reddito_consenteDiVivere'] = df['m_op_come_reddito_consenteDiVivere'].replace(diz_aliases_eco)
  df[var] = df[var].replace(diz_aliases_freq_y)
  df[var] = df[var].replace(diz_aliases_freq)
  df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'] = df['m_op_disorientamento_realta_quotidiana_vs_3anniFa'].replace(diz_aliases_comp)
  df['m_op_situazEconomic_propria_ultimi10anni'] = df['m_op_situazEconomic_propria_ultimi10anni'].replace(diz_aliases_situazEconomic)
  df['m_op_situazEconomic_futura'] = df['m_op_situazEconomic_futura'].replace(diz_aliases_futEconomic)
  df['m_op_legge_aborto_1987'] = df['m_op_legge_aborto_1987'].replace(diz_aliases_abo)
  df['m_op_favore_eutanasia_a_determinate_condizion

In [5]:
df = df.fillna(0)
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace('à', 'a')
df.columns = df.columns.str.replace('[^a-zA-Z0-9_]', '').str.replace(' ', '_')
df.columns = df.columns.str.replace('[()\']+|,\s*', '', regex=True)
df = df.dropna()

  df.columns = df.columns.str.replace('[()\']+|,\s*', '', regex=True)


In [6]:
X = df.drop(columns = ['m_p_int_voto'])
y = df.m_p_int_voto


# 1) Creiamo le maschere booleane
mask_train = X["m_anno_indagine"].isin([2017, 2018])   # True per 2017–18
mask_test  = X["m_anno_indagine"] == 2019              # True per 2019

# 2) Applichiamo le maschere
X_train = X.loc[mask_train].copy()
X_test  = X.loc[mask_test].copy()
y_train = y.loc[mask_train].copy()
y_test  = y.loc[mask_test].copy()

# 3) (Opzionale) Rimuovere 'm_anno_indagine' dalle feature
X_train = X_train.drop(columns=["m_anno_indagine"])
X_test  = X_test.drop(columns=["m_anno_indagine"])

# 4) (Opzionale) Reindicizzare
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)


le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)     # usa SOLO il train!
y_test_enc  = le.transform(y_test)          # da usare nel test finale

N_CLASSES = len(le.classes_)                # servirà a XGBoost / LightGBM

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2164, 187), (1155, 187), (2164,), (1155,))

In [13]:
# ───────────────────────── constants ────────────────────────────
macro_f1  = make_scorer(f1_score, average='macro', zero_division=0)
cv10      = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
window    = 2          # ± feature rispetto al best corrente
n_trials  = 10         # Optuna trials quando si attiva il tuning

best_fixed = dict(     # Optuna-best params da RQ1
    learning_rate    = 0.06695,
    n_estimators     = 250,
    num_leaves       = 111,
    feature_fraction = 0.9410,
    bagging_fraction = 0.5653,
    bagging_freq     = 0,
    reg_alpha        = 0.8223,
    reg_lambda       = 1.4004,
    objective        = 'multiclass',
    num_class        = len(le.classes_),
    verbose          = -1,            # silenzia LightGBM
    n_jobs           = 4,
    random_state     = 42
)

# ───────────────────────── loop RFE + tuning ─────────────────────
records          = []
best_seen_f1     = 0.0
best_n_feat      = len(X_train.columns)   # centro iniziale della finestra
df_train, df_test = X_train.copy(), X_test.copy()

for n_feat in range(best_n_feat, 10, -1):

    # 1) RFE
    rfe = RFE(
        lgb.LGBMClassifier(objective='multiclass',
                           num_class=len(le.classes_),
                           n_estimators=100,
                           learning_rate=0.1,
                           n_jobs=4,
                           random_state=42,
                           verbose=-1),
        n_features_to_select=n_feat,
        step=1
    ).fit(df_train, y_train_enc)

    X_tr = pd.DataFrame(rfe.transform(df_train),
                        columns=rfe.get_feature_names_out())
    X_te = pd.DataFrame(rfe.transform(df_test),
                        columns=rfe.get_feature_names_out())

    # 2) modello fisso
    clf_fixed = lgb.LGBMClassifier(**best_fixed).fit(X_tr, y_train_enc)
    f1_fixed  = f1_score(y_test_enc, clf_fixed.predict(X_te),
                         average='macro', zero_division=0)

    f1_final, best_params = f1_fixed, best_fixed

    # 3) tuning leggero SOLO se dentro la finestra
    if abs(n_feat - best_n_feat) <= window:

        def objective(trial):
            clf = lgb.LGBMClassifier(
                objective='multiclass', num_class=len(le.classes_),
                learning_rate    = trial.suggest_float('lr', 0.03, 0.15, log=True),
                n_estimators     = trial.suggest_int  ('n_est', 150, 400),
                num_leaves       = trial.suggest_int  ('leaves', 60, 180),
                feature_fraction = trial.suggest_float('ff', 0.80, 1.0),
                bagging_fraction = trial.suggest_float('bf', 0.80, 1.0),
                bagging_freq     = 1,
                verbose          = -1,
                n_jobs=4,
                random_state=42)
            return cross_val_score(clf, X_tr, y_train_enc,
                                   cv=cv10, scoring=macro_f1).mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials, timeout=300)

        tuned = lgb.LGBMClassifier(
            objective        ='multiclass', num_class=len(le.classes_),
            learning_rate    = study.best_params['lr'],
            n_estimators     = study.best_params['n_est'],
            num_leaves       = study.best_params['leaves'],
            feature_fraction = study.best_params['ff'],
            bagging_fraction = study.best_params['bf'],
            bagging_freq     = 1,
            verbose          = -1,
            n_jobs=4,
            random_state=42).fit(X_tr, y_train_enc)

        f1_tuned = f1_score(y_test_enc, tuned.predict(X_te),
                            average='macro', zero_division=0)

        if f1_tuned > f1_final:
            f1_final, best_params = f1_tuned, study.best_params

    # 4) salva risultati
    records.append({
        'n_features': n_feat,
        'macro_f1'  : f1_final,
        'params'    : best_params,
        'features'  : ','.join(rfe.get_feature_names_out())
    })

    # 5) aggiorna best vista finora → sposta centro finestra
    if f1_final > best_seen_f1:
        best_seen_f1 = f1_final
        best_n_feat  = n_feat          # <-- nuovo centro per il window test

    print(f"{n_feat:3d} feat → F1 {f1_final:.3f}"
          f"   ({'tuned' if best_params is not best_fixed else 'fixed'})")

    # –– shrink dataset per prossima iterazione
    df_train, df_test = X_tr, X_te

# dataframe ordinato
df_rfe_results = (pd.DataFrame(records)
                  .sort_values('macro_f1', ascending=False)
                  .reset_index(drop=True))
print(df_rfe_results.head(10))

[I 2025-07-25 12:01:57,482] A new study created in memory with name: no-name-a5fb56ae-86fc-4702-96c0-215c396e58ab
[I 2025-07-25 12:02:03,400] Trial 0 finished with value: 0.6429172381772457 and parameters: {'lr': 0.08210939290261866, 'n_est': 154, 'leaves': 80, 'ff': 0.8601820632498985, 'bf': 0.8519771805022927}. Best is trial 0 with value: 0.6429172381772457.
[I 2025-07-25 12:02:15,045] Trial 1 finished with value: 0.637689247101702 and parameters: {'lr': 0.034648837796574614, 'n_est': 293, 'leaves': 142, 'ff': 0.957748268728959, 'bf': 0.8483963880336476}. Best is trial 0 with value: 0.6429172381772457.
[I 2025-07-25 12:02:25,279] Trial 2 finished with value: 0.6385168752434003 and parameters: {'lr': 0.1497386443977221, 'n_est': 323, 'leaves': 124, 'ff': 0.9838160648279206, 'bf': 0.940777548023366}. Best is trial 0 with value: 0.6429172381772457.
[I 2025-07-25 12:02:38,030] Trial 3 finished with value: 0.6345034997363568 and parameters: {'lr': 0.11289568504281941, 'n_est': 357, 'leave

187 feat → F1 0.658   (tuned)


[I 2025-07-25 12:04:34,895] A new study created in memory with name: no-name-67c92020-e9a1-48f2-8e61-605d0f0422e4
[I 2025-07-25 12:04:51,552] Trial 0 finished with value: 0.6358788106593664 and parameters: {'lr': 0.1262411140943918, 'n_est': 352, 'leaves': 75, 'ff': 0.9280342217534353, 'bf': 0.8564802489716332}. Best is trial 0 with value: 0.6358788106593664.
[I 2025-07-25 12:05:09,313] Trial 1 finished with value: 0.6368586536713698 and parameters: {'lr': 0.07451824845170008, 'n_est': 244, 'leaves': 134, 'ff': 0.9556576035225107, 'bf': 0.9850661203688595}. Best is trial 1 with value: 0.6368586536713698.
[I 2025-07-25 12:05:28,691] Trial 2 finished with value: 0.6317236324690277 and parameters: {'lr': 0.1032853512288235, 'n_est': 394, 'leaves': 128, 'ff': 0.8338794240963738, 'bf': 0.8296117905960713}. Best is trial 1 with value: 0.6368586536713698.
[I 2025-07-25 12:05:40,699] Trial 3 finished with value: 0.6354035404447852 and parameters: {'lr': 0.09376383659939455, 'n_est': 188, 'leav

186 feat → F1 0.649   (tuned)


[I 2025-07-25 12:07:28,305] A new study created in memory with name: no-name-d6d9c56e-ffd8-4458-b1d1-d196cef274ae
[I 2025-07-25 12:07:49,156] Trial 0 finished with value: 0.6394145162960747 and parameters: {'lr': 0.11679331424343149, 'n_est': 400, 'leaves': 142, 'ff': 0.812122261339663, 'bf': 0.9717730369849398}. Best is trial 0 with value: 0.6394145162960747.
[I 2025-07-25 12:08:15,835] Trial 1 finished with value: 0.6353672731932181 and parameters: {'lr': 0.0329769575836417, 'n_est': 378, 'leaves': 105, 'ff': 0.9037305445512691, 'bf': 0.9531752841145482}. Best is trial 0 with value: 0.6394145162960747.
[I 2025-07-25 12:08:27,156] Trial 2 finished with value: 0.6377531222558215 and parameters: {'lr': 0.09644260397369973, 'n_est': 172, 'leaves': 153, 'ff': 0.984250859075852, 'bf': 0.8421144419606921}. Best is trial 0 with value: 0.6394145162960747.
[I 2025-07-25 12:08:46,507] Trial 3 finished with value: 0.632256845737027 and parameters: {'lr': 0.10389557906739917, 'n_est': 347, 'leave

185 feat → F1 0.655   (tuned)
184 feat → F1 0.641   (fixed)
183 feat → F1 0.645   (fixed)
182 feat → F1 0.645   (fixed)
181 feat → F1 0.655   (fixed)
180 feat → F1 0.651   (fixed)
179 feat → F1 0.651   (fixed)
178 feat → F1 0.654   (fixed)
177 feat → F1 0.649   (fixed)
176 feat → F1 0.654   (fixed)
175 feat → F1 0.652   (fixed)
174 feat → F1 0.647   (fixed)
173 feat → F1 0.654   (fixed)
172 feat → F1 0.650   (fixed)
171 feat → F1 0.653   (fixed)
170 feat → F1 0.646   (fixed)
169 feat → F1 0.643   (fixed)
168 feat → F1 0.649   (fixed)
167 feat → F1 0.646   (fixed)
166 feat → F1 0.650   (fixed)
165 feat → F1 0.646   (fixed)
164 feat → F1 0.653   (fixed)
163 feat → F1 0.640   (fixed)
162 feat → F1 0.654   (fixed)
161 feat → F1 0.643   (fixed)
160 feat → F1 0.645   (fixed)
159 feat → F1 0.656   (fixed)
158 feat → F1 0.650   (fixed)
157 feat → F1 0.655   (fixed)
156 feat → F1 0.647   (fixed)
155 feat → F1 0.650   (fixed)
154 feat → F1 0.655   (fixed)
153 feat → F1 0.647   (fixed)
152 feat →

[I 2025-07-25 12:11:56,763] A new study created in memory with name: no-name-05601d32-79ac-43ca-b529-87bdfc52ccb4
[I 2025-07-25 12:12:14,295] Trial 0 finished with value: 0.6493360469390289 and parameters: {'lr': 0.09390984440212148, 'n_est': 245, 'leaves': 101, 'ff': 0.9822064310726485, 'bf': 0.9169603585404204}. Best is trial 0 with value: 0.6493360469390289.
[I 2025-07-25 12:12:44,144] Trial 1 finished with value: 0.6408475452930803 and parameters: {'lr': 0.054292084604534045, 'n_est': 391, 'leaves': 158, 'ff': 0.9383764415081686, 'bf': 0.9683313721654914}. Best is trial 0 with value: 0.6493360469390289.
[I 2025-07-25 12:12:57,695] Trial 2 finished with value: 0.6469461755094394 and parameters: {'lr': 0.12388497350510097, 'n_est': 207, 'leaves': 87, 'ff': 0.8985666864840789, 'bf': 0.8805297003995272}. Best is trial 0 with value: 0.6493360469390289.
[I 2025-07-25 12:13:10,976] Trial 3 finished with value: 0.6373985831068057 and parameters: {'lr': 0.08756812039191184, 'n_est': 193, 'l

142 feat → F1 0.659   (fixed)


[I 2025-07-25 12:15:15,689] A new study created in memory with name: no-name-8d6571ee-39ba-4214-9ef0-0ae1eab2a945
[I 2025-07-25 12:15:27,967] Trial 0 finished with value: 0.6455354242852264 and parameters: {'lr': 0.06707856914845972, 'n_est': 192, 'leaves': 119, 'ff': 0.8106210438710407, 'bf': 0.8790333997324561}. Best is trial 0 with value: 0.6455354242852264.
[I 2025-07-25 12:15:43,145] Trial 1 finished with value: 0.650562855061512 and parameters: {'lr': 0.12020720554694599, 'n_est': 236, 'leaves': 105, 'ff': 0.8312682230902537, 'bf': 0.947470032851588}. Best is trial 1 with value: 0.650562855061512.
[I 2025-07-25 12:16:00,331] Trial 2 finished with value: 0.6402126367872641 and parameters: {'lr': 0.08289060857190741, 'n_est': 239, 'leaves': 127, 'ff': 0.9925511846557399, 'bf': 0.9576488259666935}. Best is trial 1 with value: 0.650562855061512.
[I 2025-07-25 12:16:19,065] Trial 3 finished with value: 0.6443396272952546 and parameters: {'lr': 0.03467938592509002, 'n_est': 286, 'leave

141 feat → F1 0.650   (tuned)


[I 2025-07-25 12:34:42,074] A new study created in memory with name: no-name-33612f5f-e81e-4822-93e4-baaf3c7941cd
[I 2025-07-25 12:34:55,472] Trial 0 finished with value: 0.6471836063856201 and parameters: {'lr': 0.10505946962205212, 'n_est': 174, 'leaves': 142, 'ff': 0.8330625046905816, 'bf': 0.817045309541973}. Best is trial 0 with value: 0.6471836063856201.
[I 2025-07-25 12:35:18,767] Trial 1 finished with value: 0.6398860795404635 and parameters: {'lr': 0.08099322422691148, 'n_est': 319, 'leaves': 122, 'ff': 0.9819931363937201, 'bf': 0.9307951056037154}. Best is trial 0 with value: 0.6471836063856201.
[I 2025-07-25 12:35:47,504] Trial 2 finished with value: 0.6335283073590177 and parameters: {'lr': 0.0458296678592061, 'n_est': 373, 'leaves': 102, 'ff': 0.9789716646036084, 'bf': 0.9923352432872912}. Best is trial 0 with value: 0.6471836063856201.
[I 2025-07-25 12:36:01,362] Trial 3 finished with value: 0.63471672603008 and parameters: {'lr': 0.06422606923592689, 'n_est': 251, 'leave

140 feat → F1 0.656   (fixed)
139 feat → F1 0.648   (fixed)
138 feat → F1 0.650   (fixed)
137 feat → F1 0.639   (fixed)
136 feat → F1 0.648   (fixed)
135 feat → F1 0.650   (fixed)
134 feat → F1 0.655   (fixed)
133 feat → F1 0.651   (fixed)
132 feat → F1 0.663   (fixed)


[I 2025-07-25 12:37:54,781] A new study created in memory with name: no-name-83640ecc-d060-445f-b4fe-f6df6bf402d6
[I 2025-07-25 12:38:16,279] Trial 0 finished with value: 0.6393572339451217 and parameters: {'lr': 0.030785339288137537, 'n_est': 353, 'leaves': 167, 'ff': 0.9241842136317832, 'bf': 0.825051476156914}. Best is trial 0 with value: 0.6393572339451217.
[I 2025-07-25 12:38:38,423] Trial 1 finished with value: 0.6407921442361403 and parameters: {'lr': 0.04733729254751853, 'n_est': 353, 'leaves': 129, 'ff': 0.9621637102113924, 'bf': 0.8244694159798277}. Best is trial 1 with value: 0.6407921442361403.
[I 2025-07-25 12:38:54,799] Trial 2 finished with value: 0.6387664777701397 and parameters: {'lr': 0.1200429170400909, 'n_est': 304, 'leaves': 100, 'ff': 0.8982458340449012, 'bf': 0.9907142158536423}. Best is trial 1 with value: 0.6407921442361403.
[I 2025-07-25 12:39:15,688] Trial 3 finished with value: 0.6408557164913569 and parameters: {'lr': 0.11605919329403037, 'n_est': 395, 'le

131 feat → F1 0.644   (fixed)


[I 2025-07-25 12:41:09,278] A new study created in memory with name: no-name-905bb360-72bd-407e-a576-363b97e8eb05
[I 2025-07-25 12:41:19,758] Trial 0 finished with value: 0.6480500076902197 and parameters: {'lr': 0.10419576076742465, 'n_est': 179, 'leaves': 100, 'ff': 0.8295608862457406, 'bf': 0.8902145279990251}. Best is trial 0 with value: 0.6480500076902197.
[I 2025-07-25 12:41:43,520] Trial 1 finished with value: 0.6435026401616091 and parameters: {'lr': 0.051501101633062964, 'n_est': 372, 'leaves': 137, 'ff': 0.8066617319648539, 'bf': 0.9055997970893953}. Best is trial 0 with value: 0.6480500076902197.
[I 2025-07-25 12:41:57,858] Trial 2 finished with value: 0.6540888183456368 and parameters: {'lr': 0.09608542868831335, 'n_est': 282, 'leaves': 84, 'ff': 0.9703751104922562, 'bf': 0.8142498846810734}. Best is trial 2 with value: 0.6540888183456368.
[I 2025-07-25 12:42:21,136] Trial 3 finished with value: 0.6405678335997543 and parameters: {'lr': 0.07006980020222689, 'n_est': 355, 'l

130 feat → F1 0.651   (fixed)
129 feat → F1 0.645   (fixed)
128 feat → F1 0.650   (fixed)
127 feat → F1 0.660   (fixed)
126 feat → F1 0.640   (fixed)
125 feat → F1 0.648   (fixed)
124 feat → F1 0.655   (fixed)
123 feat → F1 0.643   (fixed)
122 feat → F1 0.654   (fixed)
121 feat → F1 0.656   (fixed)
120 feat → F1 0.656   (fixed)
119 feat → F1 0.646   (fixed)
118 feat → F1 0.650   (fixed)
117 feat → F1 0.649   (fixed)
116 feat → F1 0.656   (fixed)
115 feat → F1 0.646   (fixed)
114 feat → F1 0.653   (fixed)
113 feat → F1 0.648   (fixed)
112 feat → F1 0.644   (fixed)
111 feat → F1 0.655   (fixed)
110 feat → F1 0.647   (fixed)
109 feat → F1 0.661   (fixed)
108 feat → F1 0.653   (fixed)
107 feat → F1 0.651   (fixed)
106 feat → F1 0.648   (fixed)
105 feat → F1 0.648   (fixed)
104 feat → F1 0.650   (fixed)
103 feat → F1 0.652   (fixed)
102 feat → F1 0.647   (fixed)
101 feat → F1 0.654   (fixed)
100 feat → F1 0.658   (fixed)
 99 feat → F1 0.655   (fixed)
 98 feat → F1 0.655   (fixed)
 97 feat →

[I 2025-07-25 12:44:55,409] A new study created in memory with name: no-name-d5e41712-6e4d-4445-904e-6d2af7b48268
[I 2025-07-25 12:45:08,387] Trial 0 finished with value: 0.6386276257552665 and parameters: {'lr': 0.07427172942454764, 'n_est': 234, 'leaves': 101, 'ff': 0.8979120522650862, 'bf': 0.9538093663035175}. Best is trial 0 with value: 0.6386276257552665.
[I 2025-07-25 12:45:26,245] Trial 1 finished with value: 0.6460869814165779 and parameters: {'lr': 0.05520621302399474, 'n_est': 316, 'leaves': 172, 'ff': 0.8011210628621557, 'bf': 0.9790194506982101}. Best is trial 1 with value: 0.6460869814165779.
[I 2025-07-25 12:45:35,409] Trial 2 finished with value: 0.6460746902916716 and parameters: {'lr': 0.057007789684516016, 'n_est': 180, 'leaves': 147, 'ff': 0.9781072418671564, 'bf': 0.8025595208843762}. Best is trial 1 with value: 0.6460869814165779.
[I 2025-07-25 12:45:50,535] Trial 3 finished with value: 0.6475751214866996 and parameters: {'lr': 0.04274134603561201, 'n_est': 255, '

 92 feat → F1 0.658   (fixed)


[I 2025-07-25 12:47:30,311] A new study created in memory with name: no-name-2225818b-7863-4d15-bf39-fc194186e60b
[I 2025-07-25 12:47:42,695] Trial 0 finished with value: 0.6418916562883669 and parameters: {'lr': 0.0997716841857355, 'n_est': 230, 'leaves': 157, 'ff': 0.941145096209389, 'bf': 0.9024716149348565}. Best is trial 0 with value: 0.6418916562883669.
[I 2025-07-25 12:47:58,049] Trial 1 finished with value: 0.6497045428778081 and parameters: {'lr': 0.0426407715050069, 'n_est': 321, 'leaves': 170, 'ff': 0.8077527706735418, 'bf': 0.8541333354707361}. Best is trial 1 with value: 0.6497045428778081.
[I 2025-07-25 12:48:16,624] Trial 2 finished with value: 0.640048644697322 and parameters: {'lr': 0.04310085092637243, 'n_est': 332, 'leaves': 132, 'ff': 0.9898258783798861, 'bf': 0.9485544304641479}. Best is trial 1 with value: 0.6497045428778081.
[I 2025-07-25 12:48:33,533] Trial 3 finished with value: 0.6462697146114422 and parameters: {'lr': 0.08443674237475358, 'n_est': 348, 'leave

 91 feat → F1 0.653   (fixed)
 90 feat → F1 0.649   (fixed)
 89 feat → F1 0.659   (fixed)
 88 feat → F1 0.662   (fixed)
 87 feat → F1 0.656   (fixed)
 86 feat → F1 0.644   (fixed)
 85 feat → F1 0.643   (fixed)
 84 feat → F1 0.651   (fixed)
 83 feat → F1 0.652   (fixed)
 82 feat → F1 0.656   (fixed)
 81 feat → F1 0.657   (fixed)
 80 feat → F1 0.658   (fixed)
 79 feat → F1 0.649   (fixed)
 78 feat → F1 0.651   (fixed)
 77 feat → F1 0.646   (fixed)
 76 feat → F1 0.652   (fixed)
 75 feat → F1 0.649   (fixed)
 74 feat → F1 0.644   (fixed)
 73 feat → F1 0.649   (fixed)
 72 feat → F1 0.651   (fixed)
 71 feat → F1 0.651   (fixed)
 70 feat → F1 0.645   (fixed)
 69 feat → F1 0.642   (fixed)
 68 feat → F1 0.650   (fixed)
 67 feat → F1 0.648   (fixed)
 66 feat → F1 0.642   (fixed)
 65 feat → F1 0.641   (fixed)
 64 feat → F1 0.635   (fixed)
 63 feat → F1 0.633   (fixed)
 62 feat → F1 0.630   (fixed)
 61 feat → F1 0.642   (fixed)
 60 feat → F1 0.634   (fixed)
 59 feat → F1 0.637   (fixed)
 58 feat →

In [14]:
print(df_rfe_results.head(10))

   n_features  macro_f1                                             params  \
0          93  0.668505  {'learning_rate': 0.06695, 'n_estimators': 250...   
1         132  0.662707  {'learning_rate': 0.06695, 'n_estimators': 250...   
2          88  0.662093  {'learning_rate': 0.06695, 'n_estimators': 250...   
3         109  0.661323  {'learning_rate': 0.06695, 'n_estimators': 250...   
4         127  0.659596  {'learning_rate': 0.06695, 'n_estimators': 250...   
5          89  0.658968  {'learning_rate': 0.06695, 'n_estimators': 250...   
6         142  0.658902  {'learning_rate': 0.06695, 'n_estimators': 250...   
7         143  0.658716  {'learning_rate': 0.06695, 'n_estimators': 250...   
8          95  0.658571  {'learning_rate': 0.06695, 'n_estimators': 250...   
9          80  0.658477  {'learning_rate': 0.06695, 'n_estimators': 250...   

                                            features  
0  m_p_r_eta,m_p_pubblico_privato,m_p_r_ampiezza6...  
1  m_sesso,m_p_r_eta,m_p_scolar

In [15]:
# --- 1. pick the best row (already sorted by macro_F1) -------------
best_row   = df_rfe_results.iloc[0]

best_f1    = best_row['macro_f1']
best_size  = best_row['n_features']
best_params   = best_row['params']            # dict
best_columns  = best_row['features'].split(',')  # list[str]

print(f"Best subset = {best_size} features  |  macro-F1 = {best_f1:.3f}")
print("\nHyper-parameters:")
for k, v in best_params.items():
    print(f"  {k:<15} : {v}")

print("\nSelected columns:")
for col in best_columns:
    print("  ", col)


Best subset = 93 features  |  macro-F1 = 0.669

Hyper-parameters:
  learning_rate   : 0.06695
  n_estimators    : 250
  num_leaves      : 111
  feature_fraction : 0.941
  bagging_fraction : 0.5653
  bagging_freq    : 0
  reg_alpha       : 0.8223
  reg_lambda      : 1.4004
  objective       : multiclass
  num_class       : 3
  verbose         : -1
  n_jobs          : 4
  random_state    : 42

Selected columns:
   m_p_r_eta
   m_p_pubblico_privato
   m_p_r_ampiezza6
   m_p_frequenza_teatro
   m_p_frequenza_concerti
   m_p_frequenza_mostre
   m_p_frequenza_palestra_sport
   m_p_radio_ore
   m_ac_privilegio_lavoratori_sett_pubblico
   m_ac_propensioneRischio_italia_vs_europa
   m_ac_importanza_partiti
   m_ac_diminuzione_ruolo_partiti
   m_ac_affidamento_pubblica_tecnici
   m_ac_valori_resistenza_altra_epoca
   m_ac_sindacato_ancoraUtile
   m_ac_uguaglianza_sociale_frena_individui
   m_ac_troppo_focus_uguaglianza_vs_merito
   m_ac_meglio_uguaglianza_vs_merito_singolo
   m_op_disorientament

In [16]:
import json

# 2. save params as JSON
with open("best_lgbm_params.json", "w") as fp:
    json.dump(best_params, fp, indent=2)

# 3. save column list as CSV (one column name per line)
pd.Series(best_columns).to_csv("best_feature_subset.csv", index=False, header=False)
