In [32]:
import pandas as pd
import numpy as np
import optuna
import warnings
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [33]:
#importazione dei dataset
raw_data = fetch_california_housing()
data = pd.DataFrame(raw_data.data, columns=raw_data.feature_names)
data['MedHouseVal'] = raw_data.target

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [34]:
#rimozione outlier
numeric_cols = data.select_dtypes(include=["float", "int"]).columns.tolist()

for col in numeric_cols:
    Q1 = data[col].quantile(0.3)
    Q3 = data[col].quantile(0.7)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    data[col] = np.where(data[col].between(lower, upper), data[col], np.nan)

data = data.dropna()

#features combinate
data['Ave_Room_Bed'] = data['AveRooms'] / data['AveBedrms']
data['HouseRooms'] = data['HouseAge'] / data['AveRooms']


In [35]:
def objective(trial):
    # Split interno per validazione durante il tuning
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # Spazio di ricerca
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-7, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 15.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 15.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.6),
    }

    # Pruning basato su RMSE
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-rmse")

    model = xgb.train(
        param,
        dtrain,
        num_boost_round=1000,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
        verbose_eval=False
    )

    # Optuna deve minimizzare RMSE
    preds = model.predict(dvalid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    return rmse


In [36]:
#X, y = data.data, data.target
X = data.drop(columns=["MedHouseVal"])
y= data["MedHouseVal"]

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

print(f"Dataset Shape: {X.shape}")
print(f"Target medio: {np.mean(y):.2f} (centinaia di k$)")

# ==========================================
# 2. Baseline (Senza Tuning)
# ==========================================
print("\n--- Training Modello Baseline (Default) ---")
dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parametri standard di default
params_base = {
    'objective': 'reg:squarederror', # Regressione
    'tree_method': 'hist',
    'random_state': 42
}

model_base = xgb.train(params_base, dtrain_full, num_boost_round=100)
preds_base = model_base.predict(dtest)

# Calcolo RMSE Baseline
rmse_base = np.sqrt(mean_squared_error(y_test, preds_base))
print(f"Baseline RMSE: {rmse_base:.4f}")


# ==========================================
# 3. Ottimizzazione con Optuna
# ==========================================
print("\n--- Inizio Tuning con Optuna ---")

# Creazione Studio (Minimizzare RMSE)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200, timeout=600)

print(f"\nMiglior RMSE trovato da Optuna (Validation): {study.best_value:.4f}")
print("Migliori parametri:", study.best_params)


# ==========================================
# 4. Training Modello Finale (Best Params + Shrinkage)
# ==========================================
print("\n--- Training Modello Finale Ottimizzato ---")

best_params = study.best_params
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'
best_params['tree_method'] = 'hist'

# STRATEGIA "SHRINKAGE" (Raffiniamo la discesa del gradiente)
# Abbassiamo il learning rate trovato per maggiore precisione finale
best_params['learning_rate'] = best_params['learning_rate'] * 0.5 
# Aumentiamo gli alberi di conseguenza (safety buffer alto, early stopping gestirà il resto)
num_round_final = 5000 

model_opt = xgb.train(
    best_params, 
    dtrain_full, 
    num_boost_round=num_round_final,
    evals=[(dtest, "test")], # Test set usato SOLO per fermare il training al punto giusto
    early_stopping_rounds=100,
    verbose_eval=False
)

# ==========================================
# 5. Analisi Risultati
# ==========================================
print("\n--- CONFRONTO FINALE ---")

preds_opt = model_opt.predict(dtest)
rmse_opt = np.sqrt(mean_squared_error(y_test, preds_opt))

# R2 Score (coefficiente di determinazione)
r2_base = r2_score(y_test, preds_base)
r2_opt = r2_score(y_test, preds_opt)

print(f"RMSE Baseline:    {rmse_base:.4f}")
print(f"RMSE Ottimizzato: {rmse_opt:.4f}")
print(f"--> Miglioramento Errore: {rmse_base - rmse_opt:.4f} (Minore è meglio)")
print("-" * 30)
print(f"R2 Score Baseline:    {r2_base:.4f}")
print(f"R2 Score Ottimizzato: {r2_opt:.4f}")
print(f"--> Varianza Spiegata Extra: +{(r2_opt - r2_base)*100:.2f}%")

Dataset Shape: (13561, 10)
Target medio: 1.85 (centinaia di k$)

--- Training Modello Baseline (Default) ---


[I 2025-12-09 12:43:44,046] A new study created in memory with name: no-name-2d667dd0-6378-4475-b738-2bbee370f45a


Baseline RMSE: 0.3522

--- Inizio Tuning con Optuna ---


[I 2025-12-09 12:43:44,803] Trial 0 finished with value: 0.39503958285676205 and parameters: {'max_depth': 16, 'min_child_weight': 15, 'gamma': 0.0004331696672311377, 'subsample': 0.6697426843004406, 'colsample_bytree': 0.6969508372673816, 'lambda': 1.4013164237638986e-05, 'alpha': 6.571085941177902e-06, 'learning_rate': 0.1843866995673897}. Best is trial 0 with value: 0.39503958285676205.
[I 2025-12-09 12:43:45,058] Trial 1 finished with value: 0.40832971819430897 and parameters: {'max_depth': 9, 'min_child_weight': 2, 'gamma': 0.015511235821291068, 'subsample': 0.8723785166629177, 'colsample_bytree': 0.8154185789974243, 'lambda': 1.1935998704130544e-09, 'alpha': 0.004731079259346015, 'learning_rate': 0.3941470330447055}. Best is trial 0 with value: 0.39503958285676205.
[I 2025-12-09 12:43:46,721] Trial 2 finished with value: 0.35441231638452836 and parameters: {'max_depth': 13, 'min_child_weight': 23, 'gamma': 3.8838420870962745e-06, 'subsample': 0.7570628531701007, 'colsample_bytree


Miglior RMSE trovato da Optuna (Validation): 0.3544
Migliori parametri: {'max_depth': 13, 'min_child_weight': 23, 'gamma': 3.8838420870962745e-06, 'subsample': 0.7570628531701007, 'colsample_bytree': 0.9109736816364161, 'lambda': 0.02307771384295654, 'alpha': 0.9454327224813569, 'learning_rate': 0.06885420667941514}

--- Training Modello Finale Ottimizzato ---

--- CONFRONTO FINALE ---
RMSE Baseline:    0.3522
RMSE Ottimizzato: 0.3379
--> Miglioramento Errore: 0.0143 (Minore è meglio)
------------------------------
R2 Score Baseline:    0.8221
R2 Score Ottimizzato: 0.8362
--> Varianza Spiegata Extra: +1.41%


**RISULTATO**

RMSE Baseline:    0.4718
RMSE Ottimizzato: 0.4300

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 25.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 25.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
    }

study.optimize(objective, n_trials=70, timeout=600)

=============================================================================== <br>
RMSE Baseline:    0.4718
RMSE Ottimizzato: 0.4310

con scaler

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-9, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-6, 20.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-6, 20.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.8),
    }

study.optimize(objective, n_trials=200, timeout=600)

=============================================================================== <br>

RMSE Baseline:    0.3522
RMSE Ottimizzato: 0.3379

con scaler

study.optimize(objective, n_trials=200, timeout=600)

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-7, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 15.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 15.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.6),
    }

#rimozione outlier
numeric_cols = data.select_dtypes(include=["float", "int"]).columns.tolist()

for col in numeric_cols:
    Q1 = data[col].quantile(0.3)
    Q3 = data[col].quantile(0.7)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    data[col] = np.where(data[col].between(lower, upper), data[col], np.nan)

data = data.dropna()

#features combinate
data['Ave_Room_Bed'] = data['AveRooms'] / data['AveBedrms']
data['HouseRooms'] = data['HouseAge'] / data['AveRooms']

Miglior RMSE trovato da Optuna (Validation): 0.3544
Migliori parametri: {'max_depth': 13, 'min_child_weight': 23, 'gamma': 3.8838420870962745e-06, 'subsample': 0.7570628531701007, 'colsample_bytree': 0.9109736816364161, 'lambda': 0.02307771384295654, 'alpha': 0.9454327224813569, 'learning_rate': 0.06885420667941514}