In [None]:
import pandas as pd
import numpy as np
import optuna
import warnings
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

#importazione dei dataset
raw_data = fetch_california_housing()
data = pd.DataFrame(raw_data.data, columns=raw_data.feature_names)
data['MedHouseVal'] = raw_data.target

data.info()

#rimozione outlier
numeric_cols = data.select_dtypes(include=["float", "int"]).columns.tolist()

for col in numeric_cols:
    Q1 = data[col].quantile(0.3)
    Q3 = data[col].quantile(0.7)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    data[col] = np.where(data[col].between(lower, upper), data[col], np.nan)

data = data.dropna()

#features combinate
data['Ave_Room_Bed'] = data['AveRooms'] / data['AveBedrms']
data['HouseRooms'] = data['HouseAge'] / data['AveRooms']
data['OccupPerPerson'] = data['AveOccup'] / (data['Population'] + 1e-5)
data['RoomsPerPerson'] = data['AveRooms'] / (data['Population'] + 1e-5)
data['BedroomsPerPerson'] = data['AveBedrms'] / (data['Population'] + 1e-5)
data['Lat_Long'] = data['Latitude'] * data['Longitude']




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [18]:
def objective(trial):
    # Split interno per validazione durante il tuning
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # Spazio di ricerca
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning

        'tree_method': 'gpu_hist',   # Usa il metodo basato su GPU
        'predictor': 'gpu_predictor', # Predizioni anch’esse su GPU
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-7, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 15.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 15.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
    }

    # Pruning basato su RMSE
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-rmse")

    model = xgb.train(
        param,
        dtrain,
        num_boost_round=1000,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
        verbose_eval=False
    )

    # Optuna deve minimizzare RMSE
    preds = model.predict(dvalid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    return rmse


In [19]:
# ==========================================
# 1. Dataset e Split
# ==========================================
X = data.drop(columns=["MedHouseVal"])
y = data["MedHouseVal"]

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

print(f"Dataset Shape: {X.shape}")
print(f"Target medio: {np.mean(y):.2f} (centinaia di k$)")

# ==========================================
# 2. Baseline (Senza Tuning)
# ==========================================
print("\n--- Training Modello Baseline (Default) ---")
dtrain_full = xgb.DMatrix(X_train_scaled, label=y_train_full)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

params_base = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',        
    'predictor': 'gpu_predictor',     
    'random_state': 42
}

model_base = xgb.train(params_base, dtrain_full, num_boost_round=100)
preds_base = model_base.predict(dtest)
rmse_base = np.sqrt(mean_squared_error(y_test, preds_base))
print(f"Baseline RMSE: {rmse_base:.4f}")

# ==========================================
# 2b. Generazione pseudo-label filtrate
# ==========================================
pseudo_labels = model_base.predict(dtest)

# Filtro di confidenza: solo valori vicini alla media del training
mean_train = np.mean(y_train_full)
std_train = np.std(y_train_full)
confident_mask = (pseudo_labels > mean_train - std_train) & (pseudo_labels < mean_train + std_train)

X_pseudo_confident = X_test_scaled[confident_mask]
y_pseudo_confident = pseudo_labels[confident_mask]

print(f"Numero di pseudo-label confident: {len(y_pseudo_confident)} su {len(y_test)}")

# Combiniamo dati reali + pseudo confident
X_combined = np.vstack([X_train_scaled, X_pseudo_confident])
y_combined = np.concatenate([y_train_full, y_pseudo_confident])

# ==========================================
# 3. Funzione Objective con GPU su dati combinati
# ==========================================
def objective(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'max_depth': trial.suggest_int('max_depth', 10, 13),
        'min_child_weight': trial.suggest_int('min_child_weight', 15, 23),
        'gamma': trial.suggest_float('gamma', 1e-3, 0.01),
        'subsample': trial.suggest_float('subsample', 0.75, 0.85),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.85, 0.91),
        'lambda': trial.suggest_float('lambda', 0.1, 1.0),
        'alpha': trial.suggest_float('alpha', 1.0, 2.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.04, 0.07),
    }

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-rmse")

    model = xgb.train(
        param,
        dtrain,
        num_boost_round=2000,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
        verbose_eval=False
    )

    preds_val = model.predict(dvalid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds_val))
    return rmse

# ==========================================
# 4. Ottimizzazione con Optuna
# ==========================================
print("\n--- Inizio Tuning con Optuna (GPU + pseudo-label) ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=1200)

print(f"\nMiglior RMSE trovato da Optuna: {study.best_value:.4f}")
print("Migliori parametri:", study.best_params)

# ==========================================
# 5. Training Modello Finale
# ==========================================
best_params = study.best_params
best_params.update({
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor'
})
best_params['learning_rate'] *= 0.5

dtrain_combined = xgb.DMatrix(X_combined, label=y_combined)
num_round_final = 5000

model_final = xgb.train(
    best_params,
    dtrain_combined,
    num_boost_round=num_round_final,
    evals=[(dtest, "test")],
    early_stopping_rounds=100,
    verbose_eval=False
)

# ==========================================
# 6. Risultati finali
# ==========================================
preds_final = model_final.predict(dtest)
rmse_final = np.sqrt(mean_squared_error(y_test, preds_final))
r2_final = r2_score(y_test, preds_final)

print(f"RMSE finale con pseudo-label confident: {rmse_final:.4f}")
print(f"R2 finale con pseudo-label confident: {r2_final:.4f}")


Dataset Shape: (13561, 14)
Target medio: 1.85 (centinaia di k$)

--- Training Modello Baseline (Default) ---


[I 2025-12-09 14:50:37,451] A new study created in memory with name: no-name-4ec4167a-abe5-4032-957d-df872e9e9e74


Baseline RMSE: 0.3555
Numero di pseudo-label confident: 1853 su 2713

--- Inizio Tuning con Optuna (GPU + pseudo-label) ---


[I 2025-12-09 14:50:41,436] Trial 0 finished with value: 0.3243772419059875 and parameters: {'max_depth': 13, 'min_child_weight': 20, 'gamma': 0.0029116610566531477, 'subsample': 0.7746123177865296, 'colsample_bytree': 0.8631785585285412, 'lambda': 0.2508904105538494, 'alpha': 1.9230427579001175, 'learning_rate': 0.05109379599479066}. Best is trial 0 with value: 0.3243772419059875.
[I 2025-12-09 14:50:44,548] Trial 1 finished with value: 0.3263151648212046 and parameters: {'max_depth': 12, 'min_child_weight': 22, 'gamma': 0.0018826017660091108, 'subsample': 0.8262421180916442, 'colsample_bytree': 0.9021568379167271, 'lambda': 0.39634359150915976, 'alpha': 1.999545456221698, 'learning_rate': 0.058940929242491724}. Best is trial 0 with value: 0.3243772419059875.
[I 2025-12-09 14:50:48,577] Trial 2 finished with value: 0.32488464023869995 and parameters: {'max_depth': 11, 'min_child_weight': 22, 'gamma': 0.007187368860336961, 'subsample': 0.8157814391570931, 'colsample_bytree': 0.90398800


Miglior RMSE trovato da Optuna: 0.3232
Migliori parametri: {'max_depth': 12, 'min_child_weight': 20, 'gamma': 0.004359119694098261, 'subsample': 0.7917713055778127, 'colsample_bytree': 0.850500548839292, 'lambda': 0.6010411962875345, 'alpha': 1.6073335711972097, 'learning_rate': 0.062057322551791624}
RMSE finale con pseudo-label confident: 0.3349
R2 finale con pseudo-label confident: 0.8391


**RISULTATO**

RMSE Baseline:    0.4718
RMSE Ottimizzato: 0.4300

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 25.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 25.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
    }

study.optimize(objective, n_trials=70, timeout=600)

=============================================================================== <br>
RMSE Baseline:    0.4718
RMSE Ottimizzato: 0.4310

con scaler

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-9, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-6, 20.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-6, 20.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.8),
    }

study.optimize(objective, n_trials=200, timeout=600)

=============================================================================== <br>

RMSE Baseline:    0.3522
RMSE Ottimizzato: 0.3379

con scaler

study.optimize(objective, n_trials=200, timeout=600)

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-7, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 15.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 15.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.6),
    }

#rimozione outlier
numeric_cols = data.select_dtypes(include=["float", "int"]).columns.tolist()

for col in numeric_cols:
    Q1 = data[col].quantile(0.3)
    Q3 = data[col].quantile(0.7)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    data[col] = np.where(data[col].between(lower, upper), data[col], np.nan)

data = data.dropna()

#features combinate
data['Ave_Room_Bed'] = data['AveRooms'] / data['AveBedrms']
data['HouseRooms'] = data['HouseAge'] / data['AveRooms']

Miglior RMSE trovato da Optuna (Validation): 0.3544
Migliori parametri: {'max_depth': 13, 'min_child_weight': 23, 'gamma': 3.8838420870962745e-06, 'subsample': 0.7570628531701007, 'colsample_bytree': 0.9109736816364161, 'lambda': 0.02307771384295654, 'alpha': 0.9454327224813569, 'learning_rate': 0.06885420667941514}