In [None]:
import pandas as pd
import numpy as np
import optuna
import warnings
import xgboost as xgb
import lightgbm as lgb
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Suppress all Python warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# ==========================================
# 1. Data import and preprocessing
# ==========================================
raw_data = fetch_california_housing()
data = pd.DataFrame(raw_data.data, columns=raw_data.feature_names)
data['MedHouseVal'] = raw_data.target

# Outlier removal with Tukey fences: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are blanked to NaN column by column, then every
# row holding at least one NaN is dropped.
# NOTE(review): the loop covers every numeric column, i.e. also the target
# 'MedHouseVal' and the coordinates, and runs before the train/test split --
# confirm that this is intended.
numeric_cols = data.select_dtypes(include=["float", "int"]).columns.tolist()
for col in numeric_cols:
    Q1 = data[col].quantile(0.25)
    # Third quartile: was 0.756 (typo), which silently widened the upper fence.
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
    data[col] = np.where(data[col].between(lower, upper), data[col], np.nan)
data = data.dropna()

# Combined (engineered) features: ratios between existing columns plus one
# latitude/longitude interaction term. The per-person ratios add 1e-5 to the
# denominator so a zero Population cannot cause a division by zero.
data = data.assign(
    Ave_Room_Bed=data['AveRooms'].div(data['AveBedrms']),
    HouseRooms=data['HouseAge'].div(data['AveRooms']),
    OccupPerPerson=data['AveOccup'].div(data['Population'].add(1e-5)),
    RoomsPerPerson=data['AveRooms'].div(data['Population'].add(1e-5)),
    BedroomsPerPerson=data['AveBedrms'].div(data['Population'].add(1e-5)),
    Lat_Long=data['Latitude'].mul(data['Longitude']),
)

# ==========================================
# 2. Train/test split and scaling
# ==========================================
# Target is the median house value; every other column is a feature.
y = data["MedHouseVal"]
X = data.loc[:, data.columns != "MedHouseVal"]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardisation statistics are learned on the training part only and then
# applied unchanged to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==========================================
# 3. Baseline XGBoost
# ==========================================
# DMatrix containers for the native xgb.train / Booster.predict API.
dtest = xgb.DMatrix(X_test_scaled, label=y_test)
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)

# Untuned reference configuration.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' are GPU-specific settings; recent
# xgboost releases are documented to use tree_method='hist' with device='cuda'
# instead -- confirm against the installed version.
params_base = dict(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=42,
)

# Fixed budget of 100 boosting rounds, no early stopping.
model_base = xgb.train(params_base, dtrain, num_boost_round=100)
preds_base = model_base.predict(dtest)

# Test-set metrics of the baseline, reused in the final comparison.
r2_base = r2_score(y_test, preds_base)
rmse_base = np.sqrt(mean_squared_error(y_test, preds_base))
print(f"RMSE Baseline XGBoost: {rmse_base:.4f}, R2: {r2_base:.4f}")

# ==========================================
# 4. Pseudo-labeling
# ==========================================
# Pseudo-label confidence is estimated from the disagreement of a small
# ensemble: several boosters that differ only in their random row/column
# subsampling are trained on the labelled data, and the rows on which their
# predictions vary the least are treated as the most reliable.
# (The previous criterion -- distance of a prediction from the mean
# prediction -- only selected rows predicted close to the average target and
# carried no information about reliability.)
# NOTE(review): the unlabelled pool is the test feature matrix; only model
# predictions are used as labels, never y_test, but the same rows are scored
# later -- confirm this transductive setup is acceptable.
N_ENSEMBLE = 5          # number of differently seeded boosters
PSEUDO_FRACTION = 0.25  # share of the pool kept as pseudo-labelled rows

ensemble_preds = np.vstack([
    xgb.train(
        {**params_base, 'subsample': 0.8, 'colsample_bytree': 0.8, 'random_state': seed},
        dtrain,
        num_boost_round=100,
    ).predict(dtest)
    for seed in range(N_ENSEMBLE)
])

# Ensemble mean is the pseudo-label, ensemble spread is the uncertainty.
preds_test = ensemble_preds.mean(axis=0)
pred_uncertainty = ensemble_preds.std(axis=0)

n_pseudo = int(PSEUDO_FRACTION * len(preds_test))
confident_idx = np.argsort(pred_uncertainty)[:n_pseudo]

X_pseudo = X_test_scaled[confident_idx]
y_pseudo = preds_test[confident_idx]

# Labelled training rows followed by the pseudo-labelled rows.
X_combined = np.vstack([X_train_scaled, X_pseudo])
y_combined = np.hstack([y_train.values, y_pseudo])

print(f"Training con pseudo-label: {X_combined.shape[0]} campioni")

# ==========================================
# 5. Hyperparameter tuning XGBoost
# ==========================================
def objective_xgb(trial):
    """Optuna objective: validation RMSE of one XGBoost configuration.

    The combined (labelled + pseudo-labelled) data is split 80/20 with a fixed
    seed, a booster is trained with early stopping on the validation part and
    the RMSE of its best iteration on that part is returned.

    Args:
        trial: Optuna trial used to sample the hyperparameters; it is also
            handed to the pruning callback, which may stop an unpromising
            trial early.

    Returns:
        Validation RMSE (lower is better).
    """
    X_tr, X_val, y_tr, y_val = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)
    dtrain_trial = xgb.DMatrix(X_tr, label=y_tr)
    dvalid_trial = xgb.DMatrix(X_val, label=y_val)

    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'max_depth': trial.suggest_int('max_depth', 6, 14),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-7, 1.5, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-7, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-7, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2)
    }

    # The metric name is "<eval set name>-<eval_metric>", matching the
    # "validation" label passed in `evals` below.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-rmse")
    model = xgb.train(param, dtrain_trial, num_boost_round=2000, evals=[(dvalid_trial, "validation")],
                      early_stopping_rounds=100, callbacks=[pruning_callback], verbose_eval=False)

    # xgb.train hands back the booster of the LAST round, and the native
    # Booster.predict uses every tree by default. Restrict prediction to the
    # best iteration so the score does not include the rounds trained after
    # the optimum while waiting for early stopping to trigger.
    preds_val = model.predict(dvalid_trial, iteration_range=(0, model.best_iteration + 1))
    rmse = np.sqrt(mean_squared_error(y_val, preds_val))
    return rmse

# Search 50 configurations, minimising the validation RMSE of objective_xgb.
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=50)

# best_params holds only the sampled values; re-attach the fixed settings.
best_params_xgb = study_xgb.best_params
best_params_xgb.update({
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor'
})

# Final fit uses half the tuned learning rate with a larger round budget.
best_params_xgb['learning_rate'] *= 0.5

# Early stopping must not monitor the test labels: the number of rounds would
# then be chosen on the very data used for the reported score. A hold-out
# slice of the combined training data (same split arguments as in
# objective_xgb) is used as the stopping set instead.
X_fit_xgb, X_stop_xgb, y_fit_xgb, y_stop_xgb = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42)
model_xgb_final = xgb.train(best_params_xgb, xgb.DMatrix(X_fit_xgb, label=y_fit_xgb),
                            num_boost_round=5000,
                            evals=[(xgb.DMatrix(X_stop_xgb, label=y_stop_xgb), "validation")],
                            early_stopping_rounds=100, verbose_eval=False)

# The returned booster contains the rounds trained after the optimum as well;
# predict with the best iteration only.
preds_xgb_final = model_xgb_final.predict(
    dtest, iteration_range=(0, model_xgb_final.best_iteration + 1))
rmse_xgb_final = np.sqrt(mean_squared_error(y_test, preds_xgb_final))
r2_xgb_final = r2_score(y_test, preds_xgb_final)
print(f"XGBoost finale pseudo-label: RMSE={rmse_xgb_final:.4f}, R2={r2_xgb_final:.4f}")

# ==========================================
# 6. Hyperparameter tuning LightGBM
# ==========================================
def objective_lgbm(trial):
    """Optuna objective: hold-out RMSE of one LightGBM configuration.

    Splits the combined data 80/20 with a fixed seed, trains a booster with
    early stopping (patience 100, at most 5000 rounds) on the hold-out part
    and returns the RMSE of its predictions on that part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Hold-out RMSE (lower is better).
    """
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X_combined, y_combined, random_state=42, test_size=0.2
    )

    # Settings that are the same for every trial.
    fixed = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
    }
    # Settings sampled by Optuna for this trial.
    searched = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
    }
    param = {**fixed, **searched}

    fit_set = lgb.Dataset(fit_X, label=fit_y)
    holdout_set = lgb.Dataset(holdout_X, label=holdout_y, reference=fit_set)
    stopper = lgb.early_stopping(stopping_rounds=100, verbose=False)

    booster = lgb.train(
        param,
        fit_set,
        num_boost_round=5000,
        valid_sets=[holdout_set],
        callbacks=[stopper],
    )

    return np.sqrt(mean_squared_error(holdout_y, booster.predict(holdout_X)))

# Search 50 configurations, minimising the hold-out RMSE of objective_lgbm.
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(objective_lgbm, n_trials=50)

# best_params holds only the sampled values; re-attach the fixed settings.
best_params_lgbm = study_lgbm.best_params
best_params_lgbm.update({'objective':'regression','metric':'rmse','verbosity':-1})

# Final LightGBM training.
# Early stopping must not monitor the test labels: the number of rounds would
# then be chosen on the very data used for the reported score. A hold-out
# slice of the combined training data (same split arguments as in
# objective_lgbm) is used as the stopping set instead.
X_fit_lgbm, X_stop_lgbm, y_fit_lgbm, y_stop_lgbm = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42)
train_lgbm_final = lgb.Dataset(X_fit_lgbm, label=y_fit_lgbm)
valid_lgbm = lgb.Dataset(X_stop_lgbm, label=y_stop_lgbm, reference=train_lgbm_final)

model_lgbm_final = lgb.train(
    best_params_lgbm,
    train_lgbm_final,
    num_boost_round=5000,
    valid_sets=[valid_lgbm],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=False),
        lgb.log_evaluation(0)
    ]
)

# The test set is touched only here, for the reported metrics.
preds_lgbm_final = model_lgbm_final.predict(X_test_scaled)
rmse_lgbm_final = np.sqrt(mean_squared_error(y_test, preds_lgbm_final))
r2_lgbm_final = r2_score(y_test, preds_lgbm_final)
print(f"LightGBM finale pseudo-label: RMSE={rmse_lgbm_final:.4f}, R2={r2_lgbm_final:.4f}")

# ==========================================
# 7. Final comparison
# ==========================================
# One summary line per model; labels are left-aligned to the width of the
# longest one so the RMSE/R2 columns line up.
print("-" * 40)
for label, rmse_value, r2_value in (
    ("Baseline XGBoost:", rmse_base, r2_base),
    ("XGBoost finale:", rmse_xgb_final, r2_xgb_final),
    ("LightGBM finale:", rmse_lgbm_final, r2_lgbm_final),
):
    print(f"{label:<17} RMSE={rmse_value:.4f}, R2={r2_value:.4f}")


Shapes: train (16512, 11) test (4128, 11)
Training ensemble for uncertainty estimation...


[I 2025-12-09 16:45:55,856] A new study created in memory with name: no-name-6ea733b0-5079-42b0-b0d6-cb0043001b6e


Ensemble-base mean RMSE: 0.4317, R2: 0.8576
Selected 1032 pseudo-labels (lowest uncertainty)
After range filter: 1032 pseudo-labels kept
Combined training shape: (17544, 11)


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-12-09 16:46:00,210] Trial 0 finished with value: 0.4582714444845233 and parameters: {'max_depth': 13, 'min_child_weight': 2, 'subsample': 0.8230824575171154, 'colsample_bytree': 0.7839284218668187, 'eta': 0.029943869879060125, 'gamma': 0.28879669856801526, 'lambda': 0.7292464707684743, 'alpha': 9.439934601355491e-05}. Best is trial 0 with value: 0.4582714444845233.
[I 2025-12-09 16:46:09,581] Trial 1 finished with value: 0.44250239616179254 and parameters: {'max_depth': 6, 'min_child_weight': 19, 'subsample': 0.6947389816235756, 'colsample_bytree': 0.6221244775481586, 'eta': 0.017189859118766165, 'gamma': 0.00027292220105410266, 'lambda': 3.078306250593259e-06, 'alpha': 9.925905435678324e-05}. Best is trial 1 with value: 0.44250239616179254.
[I 2025-12-09 16:46:26,748] Trial 2 finished with value: 0.4523786896289224 and parameters: {'max_depth': 13, 'min_child_weight': 11, 'subsample': 0.9416011491529339, 'colsample_bytree': 0.8157178372337522, 'eta': 0.021785040848999543, 'gam

KeyboardInterrupt: 

**RISULTATO**

RMSE Baseline:    0.4718
RMSE Ottimizzato: 0.4300

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 25.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 25.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
    }

study.optimize(objective, n_trials=70, timeout=600)

=============================================================================== <br>
RMSE Baseline:    0.4718
RMSE Ottimizzato: 0.4310

con scaler

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-9, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-6, 20.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-6, 20.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.8),
    }

study.optimize(objective, n_trials=200, timeout=600)

=============================================================================== <br>

RMSE Baseline:    0.3522
RMSE Ottimizzato: 0.3379

con scaler

study.optimize(objective, n_trials=200, timeout=600)

param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse', # Metrica per early stopping e pruning
        'tree_method': 'hist',
        
        # --- Struttura ---
        # California ha interazioni complesse, permettiamo alberi più profondi
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        # Importante per evitare overfitting su outlier di prezzo
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
        'gamma': trial.suggest_float('gamma', 1e-7, 1.5, log=True),
        
        # --- Randomness ---
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        
        # --- Regolarizzazione ---
        # Fondamentale nella regressione per non inseguire i prezzi estremi
        'lambda': trial.suggest_float('lambda', 1e-9, 15.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-9, 15.0, log=True),
        
        # --- Learning ---
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.6),
    }

#rimozione outlier
numeric_cols = data.select_dtypes(include=["float", "int"]).columns.tolist()

for col in numeric_cols:
    Q1 = data[col].quantile(0.3)
    Q3 = data[col].quantile(0.7)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    data[col] = np.where(data[col].between(lower, upper), data[col], np.nan)

data = data.dropna()

#features combinate
data['Ave_Room_Bed'] = data['AveRooms'] / data['AveBedrms']
data['HouseRooms'] = data['HouseAge'] / data['AveRooms']

Miglior RMSE trovato da Optuna (Validation): 0.3544
Migliori parametri: {'max_depth': 13, 'min_child_weight': 23, 'gamma': 3.8838420870962745e-06, 'subsample': 0.7570628531701007, 'colsample_bytree': 0.9109736816364161, 'lambda': 0.02307771384295654, 'alpha': 0.9454327224813569, 'learning_rate': 0.06885420667941514}