In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
import optuna

In [None]:
from sklearn.linear_model import ElasticNet, HuberRegressor, Lasso, BayesianRidge
from sklearn.neural_network import MLPRegressor
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.base import BaseEstimator, RegressorMixin

In [None]:
# --- Data loading for base-model training ---
# The pre-processed train/test tables are read from the working directory.
train_processed, test_processed = (
    pd.read_csv(csv_path) for csv_path in ('train_processed.csv', 'test_processed.csv')
)

# Target column (described by the original author as log transformed) and feature matrix.
y = train_processed['SalePrice']
X = train_processed.drop(columns=['SalePrice'])
X_test = test_processed.copy()

# The raw test file is read only to recover the Id column for the submission files.
test = pd.read_csv('test.csv')
test_ID = test['Id']

# Hold out 20% of the training rows as a validation split.
# NOTE (translated from the original comment): a stratified split is needed
# here to avoid problems with FAMD -- the split below is NOT stratified.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Baseline: train every base model with default hyperparameters (the original
# note says the dimensionality-reduced data is used). Each model is fit on the
# training split and scored by RMSE on the validation split.

def _report_default_rmse(label, predictions):
    """Print and return the validation RMSE of a default-parameter model.

    Parameters
    ----------
    label : str
        Model name shown in the printed line.
    predictions : array-like
        Predictions for ``X_val``; compared against the global ``y_val``.

    Returns
    -------
    float
        Root mean squared error on the validation split.
    """
    score = np.sqrt(mean_squared_error(y_val, predictions))
    print(f"{label} RMSE (default): {score:.5f}")
    return score


# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
rf_rmse = _report_default_rmse("RandomForest", rf_pred)

# XGBoost
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_val)
xgb_rmse = _report_default_rmse("XGBoost", xgb_pred)

# LightGBM
lgb = LGBMRegressor(random_state=42, verbose=-1)
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_val)
lgb_rmse = _report_default_rmse("LightGBM", lgb_pred)

# CatBoost
# NOTE(review): the original comment here said categorical_features are not
# needed after dimensionality reduction, yet the list is still passed as
# cat_features (and the tuned CatBoost cell does not pass it) -- confirm which
# behaviour is intended for the processed data.
categorical_features = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
                       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
                       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                       'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1',
                       'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'GarageType',
                       'GarageFinish', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType',
                       'SaleCondition', 'Season']
catboost = CatBoostRegressor(random_state=42, verbose=0, cat_features=categorical_features)
# The validation split doubles as the early-stopping set.
catboost.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)
catboost_pred = catboost.predict(X_val)
catboost_rmse = _report_default_rmse("CatBoost", catboost_pred)

# ElasticNet
enet = ElasticNet(random_state=42)
enet.fit(X_train, y_train)
enet_pred = enet.predict(X_val)
enet_rmse = _report_default_rmse("ElasticNet", enet_pred)

# HuberRegressor
huber = HuberRegressor()
huber.fit(X_train, y_train)
huber_pred = huber.predict(X_val)
huber_rmse = _report_default_rmse("HuberRegressor", huber_pred)

# MLPRegressor
mlp = MLPRegressor(random_state=42, max_iter=1000)
mlp.fit(X_train, y_train)
mlp_pred = mlp.predict(X_val)
mlp_rmse = _report_default_rmse("MLPRegressor", mlp_pred)

# KernelRidge
kr = KernelRidge()
kr.fit(X_train, y_train)
kr_pred = kr.predict(X_val)
kr_rmse = _report_default_rmse("KernelRidge", kr_pred)

# SVR
svr = SVR()
svr.fit(X_train, y_train)
svr_pred = svr.predict(X_val)
svr_rmse = _report_default_rmse("SVR", svr_pred)

# Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_val)
ridge_rmse = _report_default_rmse("Ridge", ridge_pred)

# Lasso
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_val)
lasso_rmse = _report_default_rmse("Lasso", lasso_pred)

# GradientBoosting -- seeded like the other stochastic models so the baseline
# score is reproducible across runs.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
gbr_pred = gbr.predict(X_val)
gbr_rmse = _report_default_rmse("GBR", gbr_pred)

# Bagging -- seeded for the same reason.
bagging = BaggingRegressor(random_state=42)
bagging.fit(X_train, y_train)
bagging_pred = bagging.predict(X_val)
bagging_rmse = _report_default_rmse("Bagging", bagging_pred)

# TabNet is fed NumPy arrays with a 2-D target; predictions are flattened to
# 1-D, matching how the tuned TabNet cell handles them.
tabnet = TabNetRegressor()
X_train_np = X_train.values
y_train_np = y_train.values.reshape(-1, 1)
X_val_np = X_val.values
tabnet.fit(X_train_np, y_train_np)
tabnet_pred = tabnet.predict(X_val_np).flatten()
tabnet_rmse = _report_default_rmse("TabNet", tabnet_pred)

# BayesianRidge
bayesian_ridge = BayesianRidge()
bayesian_ridge.fit(X_train, y_train)
bayesian_ridge_pred = bayesian_ridge.predict(X_val)
bayesian_ridge_rmse = _report_default_rmse("BayesianRidge", bayesian_ridge_pred)


In [None]:
# Number of Optuna trials passed to study.optimize() by the tuning cells below
# (the RandomForest study uses its own hard-coded, smaller budget).
n_trials = 1000

In [None]:
# RandomForest: hyperparameter search with Optuna
def rf_objective(trial):
    """Optuna objective: validation RMSE of a RandomForest built from sampled parameters."""
    forest = RandomForestRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 4),
        random_state=42,
    )
    forest.fit(X_train, y_train)
    val_pred = forest.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_pred))

rf_study = optuna.create_study(direction='minimize')
# RF tuning is slow, so this study runs fewer trials than the others.
rf_study.optimize(rf_objective, n_trials=200)
rf_best_params = rf_study.best_params
print("Best RandomForest params (Optuna):", rf_best_params)

# Refit with the best parameters and score on the validation split.
rf_best = RandomForestRegressor(random_state=42, **rf_best_params)
rf_best.fit(X_train, y_train)
rf_best_pred = rf_best.predict(X_val)
rf_best_rmse = np.sqrt(mean_squared_error(y_val, rf_best_pred))
print(f"Optuna tuned RandomForest RMSE (reduced): {rf_best_rmse:.5f}")

In [None]:
# Optuna tuning (each model defines its own objective)
# XGBoost: hyperparameter search with Optuna
def xgb_objective(trial):
    """Optuna objective: validation RMSE of an XGBRegressor built from sampled parameters."""
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        max_depth=trial.suggest_int('max_depth', 3, 7),
        subsample=trial.suggest_float('subsample', 0.7, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.7, 1.0),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 5),
        random_state=42,
    )
    booster = XGBRegressor(**sampled)
    booster.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_val, booster.predict(X_val)))

xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(xgb_objective, n_trials=n_trials)
xgb_best_params = xgb_study.best_params
print("Best XGBoost params (Optuna):", xgb_best_params)

# Refit with the best parameters and score on the validation split.
xgb_best = XGBRegressor(random_state=42, **xgb_best_params)
xgb_best.fit(X_train, y_train)
xgb_best_pred = xgb_best.predict(X_val)
xgb_best_rmse = np.sqrt(mean_squared_error(y_val, xgb_best_pred))
print(f"Optuna tuned XGBoost RMSE (reduced): {xgb_best_rmse:.5f}")

In [None]:
# LightGBM: hyperparameter search with Optuna

# Non-tuned LightGBM settings shared by the search and the final refit.
# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is non-zero; with the default of 0 the tuned `subsample`
# value would have no effect at all.
lgb_fixed_params = {'subsample_freq': 1}

def lgb_objective(trial):
    """Optuna objective: validation RMSE of an LGBMRegressor built from sampled parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE on the validation split (X_val, y_val).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        **lgb_fixed_params,
        'random_state': 42,
        # Silence per-trial LightGBM logging, as the refit below already does.
        'verbose': -1
    }
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, pred))

lgb_study = optuna.create_study(direction='minimize')
lgb_study.optimize(lgb_objective, n_trials=n_trials)
# Carry the fixed settings along so every later use of lgb_best_params
# rebuilds the same model that was evaluated during the search.
lgb_best_params = {**lgb_study.best_params, **lgb_fixed_params}
print("Best LightGBM params (Optuna):", lgb_best_params)

# Refit with the best parameters and score on the validation split.
lgb_best = LGBMRegressor(**lgb_best_params, random_state=42, verbose=-1)
lgb_best.fit(X_train, y_train)
lgb_best_pred = lgb_best.predict(X_val)
lgb_best_rmse = np.sqrt(mean_squared_error(y_val, lgb_best_pred))
print(f"Optuna tuned LightGBM RMSE (reduced): {lgb_best_rmse:.5f}")

In [None]:
# CatBoost: hyperparameter search with Optuna
def catboost_objective(trial):
    """Optuna objective: validation RMSE of a CatBoostRegressor built from sampled parameters."""
    candidate = CatBoostRegressor(
        iterations=500,  # kept small during the search; raised for the final fit
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.05),
        depth=trial.suggest_int('depth', 4, 10),
        l2_leaf_reg=trial.suggest_int('l2_leaf_reg', 1, 7),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0, 1),
        border_count=trial.suggest_int('border_count', 32, 128),
        random_state=42,
        verbose=0,
    )
    # The validation split is also used as the early-stopping set.
    candidate.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)
    return np.sqrt(mean_squared_error(y_val, candidate.predict(X_val)))

catboost_study = optuna.create_study(direction='minimize')
catboost_study.optimize(catboost_objective, n_trials=n_trials)
# Keep the tuned values but raise the iteration cap for the final fit.
catboost_best_params = {**catboost_study.best_params, 'iterations': 5000}
print("Best CatBoost params (Optuna):", catboost_best_params)

# Refit with the best parameters (longer early-stopping patience) and score on validation.
catboost_best = CatBoostRegressor(random_state=42, verbose=100, **catboost_best_params)
catboost_best.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=200)
catboost_best_pred = catboost_best.predict(X_val)
catboost_best_rmse = np.sqrt(mean_squared_error(y_val, catboost_best_pred))
print(f"Optuna tuned CatBoost RMSE (reduced): {catboost_best_rmse:.5f}")

In [None]:
# ElasticNet: hyperparameter search with Optuna
def enet_objective(trial):
    """Optuna objective: validation RMSE of an ElasticNet built from sampled parameters."""
    candidate = ElasticNet(
        alpha=trial.suggest_float('alpha', 0.0001, 1.0, log=True),
        l1_ratio=trial.suggest_float('l1_ratio', 0.0, 1.0),
        random_state=42,
    )
    candidate.fit(X_train, y_train)
    val_pred = candidate.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_pred))

enet_study = optuna.create_study(direction='minimize')
enet_study.optimize(enet_objective, n_trials=n_trials)
enet_best_params = enet_study.best_params
print("Best ElasticNet params (Optuna):", enet_best_params)

# Refit with the best parameters and score on the validation split.
enet_best = ElasticNet(random_state=42, **enet_best_params)
enet_best.fit(X_train, y_train)
enet_best_pred = enet_best.predict(X_val)
enet_best_rmse = np.sqrt(mean_squared_error(y_val, enet_best_pred))
print(f"Optuna tuned ElasticNet RMSE: {enet_best_rmse:.5f}")

In [None]:
# Huber: hyperparameter search with Optuna
def objective_huber(trial):
    """Optuna objective: validation RMSE of a HuberRegressor built from sampled parameters."""
    candidate = HuberRegressor(
        alpha=trial.suggest_float('alpha', 1e-4, 0.1, log=True),
        epsilon=trial.suggest_float('epsilon', 1.0, 2.0),
        max_iter=trial.suggest_int('max_iter', 2000, 5000),
    )
    candidate.fit(X_train, y_train)
    val_pred = candidate.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_pred))

study_huber = optuna.create_study(direction='minimize')
study_huber.optimize(objective_huber, n_trials=n_trials)
huber_best_params = study_huber.best_params
print("best Huber params:", huber_best_params)
print("best Huber RMSE:", study_huber.best_value)

# Refit with the best parameters and score on the validation split.
huber_best = HuberRegressor(**huber_best_params)
huber_best.fit(X_train, y_train)
huber_best_pred = huber_best.predict(X_val)
huber_best_rmse = np.sqrt(mean_squared_error(y_val, huber_best_pred))
print(f"Tuned Huber RMSE: {huber_best_rmse:.5f}")


In [None]:
# MLP: hyperparameter search with Optuna

# Candidate hidden-layer layouts keyed by a string label such as '50-100-50'.
# Optuna's categorical distribution only supports None/bool/int/float/str
# choices (tuples trigger a warning on every trial and break persistent
# storage), so the label is sampled and mapped back to the tuple.
mlp_layer_choices = {
    '-'.join(str(width) for width in layout): layout
    for layout in [
        (50, 100, 50), (50, 100, 200, 100, 50),
        (50, 100, 200, 300, 200, 100, 50), (100, 100),
        (100, 200, 100), (100, 200, 300, 200, 100),
        (100, 200, 300, 400, 300, 200, 100),
    ]
}

def objective_mlp(trial):
    """Optuna objective: validation RMSE of an MLPRegressor built from sampled parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE on the validation split (X_val, y_val).
    """
    layout_key = trial.suggest_categorical('hidden_layer_sizes', list(mlp_layer_choices))
    params = {
        'hidden_layer_sizes': mlp_layer_choices[layout_key],
        'alpha': trial.suggest_float('alpha', 1e-5, 1e-2, log=True),
        'learning_rate_init': trial.suggest_float('learning_rate_init', 1e-4, 1e-2, log=True),
        'max_iter': trial.suggest_int('max_iter', 200, 1000)
    }
    model = MLPRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    return rmse

study_mlp = optuna.create_study(direction='minimize')
study_mlp.optimize(objective_mlp, n_trials=n_trials)
# Resolve the sampled layout label back to its tuple so mlp_best_params can be
# passed straight to MLPRegressor(**mlp_best_params) here and in later cells.
mlp_best_params = dict(study_mlp.best_params)
mlp_best_params['hidden_layer_sizes'] = mlp_layer_choices[mlp_best_params['hidden_layer_sizes']]
print("Best MLP params:", mlp_best_params)
print("Best MLP RMSE:", study_mlp.best_value)

# Refit with the best parameters and score on the validation split.
mlp_best = MLPRegressor(**mlp_best_params, random_state=42)
mlp_best.fit(X_train, y_train)
mlp_best_pred = mlp_best.predict(X_val)
mlp_best_rmse = np.sqrt(mean_squared_error(y_val, mlp_best_pred))
print(f"Tuned MLP RMSE: {mlp_best_rmse:.5f}")


In [None]:
# KernelRidge: hyperparameter search with Optuna
def kr_objective(trial):
    """Optuna objective: validation RMSE of a KernelRidge built from sampled parameters."""
    kernel_name = trial.suggest_categorical('kernel', ['rbf', 'linear'])
    ridge_alpha = trial.suggest_float('alpha', 0.1, 10.0, log=True)
    # gamma is only sampled (and passed) for the RBF kernel.
    if kernel_name == 'rbf':
        kernel_kwargs = {'gamma': trial.suggest_float('gamma', 0.001, 0.1, log=True)}
    else:
        kernel_kwargs = {}
    candidate = KernelRidge(alpha=ridge_alpha, kernel=kernel_name, **kernel_kwargs)
    candidate.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_val, candidate.predict(X_val)))

kr_study = optuna.create_study(direction='minimize')
kr_study.optimize(kr_objective, n_trials=n_trials)
kr_best_params = kr_study.best_params
print("Best KernelRidge params (Optuna):", kr_best_params)

# Refit with the best parameters and score on the validation split.
kr_best = KernelRidge(**kr_best_params)
kr_best.fit(X_train, y_train)
kr_best_pred = kr_best.predict(X_val)
kr_best_rmse = np.sqrt(mean_squared_error(y_val, kr_best_pred))
print(f"Optuna tuned KernelRidge RMSE: {kr_best_rmse:.5f}")

In [None]:
# SVR: hyperparameter search with Optuna
def svr_objective(trial):
    """Optuna objective: validation RMSE of an SVR built from sampled parameters."""
    penalty = trial.suggest_float('C', 0.1, 100.0, log=True)
    tube_width = trial.suggest_float('epsilon', 0.01, 0.5)
    kernel_name = trial.suggest_categorical('kernel', ['rbf', 'linear'])
    candidate = SVR(C=penalty, epsilon=tube_width, kernel=kernel_name)
    candidate.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_val, candidate.predict(X_val)))

svr_study = optuna.create_study(direction='minimize')
svr_study.optimize(svr_objective, n_trials=n_trials)
svr_best_params = svr_study.best_params
print("Best SVR params (Optuna):", svr_best_params)

# Refit with the best parameters and score on the validation split.
svr_best = SVR(**svr_best_params)
svr_best.fit(X_train, y_train)
svr_best_pred = svr_best.predict(X_val)
svr_best_rmse = np.sqrt(mean_squared_error(y_val, svr_best_pred))
print(f"Optuna tuned SVR RMSE: {svr_best_rmse:.5f}")

In [None]:
# Ridge: hyperparameter search with Optuna
def objective_ridge(trial):
    """Optuna objective: validation RMSE of a Ridge model with a sampled alpha."""
    candidate = Ridge(
        alpha=trial.suggest_float('alpha', 0.1, 10.0, log=True),
        random_state=42,
    )
    candidate.fit(X_train, y_train)
    val_pred = candidate.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_pred))

# Run the search
study_ridge = optuna.create_study(direction='minimize')
study_ridge.optimize(objective_ridge, n_trials=n_trials)
print("Best Ridge params:", study_ridge.best_params)
print("Best Ridge RMSE:", study_ridge.best_value)

# Refit with the best parameters and score on the validation split.
ridge_best = Ridge(random_state=42, **study_ridge.best_params)
ridge_best.fit(X_train, y_train)
ridge_best_pred = ridge_best.predict(X_val)
ridge_best_rmse = np.sqrt(mean_squared_error(y_val, ridge_best_pred))
print(f"Tuned Ridge RMSE: {ridge_best_rmse:.5f}")

In [None]:
# Lasso: hyperparameter search with Optuna
def objective_lasso(trial):
    """Optuna objective: validation RMSE of a Lasso model with a sampled alpha."""
    candidate = Lasso(
        alpha=trial.suggest_float('alpha', 0.0001, 0.1, log=True),
        random_state=42,
    )
    candidate.fit(X_train, y_train)
    val_pred = candidate.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_pred))

# Run the search
study_lasso = optuna.create_study(direction='minimize')
study_lasso.optimize(objective_lasso, n_trials=n_trials)
print("Best Lasso params:", study_lasso.best_params)
print("Best Lasso RMSE:", study_lasso.best_value)

# Refit with the best parameters and score on the validation split.
lasso_best = Lasso(random_state=42, **study_lasso.best_params)
lasso_best.fit(X_train, y_train)
lasso_best_pred = lasso_best.predict(X_val)
lasso_best_rmse = np.sqrt(mean_squared_error(y_val, lasso_best_pred))
print(f"Tuned Lasso RMSE: {lasso_best_rmse:.5f}")

In [None]:
# GradientBoosting: hyperparameter search with Optuna
def objective_gbr(trial):
    """Optuna objective: validation RMSE of a GradientBoostingRegressor built from sampled parameters."""
    candidate = GradientBoostingRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        max_depth=trial.suggest_int('max_depth', 3, 7),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 4),
        subsample=trial.suggest_float('subsample', 0.7, 1.0),
        random_state=42,
    )
    candidate.fit(X_train, y_train)
    val_pred = candidate.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_pred))

# Run the search
study_gbr = optuna.create_study(direction='minimize')
study_gbr.optimize(objective_gbr, n_trials=n_trials)
print("Best GradientBoosting params:", study_gbr.best_params)
print("Best GradientBoosting RMSE:", study_gbr.best_value)

# Refit with the best parameters and score on the validation split.
gbr_best = GradientBoostingRegressor(random_state=42, **study_gbr.best_params)
gbr_best.fit(X_train, y_train)
gbr_best_pred = gbr_best.predict(X_val)
gbr_best_rmse = np.sqrt(mean_squared_error(y_val, gbr_best_pred))
print(f"Tuned GradientBoosting RMSE: {gbr_best_rmse:.5f}")

In [None]:
# Optuna optimization for Bagging
# The objective has its own name (objective_bagging) so it is not shadowed by
# another model's objective defined elsewhere in the notebook.
def objective_bagging(trial):
    """Optuna objective: validation RMSE of a BaggingRegressor built from sampled parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample n_estimators, max_samples and max_features.

    Returns
    -------
    float
        RMSE on the validation split (X_val, y_val).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
        'max_features': trial.suggest_float('max_features', 0.5, 1.0)
    }
    bagging = BaggingRegressor(**params, random_state=42)
    bagging.fit(X_train, y_train)
    pred = bagging.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    return rmse

# Optimize hyperparameters with Optuna
study_bagging = optuna.create_study(direction='minimize')
study_bagging.optimize(objective_bagging, n_trials=n_trials)
print("Best Bagging params:", study_bagging.best_params)
print("Best Bagging RMSE:", study_bagging.best_value)

# Refit with the best parameters and score on the validation split.
bagging_best = BaggingRegressor(**study_bagging.best_params, random_state=42)
bagging_best.fit(X_train, y_train)
bagging_best_pred = bagging_best.predict(X_val)
bagging_best_rmse = np.sqrt(mean_squared_error(y_val, bagging_best_pred))
print(f"Tuned Bagging RMSE: {bagging_best_rmse:.5f}")

In [None]:
# TabNet: hyperparameter search with Optuna
def _fit_tabnet(net):
    """Fit ``net`` on the training split while monitoring RMSE on the validation split."""
    net.fit(
        X_train.values, y_train.values.reshape(-1, 1),
        eval_set=[(X_val.values, y_val.values.reshape(-1, 1))],
        eval_metric=['rmse'],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128
    )
    return net

def objective_tabnet(trial):
    """Optuna objective: validation RMSE of a TabNetRegressor built from sampled parameters."""
    sampled = dict(
        n_d=trial.suggest_int('n_d', 8, 64),
        n_a=trial.suggest_int('n_a', 8, 64),
        n_steps=trial.suggest_int('n_steps', 3, 10),
        gamma=trial.suggest_float('gamma', 1.0, 2.0),
        lambda_sparse=trial.suggest_float('lambda_sparse', 0.0001, 0.1, log=True),
    )
    net = _fit_tabnet(TabNetRegressor(seed=42, **sampled))
    # predict() output is flattened to 1-D before scoring.
    val_pred = net.predict(X_val.values).flatten()
    return np.sqrt(mean_squared_error(y_val, val_pred))

# Run the search
study_tabnet = optuna.create_study(direction='minimize')
study_tabnet.optimize(objective_tabnet, n_trials=n_trials)
print("Best TabNet params:", study_tabnet.best_params)
print("Best TabNet RMSE:", study_tabnet.best_value)

# Refit with the best parameters (same fit settings as in the search) and score on validation.
tabnet_best = _fit_tabnet(TabNetRegressor(seed=42, **study_tabnet.best_params))
tabnet_best_pred = tabnet_best.predict(X_val.values).flatten()
tabnet_best_rmse = np.sqrt(mean_squared_error(y_val, tabnet_best_pred))
print(f"Tuned TabNet RMSE: {tabnet_best_rmse:.5f}")

In [None]:
# Optuna optimization for bayesian ridge
# The objective has its own name (objective_bayesian) so it does not redefine
# and shadow another model's objective function in the notebook namespace.
def objective_bayesian(trial):
    """Optuna objective: validation RMSE of a BayesianRidge built from sampled parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample alpha_1, alpha_2, lambda_1 and lambda_2.

    Returns
    -------
    float
        RMSE on the validation split (X_val, y_val).
    """
    params = {
        'alpha_1': trial.suggest_float('alpha_1', 1e-6, 1e-1, log=True),
        'alpha_2': trial.suggest_float('alpha_2', 1e-6, 1e-1, log=True),
        'lambda_1': trial.suggest_float('lambda_1', 1e-6, 1e-1, log=True),
        'lambda_2': trial.suggest_float('lambda_2', 1e-6, 1e-1, log=True)
    }
    bayesian_ridge = BayesianRidge(**params)
    bayesian_ridge.fit(X_train, y_train)
    pred = bayesian_ridge.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    return rmse

# Optimize hyperparameters with Optuna
study_bayesian = optuna.create_study(direction='minimize')
study_bayesian.optimize(objective_bayesian, n_trials=n_trials)
print("Best Bayesian params:", study_bayesian.best_params)
print("Best Bayesian RMSE:", study_bayesian.best_value)

# Refit with the best parameters and score on the validation split.
bayesian_best = BayesianRidge(**study_bayesian.best_params)
bayesian_best.fit(X_train, y_train)
bayesian_best_pred = bayesian_best.predict(X_val)
bayesian_best_rmse = np.sqrt(mean_squared_error(y_val, bayesian_best_pred))
print(f"Tuned Bayesian RMSE: {bayesian_best_rmse:.5f}")


In [None]:
# Summary of the tuned models' validation RMSE, one numbered line per model.
# Each label keeps its original separator (some have a space after the colon).
tuned_rmse_summary = [
    ("RF rmse:", rf_best_rmse),
    ("XGBoost rmse:", xgb_best_rmse),
    ("LightGBM rmse:", lgb_best_rmse),
    ("CatBoost rmse:", catboost_best_rmse),
    ("ElasticNet rmse:", enet_best_rmse),
    ("Huber rmse:", huber_best_rmse),
    ("MLP rmse:", mlp_best_rmse),
    ("kernelRidge rmse: ", kr_best_rmse),
    ("SVR rmse:", svr_best_rmse),
    ("ridge rmse:", ridge_best_rmse),
    ("lasso rmse:", lasso_best_rmse),
    ("gbr rmse: ", gbr_best_rmse),
    ("bagging rmse: ", bagging_best_rmse),
    ("tabnet rmse: ", tabnet_best_rmse),
    ("bayesian rmse: ", bayesian_best_rmse),
]
print("\n".join(
    f"{position},best {caption}{score}"
    for position, (caption, score) in enumerate(tuned_rmse_summary, start=1)
))


In [None]:
# # feature importance visualization
# # Random Forest feature importance
# rf_importance = pd.Series(rf_best.feature_importances_, index=X.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# rf_importance[:30].plot(kind='bar')
# plt.title('Random Forest top 10 feature importance')
# plt.show()

# # # XGBoost feature importance
# # xgb_importance = pd.Series(xgb_best.feature_importances_, index=X.columns).sort_values(ascending=False)
# # plt.figure(figsize=(10, 6))
# # xgb_importance[:30].plot(kind='bar')
# # plt.title('XGBoost top 10 feature importance')
# # plt.show()

# # LightGBM feature importance
# lgb_importance = pd.Series(lgb_best.feature_importances_, index=X.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# lgb_importance[:30].plot(kind='bar')
# plt.title('LightGBM top 10 feature importance')
# plt.show()

# # CatBoost feature importance
# catboost_importance = pd.Series(catboost_best.feature_importances_, index=X.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# catboost_importance[:30].plot(kind='bar')
# plt.title('CatBoost top 10 feature importance')
# plt.show()

# # ElasticNet feature importance
# enet_importance = pd.Series(np.abs(enet_best.coef_), index=X.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# enet_importance[:30].plot(kind='bar')
# plt.title('ElasticNet top 10 feature importance')
# plt.show()

# # Huber feature importance
# huber_importance = pd.Series(np.abs(huber_best.coef_), index=X.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# huber_importance[:30].plot(kind='bar')
# plt.title('Huber top 10 feature importance')
# plt.show()

# # MLP feature importance
# perm_importance = permutation_importance(mlp_best, X_val, y_val, n_repeats=10, random_state=42)
# mlp_importance = pd.Series(perm_importance.importances_mean, index=X.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# mlp_importance[:30].plot(kind='bar')
# plt.title('MLP top 10 feature importance')
# plt.show()

# # KernelRidge feature importance, no feature importance available

# # SVR feature importance, no feature importance available

# # # Feature importance for GradientBoosting
# # gbr_importance = pd.Series(gbr_best.feature_importances_, index=X.columns).sort_values(ascending=False)
# # plt.figure(figsize=(10, 6))
# # gbr_importance[:10].plot(kind='bar')
# # plt.title('GradientBoosting Top 10 Feature Importance')
# # plt.show()


In [None]:
# # Random Forest cross-validation
# rf_cv_scores = cross_val_score(rf_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# print(f"Random Forest cross-validation RMSE: {-rf_cv_scores.mean():.5f} (+/- {rf_cv_scores.std() * 2:.5f})")

# # # XGBoost cross-validation
# # xgb_cv_scores = cross_val_score(xgb_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# # print(f"XGBoost cross-validation RMSE: {-xgb_cv_scores.mean():.5f} (+/- {xgb_cv_scores.std() * 2:.5f})")

# # LightGBM cross-validation
# lgb_cv_scores = cross_val_score(lgb_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# print(f"LightGBM cross-validation RMSE: {-lgb_cv_scores.mean():.5f} (+/- {lgb_cv_scores.std() * 2:.5f})")

# # CatBoost cross-validation
# catboost_cv_scores = cross_val_score(catboost_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# print(f"CatBoost cross-validation RMSE: {-catboost_cv_scores.mean():.5f} (+/- {catboost_cv_scores.std() * 2:.5f})")

# # ElasticNet cross-validation
# enet_cv_scores = cross_val_score(enet_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# print(f"ElasticNet cross-validation RMSE: {-enet_cv_scores.mean():.5f} (+/- {enet_cv_scores.std() * 2:.5f})")

# # Huber cross-validation
# huber_cv_scores = cross_val_score(huber_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# print(f"Huber cross-validation RMSE: {-huber_cv_scores.mean():.5f} (+/- {huber_cv_scores.std() * 2:.5f})")

# # MLP cross-validation
# mlp_cv_scores = cross_val_score(mlp_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# print(f"MLP cross-validation RMSE: {-mlp_cv_scores.mean():.5f} (+/- {mlp_cv_scores.std() * 2:.5f})")

# # # KernelRidge cross-validation
# # kr_cv_scores = cross_val_score(kr_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# # print(f"KernelRidge cross-validation RMSE: {-kr_cv_scores.mean():.5f} (+/- {kr_cv_scores.std() * 2:.5f})")

# # # SVR cross-validation
# # svr_cv_scores = cross_val_score(svr_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# # print(f"SVR cross-validation RMSE: {-svr_cv_scores.mean():.5f} (+/- {svr_cv_scores.std() * 2:.5f})")

# # # Cross-validation
# # ridge_cv_scores = cross_val_score(ridge_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# # print(f"Ridge CV RMSE: {-ridge_cv_scores.mean():.5f} (+/- {ridge_cv_scores.std() * 2:.5f})")

# # lasso_cv_scores = cross_val_score(lasso_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# # print(f"Lasso CV RMSE: {-lasso_cv_scores.mean():.5f} (+/- {lasso_cv_scores.std() * 2:.5f})")

# # gbr_cv_scores = cross_val_score(gbr_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# # print(f"GradientBoosting CV RMSE: {-gbr_cv_scores.mean():.5f} (+/- {gbr_cv_scores.std() * 2:.5f})")

# # br_cv_scores = cross_val_score(bagging_best, X, y, cv=5, scoring='neg_root_mean_squared_error')
# # print(f"Bagging CV RMSE: {-br_cv_scores.mean():.5f} (+/- {br_cv_scores.std() * 2:.5f})")


In [None]:
# Predict on the test set with six of the tuned models.
# Test-set predictions get their own *_test_pred names so the validation
# predictions (enet_best_pred, huber_best_pred, mlp_best_pred) computed in the
# tuning cells are not overwritten.
rf_test_pred = rf_best.predict(X_test)
lgb_test_pred = lgb_best.predict(X_test)
enet_test_pred = enet_best.predict(X_test)
catboost_test_pred = catboost_best.predict(X_test)
huber_test_pred = huber_best.predict(X_test)
mlp_test_pred = mlp_best.predict(X_test)

# Simple ensemble: unweighted average of the six predictions (still on the
# scale the models were trained on).
final_pred_log = (catboost_test_pred + rf_test_pred + lgb_test_pred +
                  enet_test_pred + huber_test_pred + mlp_test_pred) / 6

# expm1 reverses the log transformation of the target.
final_pred = np.expm1(final_pred_log)

# The Ids come from test.csv while the features come from test_processed.csv;
# fail early with a clear message if the two do not have the same row count.
assert len(final_pred) == len(test_ID), "test predictions and test IDs differ in length"

# Save the submission file.
submission = pd.DataFrame({'Id': test_ID, 'SalePrice': final_pred})
submission.to_csv('submission_baseline_6Models.csv', index=False)

In [None]:
# Stacking ensemble: six tuned base learners combined by a Ridge meta-learner.
base_models = [
    ('rf', RandomForestRegressor(random_state=42, **rf_best_params)),
    ('lgb', LGBMRegressor(random_state=42, verbose=-1, **lgb_best_params)),
    ('catboost', CatBoostRegressor(random_state=42, verbose=0, **catboost_best_params)),
    ('enet', ElasticNet(random_state=42, **enet_best_params)),
    ('huber', HuberRegressor(**huber_best_params)),
    ('mlp', MLPRegressor(random_state=42, **mlp_best_params)),
]

# Meta-learner (final estimator) of the stack.
meta_learner = Ridge()
# Alternative that was tried first: an LGBMRegressor with ordinary parameters
# meta_learner = LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=3,
#                             num_leaves=15, random_state=42)

# Build the stacking model (5-fold internal CV for the meta-features).
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
)

# Fit on the training split, then score on the validation split.
stacking_model.fit(X_train, y_train)
stacking_pred = stacking_model.predict(X_val)
stacking_rmse = np.sqrt(mean_squared_error(y_val, stacking_pred))
print(f"Stacking RMSE: {stacking_rmse:.5f}")

# 5-fold cross-validation of the whole stack on the full training data.
stacking_cv_scores = cross_val_score(
    stacking_model, X, y,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
print(f"Stacking cross-validation RMSE: {-stacking_cv_scores.mean():.5f} (+/- {stacking_cv_scores.std() * 2:.5f})")

# Predict on the test set and undo the log transformation with expm1.
stacking_test_pred = np.expm1(stacking_model.predict(X_test))

# Save the submission file.
submission_stacking = pd.DataFrame({'Id': test_ID, 'SalePrice': stacking_test_pred})
submission_stacking.to_csv('submission_stacking_6Models.csv', index=False)