In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap
import optuna


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import StackingRegressor
from scipy import stats
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import time
from sklearn.feature_selection import RFE
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import SelectKBest, f_regression

from xgboost import XGBRegressor

In [None]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)


# Scorer wrapper around rmsle; greater_is_better=False marks it as a loss.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [None]:
# Read both prepared tables, then take copies to work on.
test, train = pd.read_csv('good_test_data.csv'), pd.read_csv('good_train_data.csv')
test_df, train_df = test.copy(), train.copy()

In [None]:
# Target vector and predictor matrix from the training table;
# the test table is copied as-is.
y_train = train_df['SalePrice']
X_train = train_df.drop(columns='SalePrice')
X_test = test_df.copy()

In [None]:
# Feature groups come from the training frame's dtypes: 64-bit numeric columns
# (minus the target) and object columns. Labels are coerced to str.
numerical = (
    train_df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('SalePrice')
    .astype(str)
)
categorical = train_df.select_dtypes(include=['object']).columns.astype(str)

# The frames' own labels get the same str coercion so selection by the
# lists above compares like with like.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [None]:
# Learn per-column means on the training rows only, then fill missing
# numeric values in both frames with those training means.
num_imputer = SimpleImputer(strategy='mean').fit(X_train[numerical])
X_train_num = num_imputer.transform(X_train[numerical])
X_test_num = num_imputer.transform(X_test[numerical])

In [None]:
# Standardise the imputed numeric arrays; the scaling statistics are
# estimated on the training array and reused unchanged for the test array.
scaler = StandardScaler().fit(X_train_num)
X_train_num, X_test_num = scaler.transform(X_train_num), scaler.transform(X_test_num)

In [None]:
X_train_cat = X_train[categorical].copy()
X_test_cat = X_test[categorical].copy()

# Build each categorical dtype from the training column and reuse it for the
# matching test column. Casting the two frames independently gives each its own
# category list, so the same label can end up with a different integer code in
# train and in test; a model that consumes the codes (the XGBoost
# enable_categorical path used in this notebook) would then misread test rows.
# Labels that occur only in the test data are not in the training dtype and
# therefore become NaN.
for col in categorical:
    train_col = X_train[col].astype('category')
    X_train_cat[col] = train_col
    X_test_cat[col] = X_test[col].astype(train_col.dtype)

In [None]:
# Reassemble the design matrices: the processed numeric arrays get their
# column labels and the frame's row index back, and the categorical columns
# are appended to the right.
X_train = pd.concat(
    [pd.DataFrame(X_train_num, index=X_train.index, columns=numerical), X_train_cat],
    axis='columns',
)
X_test = pd.concat(
    [pd.DataFrame(X_test_num, index=X_test.index, columns=numerical), X_test_cat],
    axis='columns',
)

In [None]:
# Hold out 35% of the training rows; the remaining rows are what the
# hyperparameter search cross-validates on.
X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = train_test_split(
    X_train,
    y_train,
    test_size=0.35,
    random_state=42,
)


def objective(trial):
    """Optuna objective: 3-fold cross-validated RMSE of an XGBoost regressor.

    The error is computed on the scale the target is stored in. The submission
    step of this notebook applies ``np.expm1`` to the model's predictions,
    which implies ``SalePrice`` is already log1p-transformed (TODO: confirm
    against the data-preparation step). On that scale plain RMSE *is* the
    RMSLE of the raw prices, whereas ``neg_mean_squared_log_error`` would take
    the logarithm a second time and optimise a different quantity.

    Reads the module-level ``X_train_xgb`` / ``y_train_xgb`` split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Square root of the mean per-fold MSE (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
    }
    model = XGBRegressor(
        **params,
        tree_method='hist',
        enable_categorical=True,
        random_state=42,
    )

    # Fixed, shuffled folds so every trial is evaluated on identical splits.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(
        model,
        X_train_xgb,
        y_train_xgb,
        scoring='neg_mean_squared_error',
        cv=kf,
    )

    # The scorer returns negated MSE; flip the sign before taking the root.
    return np.sqrt(-scores.mean())

In [None]:
# [I 2025-04-14 21:39:02,460] Trial 8 finished with value: 0.009802669320849998 and parameters:
#  {'n_estimators': 445, 'max_depth': 4, 'learning_rate': 0.020088633268618115, 'subsample': 0.6473813619539747,
# 'colsample_bytree': 0.6200060506685562, 'reg_alpha': 0.18429363114878394, 'reg_lambda': 1.8368057913572426}. Best is trial 8 with value: 0.009802669320849998.

# [I 2025-04-14 21:27:16,165] Trial 3 finished with value: 0.01035550806109333 and parameters: {'n_estimators': 705, 'max_depth': 8, 'learning_rate': 0.022352444872233174, 'subsample': 0.7579608112936713, 'colsample_bytree': 0.5059629117372366, 'reg_alpha': 0.5815255197371083, 'reg_lambda': 1.002548087898375}. Best is trial 3 with value: 0.01035550806109333.


In [None]:
# [I 2025-04-15 13:58:14,738] A new study created in memory with name: no-name-22524862-096e-4a9c-9127-9065231128c9
# [I 2025-04-15 17:24:36,091] Trial 38 finished with value: 0.010166797089021996 and parameters: {'n_estimators': 666, 'max_depth': 3, 'learning_rate': 0.040836959358773406, 'subsample': 0.6579759766023049, 'colsample_bytree': 0.5524209398644827, 'reg_alpha': 0.08762388495907977, 'reg_lambda': 0.1879069508548567}. Best is trial 38 with value: 0.010166797089021996.


In [None]:
# Seed the sampler so repeated runs propose the same sequence of trials;
# an unseeded sampler explores different hyperparameters on every run.
study = optuna.create_study(
    direction='minimize',  # the objective is an error, so lower is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40)

[I 2025-04-15 13:58:14,738] A new study created in memory with name: no-name-22524862-096e-4a9c-9127-9065231128c9
[I 2025-04-15 14:02:42,656] Trial 0 finished with value: 0.011121550919978676 and parameters: {'n_estimators': 743, 'max_depth': 10, 'learning_rate': 0.12113035681058505, 'subsample': 0.6105988909213059, 'colsample_bytree': 0.6522140815463386, 'reg_alpha': 0.5023539756494094, 'reg_lambda': 1.0448421665048249}. Best is trial 0 with value: 0.011121550919978676.
[I 2025-04-15 14:07:57,474] Trial 1 finished with value: 0.011232933533692304 and parameters: {'n_estimators': 873, 'max_depth': 10, 'learning_rate': 0.09763370321487774, 'subsample': 0.794955761538382, 'colsample_bytree': 0.6859506519035343, 'reg_alpha': 0.20836206741687768, 'reg_lambda': 0.7584568687160074}. Best is trial 0 with value: 0.011121550919978676.
[I 2025-04-15 14:11:50,691] Trial 2 finished with value: 0.011471285174828922 and parameters: {'n_estimators': 904, 'max_depth': 5, 'learning_rate': 0.24264257571

In [None]:
# Report the best objective value of the study and the parameters behind it.
for label, found in (("Best RMSLE:", study.best_value),
                     ("Best parameters:", study.best_params)):
    print(label, found)

NameError: name 'study' is not defined

In [None]:
best_params = study.best_params

# Rebuild the estimator exactly as it was configured during tuning:
# the objective evaluated XGBRegressor with tree_method='hist' and
# enable_categorical=True, so the tuned parameters are only meaningful
# (and category-dtype columns only accepted) with the same settings here.
best_model = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    enable_categorical=True,
    random_state=42,
    **best_params
)

In [None]:
# Level-0 learners of the stack as (name, estimator) pairs:
# the tuned XGBoost model, a ridge regression and a random forest.
base_models = list({
    'xgb': best_model,
    'ridge': RidgeCV(alphas=[0.1, 1.0, 10.0]),
    'rf': RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
}.items())

In [None]:
# Final estimator of the stack: ridge regression choosing among three alphas.
meta_model = RidgeCV(
    alphas=[0.1, 1.0, 10.0],
)

In [None]:
# Stacked ensemble: base models feed the ridge meta-model.
stack_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,              # folds used internally when fitting the stack
    passthrough=True,  # original features are passed to the meta-model too
    n_jobs=-1,
)

In [None]:
# Step 1: SelectKBest — keep the (up to) 1000 features with the highest
# univariate F-score. k is capped at the column count because SelectKBest
# rejects a k larger than the number of features.
skb = SelectKBest(score_func=f_regression, k=min(1000, X_train.shape[1]))
X_kbest = skb.fit_transform(X_train, y_train)
kbest_features = X_train.columns[skb.get_support()]

# Step 2: XGBoost importance — top 100 features ranked by the tuned tree model.
# feature_importances_ only exists on a fitted estimator, so fit it first.
xgb = best_model
xgb.fit(X_train, y_train)
xgb_top = X_train.columns[np.argsort(xgb.feature_importances_)[-100:]]

# Step 3: Intersection, kept in the original column order. Going through
# list(set & set) would make the column order depend on string hashing and
# therefore vary between kernel sessions.
keep_mask = X_train.columns.isin(kbest_features) & X_train.columns.isin(xgb_top)
selected_features = list(X_train.columns[keep_mask])
X_trimmed = X_train[selected_features]
X_test_trimmed = X_test[selected_features]

In [None]:
# Number of features present in both the SelectKBest set and the XGBoost top set.
len(selected_features)

71

In [None]:
# Row identifiers for the submission come from the original test file.
test_id = pd.read_csv('test.csv').loc[:, 'Id']

In [None]:
# final_model is the stacked ensemble itself (same object, not a copy);
# train it on the reduced feature set.
final_model = stack_model
final_model.fit(X=X_trimmed, y=y_train)

In [None]:
# Predict on the test rows, restricted to the same selected features used for fitting.
final_preds = final_model.predict(X=X_test_trimmed)

In [None]:
# Two-column submission table: the ids first, then the model output.
submission = pd.DataFrame({'Id': test_id}).assign(SalePrice=final_preds)

In [None]:
# expm1 undoes a log1p transform of the target, mapping predictions back to
# prices. The column is derived from final_preds rather than from itself so
# that the cell is idempotent: re-running it does not apply expm1 a second
# time to values that were already converted.
submission['SalePrice'] = np.expm1(final_preds)

submission.to_csv("submission.csv", index=False)
print("Submission file created with XGBoost predictions!")

Submission file created with XGBoost predictions!
