In [1]:
import xgboost as xgb
import pandas as pd 
print(xgb.__version__)

3.1.1


In [2]:
train_df = pd.read_csv(r"C:\Users\hites\OneDrive\Desktop\Housing Regression\data\processed\feature_eng_train_df.csv")
eval_df = pd.read_csv(r"C:\Users\hites\OneDrive\Desktop\Housing Regression\data\processed\feature_eng_eval_df.csv")

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
target = "price"
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_eval, y_eval   = eval_df.drop(columns=[target]), eval_df[target]

print("Train shape:", X_train.shape)
print("Eval shape:", X_eval.shape)

Train shape: (576815, 40)
Eval shape: (149423, 40)


In [6]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    with mlflow.start_run(nested = True):
        model = XGBRegressor(**params) 
        model.fit(X_train, y_train)

        y_pred = model.predict(X_eval)
        rmse = float(np.sqrt(mean_squared_error(y_eval, y_pred)))
        mae = float(mean_absolute_error(y_eval, y_pred))
        r2 = float(r2_score(y_eval, y_pred))

        # Log Hyperparameters + metrics 
        mlflow.log_params(params) 
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    return rmse

In [8]:
mlflow.set_tracking_uri("/mlruns")
mlflow.set_experiment("xgboost_optuna_housing")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials = 15)

print(f"Best Params: {study.best_trial.params}")

[I 2025-11-16 17:36:42,575] A new study created in memory with name: no-name-8636bfbe-bf11-4d08-9651-9ac2a372922a
[I 2025-11-16 17:37:04,804] Trial 0 finished with value: 100785.88956685315 and parameters: {'n_estimators': 423, 'max_depth': 8, 'learning_rate': 0.2428896031770554, 'subsample': 0.618186831187229, 'colsample_bytree': 0.6872307603928465, 'min_child_weight': 7, 'gamma': 1.1499686176254242, 'reg_alpha': 0.5962816265206449, 'reg_lambda': 0.00011946650557241467}. Best is trial 0 with value: 100785.88956685315.
[I 2025-11-16 17:37:21,559] Trial 1 finished with value: 99818.42396035172 and parameters: {'n_estimators': 731, 'max_depth': 4, 'learning_rate': 0.017116089554106394, 'subsample': 0.7833336466182129, 'colsample_bytree': 0.713699948090093, 'min_child_weight': 10, 'gamma': 1.8692119197581658, 'reg_alpha': 1.8625898136312494e-08, 'reg_lambda': 7.781484448889172}. Best is trial 1 with value: 99818.42396035172.
[I 2025-11-16 17:37:34,873] Trial 2 finished with value: 95496.4

Best Params: {'n_estimators': 650, 'max_depth': 10, 'learning_rate': 0.010186140363475859, 'subsample': 0.5108055538588381, 'colsample_bytree': 0.5573118164958445, 'min_child_weight': 8, 'gamma': 4.866563929970948, 'reg_alpha': 2.998728066283549e-05, 'reg_lambda': 1.6712896989394253e-06}


In [10]:
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params) 
best_model.fit(X_train, y_train) 

y_pred = best_model.predict(X_eval)

mae = mean_absolute_error(y_eval, y_pred)
rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
r2 = r2_score(y_eval, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

# Logging final Model 
with mlflow.start_run(run_name = "Best_XGBoost_Model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.xgboost.log_model(best_model, name="model")

Final tuned model performance:
MAE: 46707.542252267405
RMSE: 87429.96808980509
R²: 0.9408526516191364


