In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv(r"D:\ml-aws\data\processed\feature_engineered_train.csv")
eval_df  = pd.read_csv(r"D:\ml-aws\data\processed\feature_engineered_train.csv")
# Define target + features
target = "price"
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_eval, y_eval   = eval_df.drop(columns=[target]), eval_df[target]

print("Train shape:", X_train.shape)
print("Eval shape:", X_eval.shape)

Train shape: (576815, 39)
Eval shape: (576815, 39)


In [3]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    with mlflow.start_run(nested=True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_eval)
        rmse = float(np.sqrt(mean_squared_error(y_eval, y_pred)))
        mae = float(mean_absolute_error(y_eval, y_pred))
        r2 = float(r2_score(y_eval, y_pred))

        # Log hyperparameters + metrics
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    return rmse

In [4]:
# ==============================================
# 4. Run Optuna study with MLflow
# ==============================================
# Force MLflow to always use the root project mlruns folder
mlflow.set_tracking_uri("/Users/riadanas/Desktop/housing regression MLE/mlruns")
mlflow.set_experiment("xgboost_optuna_housing")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=15)

print("Best params:", study.best_trial.params)

2026/02/11 22:59:13 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_optuna_housing' does not exist. Creating a new experiment.
[32m[I 2026-02-11 22:59:13,488][0m A new study created in memory with name: no-name-cbb6290f-7c7e-4fa6-bb29-fd6b68177e7f[0m
[32m[I 2026-02-11 22:59:26,716][0m Trial 0 finished with value: 25267.539198348313 and parameters: {'n_estimators': 649, 'max_depth': 7, 'learning_rate': 0.03761524119751351, 'subsample': 0.6077277680351454, 'colsample_bytree': 0.6410314119260474, 'min_child_weight': 6, 'gamma': 3.879336166136092, 'reg_alpha': 0.8890852056921319, 'reg_lambda': 7.017384352647589e-05}. Best is trial 0 with value: 25267.539198348313.[0m
[32m[I 2026-02-11 22:59:39,604][0m Trial 1 finished with value: 31603.06281218251 and parameters: {'n_estimators': 597, 'max_depth': 8, 'learning_rate': 0.011585240305677827, 'subsample': 0.5271411323103765, 'colsample_bytree': 0.8205900210010453, 'min_child_weight': 3, 'gamma': 0.12266441025370278, 'reg_alp

Best params: {'n_estimators': 990, 'max_depth': 10, 'learning_rate': 0.2758088570741083, 'subsample': 0.9757618279101405, 'colsample_bytree': 0.8127077267068469, 'min_child_weight': 3, 'gamma': 1.120235440414238, 'reg_alpha': 0.00011966162219280389, 'reg_lambda': 9.508681661394189e-07}
