In [1]:
import pandas as pd 
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv("/Users/larry/house-price-regression/data/processed/train_engineered.csv")
eval_df = pd.read_csv("/Users/larry/house-price-regression/data/processed/eval_engineered.csv")

In [3]:
target = "price"
X_train = train_df.drop(columns=[target])
X_eval = eval_df.drop(columns=[target])
y_train = train_df[target]
y_eval = eval_df[target]

In [4]:
def objective(trial: optuna.trial.Trial) -> float:
    """
    Objective function for Optuna hyperparameter optimization.
    This function trains an XGBoost regressor with the given hyperparameters, evaluates it on the evaluation set, and logs the results to MLflow.
    The function returns the RMSE of the model on the evaluation set, which Optuna will try to minimize.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log = True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "random_state": 42,
        "tree_method": "hist",
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
        "n_jobs": -1
    }
    
    with mlflow.start_run(nested = True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_eval)
        mae = float(mean_absolute_error(y_eval, y_pred))
        rmse = float(np.sqrt(mean_squared_error(y_eval, y_pred)))
        r2 = float(r2_score(y_eval, y_pred))

        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    return rmse

In [5]:
# Optuna + MLFlow

mlflow.set_tracking_uri("/Users/larry/house-price-regression/mlflow")
mlflow.set_experiment("house_price_regression_optuna")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=35)

print("Best trial:" )
print(study.best_trial.params)

  return FileStore(store_uri, store_uri)
[32m[I 2026-02-23 11:08:14,362][0m A new study created in memory with name: no-name-4367feb7-2228-479c-9120-0c169f7cec91[0m
[32m[I 2026-02-23 11:08:34,533][0m Trial 0 finished with value: 73757.60777200977 and parameters: {'n_estimators': 740, 'max_depth': 6, 'learning_rate': 0.20459074245190145, 'subsample': 0.8238494047737244, 'colsample_bytree': 0.735624737201323, 'gamma': 4.223504865271629, 'reg_alpha': 0.9062554116000382, 'reg_lambda': 4.802159656028052, 'min_child_weight': 3.0729343097663557}. Best is trial 0 with value: 73757.60777200977.[0m
[32m[I 2026-02-23 11:08:40,653][0m Trial 1 finished with value: 77244.1782994062 and parameters: {'n_estimators': 183, 'max_depth': 6, 'learning_rate': 0.28205068457398075, 'subsample': 0.5888613553796408, 'colsample_bytree': 0.600570750398381, 'gamma': 2.0101056981562153, 'reg_alpha': 0.8369558035125818, 'reg_lambda': 2.021061125526104, 'min_child_weight': 1.8294782667681333}. Best is trial 0

Best trial:
{'n_estimators': 801, 'max_depth': 8, 'learning_rate': 0.04569655125726923, 'subsample': 0.7031153968891845, 'colsample_bytree': 0.5653890699255735, 'gamma': 0.12706891296390732, 'reg_alpha': 3.5186934638884626, 'reg_lambda': 2.373326753596462, 'min_child_weight': 4.090265045915563}


In [7]:
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_eval)

mae = mean_absolute_error(y_eval, y_pred)
rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
r2 = r2_score(y_eval, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)


Final tuned model performance:
MAE: 31089.364801483374
RMSE: 69037.79421181201
R²: 0.9631672754470617
