In [9]:
import pandas as pd 
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

In [10]:
train_df = pd.read_csv("/Users/larry/house-price-regression/data/processed/train_engineered.csv")
eval_df = pd.read_csv("/Users/larry/house-price-regression/data/processed/eval_engineered.csv")

In [11]:
target = "price"
X_train = train_df.drop(columns=[target])
X_eval = eval_df.drop(columns=[target])
y_train = train_df[target]
y_eval = eval_df[target]

In [14]:
def objective(trial: optuna.trial.Trial) -> float:
    """
    Objective function for Optuna hyperparameter optimization.
    This function trains an XGBoost regressor with the given hyperparameters, evaluates it on the evaluation set, and logs the results to MLflow.
    The function returns the RMSE of the model on the evaluation set, which Optuna will try to minimize.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log = True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "random_state": 42,
        "tree_method": "hist",
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
        "n_jobs": -1
    }
    
    with mlflow.start_run(nested = True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_eval)
        mae = float(mean_absolute_error(y_eval, y_pred))
        rmse = float(np.sqrt(mean_squared_error(y_eval, y_pred)))
        r2 = float(r2_score(y_eval, y_pred))

        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    return rmse

In [15]:
# Optuna + MLFlow

mlflow.set_tracking_uri("/Users/larry/house-price-regression/mlflow")
mlflow.set_experiment("house_price_regression_optuna")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

print("Best trial:" )
print(study.best_trial.params)

[32m[I 2026-02-20 17:14:35,131][0m A new study created in memory with name: no-name-d08212ed-256d-4c63-98f4-945cdf38d911[0m


[32m[I 2026-02-20 17:14:41,254][0m Trial 0 finished with value: 78564.48250906603 and parameters: {'n_estimators': 667, 'max_depth': 3, 'learning_rate': 0.0683316447167705, 'subsample': 0.8688142194261168, 'colsample_bytree': 0.7523465171617614, 'gamma': 0.3436537884562979, 'reg_alpha': 3.372357974408855, 'reg_lambda': 3.7548112717036197, 'min_child_weight': 4.858028849395967}. Best is trial 0 with value: 78564.48250906603.[0m
[32m[I 2026-02-20 17:14:44,864][0m Trial 1 finished with value: 76183.64237239368 and parameters: {'n_estimators': 330, 'max_depth': 4, 'learning_rate': 0.2448525663016186, 'subsample': 0.8195960460302432, 'colsample_bytree': 0.7409289505049553, 'gamma': 3.2289121510274903, 'reg_alpha': 3.016756831448149, 'reg_lambda': 0.9676052433228804, 'min_child_weight': 6.9223171236559295}. Best is trial 1 with value: 76183.64237239368.[0m
[32m[I 2026-02-20 17:14:49,380][0m Trial 2 finished with value: 78236.3986717249 and parameters: {'n_estimators': 360, 'max_depth

Best trial:
{'n_estimators': 949, 'max_depth': 8, 'learning_rate': 0.030075676971411756, 'subsample': 0.6320498350557924, 'colsample_bytree': 0.5885424397486171, 'gamma': 1.1525462688143004, 'reg_alpha': 4.055589717170902, 'reg_lambda': 4.002950778667765, 'min_child_weight': 3.513791554991248}


In [18]:
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_eval)

mae = mean_absolute_error(y_eval, y_pred)
rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
r2 = r2_score(y_eval, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)


Final tuned model performance:
MAE: 31024.984533672705
RMSE: 68972.73283283725
R²: 0.9632366652268888
