In [1]:
import xgboost as xgb
print(xgb.__version__)

3.0.4


In [2]:
import sys, xgboost as xgb
print(sys.executable)        # should point to .../.venv/bin/python
print(xgb.__version__)       # should print 3.0.4
print(xgb.__file__)          # should live under .../.venv/...

e:\ML-projects\Regression_ML_EndtoEnd\.venv\Scripts\python.exe
3.0.4
e:\ML-projects\Regression_ML_EndtoEnd\.venv\Lib\site-packages\xgboost\__init__.py


In [3]:
# ==============================================
# 1. Imports
# ==============================================
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ==============================================
# 2. Load processed datasets
# ==============================================
train_df = pd.read_csv("E:/ML-projects/Regression_ML_EndtoEnd/data/processed/feature_engineered_train.csv")
eval_df  = pd.read_csv("E:/ML-projects/Regression_ML_EndtoEnd/data/processed/feature_engineered_eval.csv")


# Define target + features
target = "price"
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_eval, y_eval   = eval_df.drop(columns=[target]), eval_df[target]

print("Train shape:", X_train.shape)
print("Eval shape:", X_eval.shape)

Train shape: (585199, 41)
Eval shape: (149423, 41)


In [22]:

# Clean data for XGBoost inside Optuna
X_train_clean = X_train.select_dtypes(include=['int', 'float', 'bool']).copy().fillna(0)
X_eval_clean  = X_eval.select_dtypes(include=['int', 'float', 'bool']).copy().fillna(0)

# ensure alignment (if eval lacks some one-hot cols)
X_eval_clean = X_eval_clean.reindex(columns=X_train_clean.columns, fill_value=0)

# convert targets to numeric arrays
y_train_arr = pd.to_numeric(y_train, errors='coerce').fillna(0).values
y_eval_arr  = pd.to_numeric(y_eval,  errors='coerce').fillna(0).values

# convert features to numpy arrays (safe for XGBoost)
X_train_arr = X_train_clean.values.astype(float)
X_eval_arr  = X_eval_clean.values.astype(float)


In [21]:
def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    with mlflow.start_run(nested=True):
        model = XGBRegressor(**params)
        model.fit(X_train_arr, y_train_arr)

        y_pred = model.predict(X_eval_arr)
        rmse = float(np.sqrt(mean_squared_error(y_eval_arr, y_pred)))
        mae = float(mean_absolute_error(y_eval_arr, y_pred))
        r2 = float(r2_score(y_eval_arr, y_pred))

        # Log hyperparameters + metrics
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    return rmse


In [17]:
# ==============================================
# 4. Run Optuna study with MLflow
# ==============================================
# Force MLflow to always use the root project mlruns folder
mlflow.set_tracking_uri("file:///E:/ML-projects/Regression_ML_EndtoEnd/mlruns")
mlflow.set_experiment("xgboost_optuna_housing")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=15)

print("Best params:", study.best_trial.params)

[I 2025-12-04 18:55:05,360] A new study created in memory with name: no-name-ce8f4219-2c9b-4996-8148-7f41035779d1
[I 2025-12-04 18:55:59,777] Trial 0 finished with value: 71841.890161622 and parameters: {'n_estimators': 883, 'max_depth': 8, 'learning_rate': 0.17291816398468443, 'subsample': 0.8250132462170008, 'colsample_bytree': 0.7544216655956326, 'min_child_weight': 10, 'gamma': 0.5485407744160431, 'reg_alpha': 0.0010002124038078224, 'reg_lambda': 2.183273615473709e-08}. Best is trial 0 with value: 71841.890161622.
[I 2025-12-04 18:56:19,177] Trial 1 finished with value: 74230.75463324043 and parameters: {'n_estimators': 373, 'max_depth': 6, 'learning_rate': 0.06220731239227764, 'subsample': 0.9912734435193371, 'colsample_bytree': 0.8463635211126092, 'min_child_weight': 6, 'gamma': 2.4520268007858324, 'reg_alpha': 8.135632749590608e-07, 'reg_lambda': 5.048783650311177e-08}. Best is trial 0 with value: 71841.890161622.
[I 2025-12-04 18:57:33,647] Trial 2 finished with value: 80178.97

Best params: {'n_estimators': 972, 'max_depth': 8, 'learning_rate': 0.02493391530771888, 'subsample': 0.652047482111805, 'colsample_bytree': 0.7182346732160855, 'min_child_weight': 10, 'gamma': 0.6021729493091406, 'reg_alpha': 0.00036396077285295815, 'reg_lambda': 0.025420258726613153}


In [25]:
# ==============================================
# 5. Train final model with best params and log to MLflow
# ==============================================
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train_arr, y_train_arr)

y_pred = best_model.predict(X_eval_arr)

mae = mean_absolute_error(y_eval_arr, y_pred)
rmse = np.sqrt(mean_squared_error(y_eval_arr, y_pred))
r2 = r2_score(y_eval_arr, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

# Log final model
with mlflow.start_run(run_name="best_xgboost_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.xgboost.log_model(best_model, name="model")

Final tuned model performance:
MAE: 30997.338523443355
RMSE: 69906.80442135924
R²: 0.9621859198715181


  self.get_booster().save_model(fname)
