In [1]:
# XGBoost Hyperparameter Tuning Notebook
# Filename: xgboost_hyperparameter_tuning.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, KFold
import pickle
import os

# Load the preprocessed data
X_train = pd.read_csv(r"../preprocessing/X_train.csv")
X_test = pd.read_csv(r"../preprocessing/X_test.csv")
y_train = pd.read_csv(r"../preprocessing/y_train.csv").squeeze()
y_test = pd.read_csv(r"../preprocessing/y_test.csv").squeeze()

# Ensure all features are numeric by dropping non-numeric columns
non_numeric_columns = X_train.select_dtypes(include=['object']).columns
X_train = X_train.drop(columns=non_numeric_columns)
X_test = X_test.drop(columns=non_numeric_columns)

# 1. Perform Advanced Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2, 0.5],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [1, 1.5, 2],
    'scale_pos_weight': [1, 2, 3],  # Additional hyperparameter
    'max_delta_step': [0, 1, 5]     # Additional hyperparameter
}

# Initialize the XGBoost model
xgboost_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Set up cross-validation with a large number of folds
kfold = KFold(n_splits=20, random_state=42, shuffle=True)

# Initialize RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=xgboost_model,
    param_distributions=param_grid,
    n_iter=200,  # Increased number of iterations
    scoring='neg_mean_squared_error',
    cv=30,  # Increased number of folds
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search.fit(X_train, y_train)

# Get the best model from random search
best_xgboost_params = random_search.best_params_

# Refit the best model without using early stopping
best_xgboost_model = xgb.XGBRegressor(**best_xgboost_params, objective='reg:squarederror', random_state=42)
best_xgboost_model.fit(X_train, y_train)

# Save the model with tuning
model_with_tuning_path = r"../models/xgboost_model_with_tuning.pkl"
os.makedirs(os.path.dirname(model_with_tuning_path), exist_ok=True)

with open(model_with_tuning_path, 'wb') as file:
    pickle.dump(best_xgboost_model, file)

print(f"XGBoost model with hyperparameter tuning saved to {model_with_tuning_path}")

# Evaluate the tuned model
y_pred_with_tuning = best_xgboost_model.predict(X_test)
r2_with_tuning = r2_score(y_test, y_pred_with_tuning)
rmse_with_tuning = np.sqrt(mean_squared_error(y_test, y_pred_with_tuning))
mae_with_tuning = mean_absolute_error(y_test, y_pred_with_tuning)

print("\nEvaluation of XGBoost model with advanced tuning:")
print(f"R2 Score: {r2_with_tuning}")
print(f"RMSE: {rmse_with_tuning}")
print(f"MAE: {mae_with_tuning}")

Fitting 30 folds for each of 200 candidates, totalling 6000 fits
XGBoost model with hyperparameter tuning saved to ../models/xgboost_model_with_tuning.pkl

Evaluation of XGBoost model with advanced tuning:
R2 Score: 0.9973522639270446
RMSE: 0.4763265828015646
MAE: 0.28919944016635424


: 