In [1]:
# Importing necessary libraries
import os

import joblib
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Tune an XGBoost regressor with an exhaustive grid search, report its
# performance on the held-out test split, and persist the best model.

# Where the tuned model is written at the end of the run.
MODEL_PATH = 'models/xgboost_model.pkl'

# Create the output directory up front: the search below runs
# 3**5 = 243 parameter combinations x 5 folds = 1,215 fits, and discovering a
# missing/unwritable directory only at the final dump would waste all of them.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Load the preprocessed train/test split.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load split files produced by this project's own pipeline.
X_train, X_test, y_train, y_test = joblib.load('data/split_data.pkl')

# Base estimator; the fixed seed keeps the row/column subsampling reproducible.
model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter grid to search (every combination is evaluated).
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.05],  # Step-size shrinkage per round
    'max_depth': [3, 4, 5],  # Maximum depth of a tree
    'subsample': [0.8, 0.9, 1.0],  # Fraction of training rows sampled per tree
    'colsample_bytree': [0.8, 0.9, 1.0],  # Fraction of columns sampled per tree
}

# 5-fold cross-validated search; candidates are ranked by mean R² across folds.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')

# Run the search on the training split only, so the test split stays unseen.
grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# configuration retrained on the whole training split.
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test set.
y_pred = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred)
r2_best = r2_score(y_test, y_pred)

print(f"Best XGBoost Regressor - Mean Squared Error: {mse_best}")
print(f"Best XGBoost Regressor - R² Score: {r2_best}")
print(f"Best Parameters: {grid_search.best_params_}")

# Save the best model. Kept as the final expression so a notebook cell still
# echoes the list of written file names returned by joblib.dump.
joblib.dump(best_model, MODEL_PATH)


  


Best XGBoost Regressor - Mean Squared Error: 1.5439113995033784
Best XGBoost Regressor - R² Score: 0.9257550239562988
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}


['models/xgboost_model.pkl']