In [6]:
import numpy as np
import pandas as pd

In [7]:
# Load 'cleaned_data.csv' into a DataFrame; the relative path is resolved
# against the current working directory of the running kernel/process.
df = pd.read_csv('cleaned_data.csv')


In [9]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
import shap

def _mean_absolute_percentage_error(y_true, y_pred):
    """Return the MAPE in percent, skipping samples whose true value is zero."""
    # Flatten both inputs so a column-shaped target (n, 1) cannot broadcast
    # against the 1-D prediction vector into an (n, n) matrix.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # A zero target makes the relative error undefined (division by zero).
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')
    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100


def _resolve_feature_names(X, n_features):
    """Return the column labels of X, or generic names when X has none."""
    columns = getattr(X, 'columns', None)
    if columns is not None:
        return np.asarray(columns)
    # NumPy inputs (e.g. the output of a scaler) carry no column labels.
    return np.asarray([f'Feature {i}' for i in range(n_features)])


def _plot_hyperparameter_sensitivity(cv_results, param_grid):
    """Plot, per hyperparameter, the CV score averaged over all other settings."""
    scores = np.asarray(cv_results['mean_test_score'], dtype=float)
    for param, candidates in param_grid.items():
        tried = np.ma.getdata(cv_results[f'param_{param}'])
        # Average over every grid combination sharing the same value of this
        # hyperparameter; plotting the raw grid rows would draw one zig-zag
        # line through all combinations in search order.
        averaged = [
            scores[np.array([value == candidate for value in tried], dtype=bool)].mean()
            for candidate in candidates
        ]

        plt.figure(figsize=(10, 6))
        plt.plot(candidates, averaged, marker='o')
        plt.xlabel(param)
        plt.ylabel('Mean test score')
        plt.title(f'Hyperparameter Sensitivity: {param}')
        plt.show()


def _plot_feature_importance(importances, feature_names):
    """Draw a horizontal bar chart of importances, sorted ascending."""
    order = np.argsort(importances)
    positions = np.arange(order.shape[0]) + .5

    plt.figure(figsize=(12, 6))
    plt.barh(positions, importances[order], align='center')
    plt.yticks(positions, feature_names[order])
    plt.xlabel('Feature Importance')
    plt.title('XGBoost Feature Importance')
    plt.tight_layout()
    plt.show()


# Define a function to train the XGBoost model with hyperparameter tuning
def train_xgboost_with_metrics(X_train, y_train, X_test, y_test):
    """Tune an XGBoost regressor by grid search and report test-set metrics.

    Runs a 5-fold ``GridSearchCV`` (scored by negative mean squared error)
    over a fixed hyperparameter grid, evaluates the best estimator on the
    test set, prints the metrics, and shows three kinds of plots:
    per-hyperparameter sensitivity, the model's feature importances, and a
    SHAP bar summary.

    Args:
        X_train: Training features; a DataFrame or a NumPy array. When it has
            no ``columns`` attribute, generic ``Feature i`` labels are used
            in the plots.
        y_train: Training targets.
        X_test: Test features, with the same columns as ``X_train``.
        y_test: Test targets.

    Returns:
        A tuple ``(best_model, rmse, mae, mape)``. ``mape`` is in percent and
        is computed only over test samples whose true value is non-zero; it
        is ``nan`` when every true value is zero.
    """
    # Initialize the XGBoost regressor
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

    # Define a hyperparameter grid for tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
    }

    # Set up GridSearchCV with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1,
    )

    # Fit the model to the training data
    grid_search.fit(X_train, y_train)

    # Get the best estimator
    best_model = grid_search.best_estimator_

    # Make predictions on the test set
    y_pred_xgb = best_model.predict(X_test)

    # Calculate evaluation metrics
    rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
    mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
    mape_xgb = _mean_absolute_percentage_error(y_test, y_pred_xgb)

    # Print best parameters and evaluation metrics
    print("Best Parameters:", grid_search.best_params_)
    print("RMSE - XGBoost:", rmse_xgb)
    print("MAE - XGBoost:", mae_xgb)
    print("MAPE - XGBoost:", mape_xgb)

    # Hyperparameter Sensitivity Analysis
    _plot_hyperparameter_sensitivity(grid_search.cv_results_, param_grid)

    # Feature Importance
    feature_importance = best_model.feature_importances_
    feature_names = _resolve_feature_names(X_train, feature_importance.shape[0])
    _plot_feature_importance(feature_importance, feature_names)

    # SHAP Values for Feature Importance
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test)

    plt.figure(figsize=(12, 8))
    # Pass the names explicitly so both importance plots use the same labels
    # even when X_test is a plain array.
    shap.summary_plot(
        shap_values,
        X_test,
        feature_names=list(feature_names),
        plot_type="bar",
        show=False,
    )
    plt.title('SHAP Feature Importance')
    plt.tight_layout()
    plt.show()

    # Return the best model and metrics
    return best_model, rmse_xgb, mae_xgb, mape_xgb

# Example usage
# Standardize the features as before
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Train the XGBoost model with hyperparameter tuning and get metrics
# best_xgb_model, rmse_xgb, mae_xgb, mape_xgb = train_xgboost_with_metrics(X_train_scaled, y_train, X_test_scaled, y_test)
