In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

# Lift pandas' display limits so frames are rendered without truncation.
pd.options.display.max_rows = None            # no cap on the number of rows shown
pd.options.display.max_columns = None         # no cap on the number of columns shown
pd.options.display.max_colwidth = None        # show full cell text (no "..." shortening)
pd.options.display.expand_frame_repr = False  # keep wide frames on one line, no wrapping

In [14]:
# Read the two prepared health-insurance CSVs into separate frames.
# NOTE(review): the "_OHE" / "_LE" suffixes presumably mean one-hot-encoded and
# label-encoded variants of the same data - confirm against the preprocessing step.
ohe_csv_path = '../DataSet/RegressionData/healthinsurance_OHE.csv'
le_csv_path = '../DataSet/RegressionData/healthinsurance_LE.csv'

df_LE = pd.read_csv(le_csv_path)
df_OHE = pd.read_csv(ohe_csv_path)

In [15]:
# Tune an XGBoost regressor on the df_LE frame with a randomized hyperparameter
# search, then report R² and MAPE of the best model on the train and test splits.

# Predictors are every column except 'claim'; 'claim' is the regression target.
X = df_LE.drop('claim', axis=1)
y = df_LE['claim']

# Split data (80% train, 20% test); fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define XGBoost model.
# n_jobs=1 on purpose: parallelism is handled by RandomizedSearchCV below.
# Using n_jobs=-1 at both levels makes every parallel CV fit spawn its own
# full-width thread pool, and the resulting thread contention slows the search.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

# Candidate values sampled by the randomized search.
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400],  # Number of boosting rounds
    'max_depth': [3, 5, 7, 10, 15],  # Tree depth; the deep end (10-15) can overfit, so compare train vs. test metrics
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size for boosting
    'subsample': [0.6, 0.8, 1.0],  # Fraction of data used per boosting round
    'colsample_bytree': [0.5, 0.7, 1.0],  # Fraction of features used per tree
    'gamma': [0, 0.01, 0.1, 0.2, 0.5],  # Minimum loss reduction to split
    'reg_alpha': [0, 0.01, 0.1, 1.0],  # L1 Regularization (Lasso)
    'reg_lambda': [1, 2, 5, 10],  # L2 Regularization (Ridge)
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=200,  # Number of parameter combinations to try
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Run the candidate fits in parallel on all available cores
    random_state=42,  # Ensure reproducibility of the sampled candidates
    verbose=3,  # Show intermediate progress
    scoring='r2',  # R² as the scoring metric
)

# Fit RandomizedSearchCV on training data only; the test split stays untouched
# until the final evaluation below.
random_search.fit(X_train, y_train)


def _score_split(model, features, target):
    """Predict on one data split and score the predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : predictor matrix for the split.
    target : true target values for the split.

    Returns
    -------
    tuple
        ``(predictions, r2, mape_percent)`` where ``mape_percent`` is the mean
        absolute percentage error multiplied by 100.
    """
    predictions = model.predict(features)
    r2 = r2_score(target, predictions)
    mape_percent = mean_absolute_percentage_error(target, predictions) * 100  # Convert to %
    return predictions, r2, mape_percent


# Best candidate from the search, as exposed by `best_estimator_`.
best_model = random_search.best_estimator_

# Predictions and metrics on training set
y_train_pred, r2_train, mape_train = _score_split(best_model, X_train, y_train)

# Predictions and metrics on test set
y_test_pred, r2_test, mape_test = _score_split(best_model, X_test, y_test)

# Print results
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)
print("Train set R² score:", r2_train)
print(f"Train set MAPE: {mape_train:.2f}%")
print("Test set R² score:", r2_test)
print(f"Test set MAPE: {mape_test:.2f}%")


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters found: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.7}
Best cross-validation score: 0.9751970291137695
Train set R² score: 0.9999998807907104
Train set MAPE: 0.03%
Test set R² score: 0.9734224081039429
Test set MAPE: 3.52%


In [16]:
# Rank every sampled candidate by its CV rank (best first) and keep only the
# parameter set plus the mean / standard deviation of its test-fold scores.
results = (
    pd.DataFrame(random_search.cv_results_)
    .sort_values(by='rank_test_score')
    .loc[:, ['params', 'mean_test_score', 'std_test_score']]
)

# Display the ten best-ranked candidates.
results.head(10)

Unnamed: 0,params,mean_test_score,std_test_score
48,"{'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.7}",0.975197,0.004022
160,"{'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.7}",0.975184,0.00403
70,"{'subsample': 0.8, 'reg_lambda': 10, 'reg_alpha': 0.01, 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.7}",0.974788,0.004406
141,"{'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0.01, 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.7}",0.974761,0.00395
179,"{'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.7}",0.974757,0.003956
42,"{'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0.01, 'n_estimators': 300, 'max_depth': 15, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.7}",0.974733,0.004451
166,"{'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.7}",0.97429,0.004695
124,"{'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 1.0, 'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.7}",0.97406,0.004316
83,"{'subsample': 0.6, 'reg_lambda': 10, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 15, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.5}",0.973716,0.004229
158,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.01, 'n_estimators': 300, 'max_depth': 15, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 0.7}",0.973619,0.003774
