In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, make_scorer, mean_absolute_percentage_error

# Lift pandas' display limits so frames print in full: every row, every
# column, untruncated cell text, and no wrapping of wide frames across lines.
# set_option accepts several (pattern, value) pairs in a single call.
pd.set_option(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.max_colwidth", None,
    "display.expand_frame_repr", False,
)

In [5]:
# Read both pre-encoded versions of the health-insurance data in one pass:
# first the file suffixed _OHE, then the one suffixed _LE.
df_OHE, df_LE = map(
    pd.read_csv,
    (
        '../DataSet/RegressionData/healthinsurance_OHE.csv',
        '../DataSet/RegressionData/healthinsurance_LE.csv',
    ),
)

In [6]:
# Features are every column of df_LE except 'claim'; 'claim' is the
# regression target.
X = df_LE.drop('claim', axis=1)
y = df_LE['claim']

# Split data (80% train, 20% test); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define XGBoost model.
# n_jobs=1 on the estimator is deliberate: RandomizedSearchCV below already
# runs its fits in parallel on all cores (n_jobs=-1). If every search worker
# also started one XGBoost thread per core, the threads would contend for the
# CPU and slow the whole search down.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

# Search space: 4 * 3**7 = 8748 distinct combinations, of which the search
# below samples n_iter=1000.
param_dist = {
    'n_estimators': [300, 400, 500, 700],  # Number of boosted trees
    'max_depth': [7, 10, 12],  # Maximum depth of each tree
    'learning_rate': [0.05, 0.1, 0.15],  # Shrinkage applied per boosting step
    'subsample': [0.8, 0.9, 1.0],  # Fraction of rows sampled per tree
    'colsample_bytree': [0.7, 0.8, 0.9],  # Fraction of features sampled per tree
    'gamma': [0.1, 0.2, 0.3],  # Minimum loss reduction required to split
    'reg_alpha': [0.05, 0.1, 0.5],  # L1 regularization strength
    'reg_lambda': [2, 5, 7],  # L2 regularization strength
    'tree_method': ['hist'],  # Histogram-based tree construction
}

# greater_is_better=False makes the scorer return the *negated* MAPE so the
# search can maximise it. Consequently best_score_ and mean_test_score are
# negative fractions (e.g. -0.032 means a 3.2% mean absolute percentage error).
# NOTE(review): MAPE is unstable when the target contains zeros -- confirm
# that 'claim' is strictly positive.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=1000,  # Number of parameter combinations to try
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Parallelise across candidates/folds (the estimator stays single-threaded)
    random_state=42,  # Ensure reproducibility
    verbose=3,  # Show intermediate progress
    scoring=mape_scorer,
)

# Fit RandomizedSearchCV on training data; with the default refit behaviour the
# best configuration is retrained on the full training set afterwards.
random_search.fit(X_train, y_train)

# Predictions on training set (reported next to the test metrics so that a
# large train/test gap, i.e. overfitting, is visible).
y_train_pred = random_search.best_estimator_.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100  # Convert to %

# Predictions on test set
y_test_pred = random_search.best_estimator_.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100  # Convert to %

# Print results
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)  # negated MAPE fraction
print("Train set R² score:", r2_train)
print(f"Train set MAPE: {mape_train:.2f}%")
print("Test set R² score:", r2_test)
print(f"Test set MAPE: {mape_test:.2f}%")


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Best parameters found: {'tree_method': 'hist', 'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 700, 'max_depth': 12, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.9}
Best cross-validation score: -0.03244293927187586
Train set R² score: 1.0
Train set MAPE: 0.01%
Test set R² score: 0.9726386070251465
Test set MAPE: 3.27%


In [7]:
# Get Top 10 parameters and scores
# Rank every sampled configuration by its cross-validated score and keep only
# the parameter dict plus the mean/std of the score across folds.
# The score is the negated MAPE, so values closer to zero are better.
results = (
    pd.DataFrame(random_search.cv_results_)
    .sort_values(by='rank_test_score')
    .loc[:, ['params', 'mean_test_score', 'std_test_score']]
)

# Show the ten best-ranked configurations.
print(results.head(10))

                                                                                                                                                                                params  mean_test_score  std_test_score
521   {'tree_method': 'hist', 'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 700, 'max_depth': 12, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.9}        -0.032443        0.001769
77   {'tree_method': 'hist', 'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0.05, 'n_estimators': 400, 'max_depth': 12, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.9}        -0.032598        0.001809
212   {'tree_method': 'hist', 'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 400, 'max_depth': 12, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.9}        -0.032606        0.001756
727   {'tree_method': 'hist', 'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0.5, 'n_estimators': 400, 'max_depth': 12, 'learning_rate'