In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, make_scorer, mean_absolute_percentage_error
# Expand display options
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.max_colwidth", None)  # Do not truncate column text
pd.set_option("display.expand_frame_repr", False)  # Avoid line wrapping

In [5]:
df_OHE = pd.read_csv('../DataSet/RegressionData/healthinsurance_OHE.csv')
df_LE = pd.read_csv('../DataSet/RegressionData/healthinsurance_LE.csv')

In [6]:
# Define features & target
X = df_LE.drop('claim', axis=1)
y = df_LE['claim']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define LightGBM model
lgbm = LGBMRegressor(objective='regression', random_state=42, n_jobs=-1, verbose=-1)

# Define hyperparameter grid for LightGBM
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400],  # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size for boosting
    'max_depth': [-1, 5, 10, 15],  # Tree depth (-1 = no limit, best left unrestricted)
    'num_leaves': [20, 31, 50, 100, 150],  # More leaves = more complexity
    'min_child_samples': [5, 10, 20, 50],  # Minimum samples per leaf (prevents overfitting)
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples per boosting round
    'colsample_bytree': [0.5, 0.7, 1.0],  # Fraction of features per tree
    'reg_alpha': [0, 0.01, 0.1, 1.0],  # L1 Regularization (Lasso)
    'reg_lambda': [1, 2, 5, 10],  # L2 Regularization (Ridge)
    'min_split_gain': [0, 0.01, 0.1, 0.2],  # Minimum gain required to split
    'feature_fraction': [0.6, 0.8, 1.0],  # Controls feature sampling per iteration
    'bagging_fraction': [0.6, 0.8, 1.0],  # Controls data subsampling per iteration
    'bagging_freq': [0, 1, 5],  # Frequency of bagging (0 = no bagging)
}

mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)  # Minimize error

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=200,  # Number of parameter combinations to try
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available cores
    random_state=42,  # Ensure reproducibility
    verbose=3,  # Show intermediate progress
    scoring=mape_scorer,
)

# Fit RandomizedSearchCV on training data
random_search.fit(X_train, y_train)

# Predictions on training set
y_train_pred = random_search.best_estimator_.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100  # Convert to %

# Predictions on test set
y_test_pred = random_search.best_estimator_.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100  # Convert to %

# Print results
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)
print(f"Train set R² score: {r2_train:.4f}")
print(f"Train set MAPE: {mape_train:.2f}%")
print(f"Test set R² score: {r2_test:.4f}")
print(f"Test set MAPE: {mape_test:.2f}%")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters found: {'subsample': 0.8, 'reg_lambda': 5, 'reg_alpha': 1.0, 'num_leaves': 150, 'n_estimators': 400, 'min_split_gain': 0, 'min_child_samples': 5, 'max_depth': 15, 'learning_rate': 0.1, 'feature_fraction': 1.0, 'colsample_bytree': 1.0, 'bagging_freq': 0, 'bagging_fraction': 0.6}
Best cross-validation score: -0.04296344178139329
Train set R² score: 1.0000
Train set MAPE: 0.51%
Test set R² score: 0.9677
Test set MAPE: 4.25%


In [7]:
# Get Top 10 parameters and scores
results = pd.DataFrame(random_search.cv_results_)
results = results.sort_values(by='rank_test_score')
results = results[['params', 'mean_test_score', 'std_test_score']]
results.head(10)

Unnamed: 0,params,mean_test_score,std_test_score
55,"{'subsample': 0.8, 'reg_lambda': 5, 'reg_alpha': 1.0, 'num_leaves': 150, 'n_estimators': 400, 'min_split_gain': 0, 'min_child_samples': 5, 'max_depth': 15, 'learning_rate': 0.1, 'feature_fraction': 1.0, 'colsample_bytree': 1.0, 'bagging_freq': 0, 'bagging_fraction': 0.6}",-0.042963,0.002929
76,"{'subsample': 0.6, 'reg_lambda': 10, 'reg_alpha': 1.0, 'num_leaves': 150, 'n_estimators': 300, 'min_split_gain': 0, 'min_child_samples': 5, 'max_depth': -1, 'learning_rate': 0.2, 'feature_fraction': 0.8, 'colsample_bytree': 0.7, 'bagging_freq': 0, 'bagging_fraction': 1.0}",-0.043857,0.001542
57,"{'subsample': 0.8, 'reg_lambda': 5, 'reg_alpha': 0, 'num_leaves': 100, 'n_estimators': 400, 'min_split_gain': 0.01, 'min_child_samples': 5, 'max_depth': 10, 'learning_rate': 0.2, 'feature_fraction': 0.8, 'colsample_bytree': 1.0, 'bagging_freq': 1, 'bagging_fraction': 1.0}",-0.045911,0.002098
164,"{'subsample': 0.6, 'reg_lambda': 5, 'reg_alpha': 0, 'num_leaves': 150, 'n_estimators': 400, 'min_split_gain': 0, 'min_child_samples': 10, 'max_depth': -1, 'learning_rate': 0.1, 'feature_fraction': 1.0, 'colsample_bytree': 1.0, 'bagging_freq': 1, 'bagging_fraction': 1.0}",-0.046118,0.002704
125,"{'subsample': 0.8, 'reg_lambda': 10, 'reg_alpha': 0.01, 'num_leaves': 150, 'n_estimators': 400, 'min_split_gain': 0.1, 'min_child_samples': 5, 'max_depth': -1, 'learning_rate': 0.2, 'feature_fraction': 1.0, 'colsample_bytree': 0.7, 'bagging_freq': 5, 'bagging_fraction': 0.8}",-0.047413,0.003087
166,"{'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'num_leaves': 100, 'n_estimators': 400, 'min_split_gain': 0, 'min_child_samples': 5, 'max_depth': -1, 'learning_rate': 0.1, 'feature_fraction': 0.6, 'colsample_bytree': 1.0, 'bagging_freq': 1, 'bagging_fraction': 1.0}",-0.049513,0.001638
33,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.01, 'num_leaves': 150, 'n_estimators': 300, 'min_split_gain': 0.1, 'min_child_samples': 10, 'max_depth': -1, 'learning_rate': 0.05, 'feature_fraction': 1.0, 'colsample_bytree': 0.5, 'bagging_freq': 1, 'bagging_fraction': 1.0}",-0.053291,0.003162
87,"{'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0.01, 'num_leaves': 100, 'n_estimators': 100, 'min_split_gain': 0.1, 'min_child_samples': 5, 'max_depth': -1, 'learning_rate': 0.2, 'feature_fraction': 1.0, 'colsample_bytree': 1.0, 'bagging_freq': 1, 'bagging_fraction': 1.0}",-0.054462,0.003229
183,"{'subsample': 0.6, 'reg_lambda': 5, 'reg_alpha': 0, 'num_leaves': 50, 'n_estimators': 300, 'min_split_gain': 0.01, 'min_child_samples': 5, 'max_depth': 15, 'learning_rate': 0.2, 'feature_fraction': 0.8, 'colsample_bytree': 0.7, 'bagging_freq': 5, 'bagging_fraction': 1.0}",-0.055073,0.002419
28,"{'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0.1, 'num_leaves': 100, 'n_estimators': 300, 'min_split_gain': 0, 'min_child_samples': 10, 'max_depth': 10, 'learning_rate': 0.1, 'feature_fraction': 0.8, 'colsample_bytree': 1.0, 'bagging_freq': 0, 'bagging_fraction': 0.8}",-0.055473,0.001773
