In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, make_scorer, mean_absolute_percentage_error
from joblib import parallel_backend

# Lift pandas' display limits so printed frames are never elided or wrapped
for option_name, option_value in {
    "display.max_rows": None,             # show every row
    "display.max_columns": None,          # show every column
    "display.max_colwidth": None,         # never truncate cell text
    "display.expand_frame_repr": False,   # keep wide frames on one line
}.items():
    pd.set_option(option_name, option_value)

In [2]:
# Read the raw CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../DataSet/healthinsurance.csv')


In [3]:
# Columns handed to CatBoost as categorical (via cat_features at fit time)
categorical_features = [
    'sex',
    'hereditary_diseases',
    'smoker',
    'city',
    'diabetes',
    'regular_ex',
    'job_title',
]

# 'claim' is the regression target; every remaining column is a predictor
y = df['claim']
X = df.drop(columns='claim')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base CatBoost regressor; RandomizedSearchCV clones it and overrides the
# tuned hyperparameters for each sampled candidate.
catboost = CatBoostRegressor(
    loss_function='RMSE',
    random_state=42,
    verbose=0,  # No logs during training
    # One thread per model. The search below already runs many fits at once
    # (n_jobs=-1 under the threading backend); letting every fit also grab all
    # cores (thread_count=-1) oversubscribes the CPU with ~cores x cores threads.
    # Per the CatBoost docs, thread_count only affects speed, not results.
    thread_count=1
)

# Discrete candidate values; each search iteration samples one value per key
param_dist = {
    'iterations': [50, 100, 200, 300, 400],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 10],
    'border_count': [32, 64, 128],
    'bagging_temperature': [0, 0.5, 1, 2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bylevel': [0.5, 0.7, 1.0],
    'boosting_type': ['Plain'],
    'grow_policy': ['Depthwise', 'SymmetricTree'],
}

# greater_is_better=False makes the scorer return the *negated* MAPE, so the
# search can maximise it; reported CV scores are therefore <= 0 and expressed
# as a fraction (not a percentage).
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)  # Minimize error

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=param_dist,
    n_iter=200,  # Number of parameter combinations to try
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Parallelism lives here: one single-threaded fit per core
    random_state=42,  # Ensure reproducibility
    verbose=3,  # Show intermediate progress
    scoring=mape_scorer,
    error_score='raise'  # Fail fast instead of recording NaN for a broken candidate
)

# Use the threading backend to potentially allow verbose output to print
with parallel_backend('threading'):
    random_search.fit(X_train, y_train, cat_features=categorical_features)

# Predict with the best estimator found by the search, on both splits
best_model = random_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Coefficient of determination per split
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# MAPE per split, scaled from a fraction to a percentage
mape_train = 100 * mean_absolute_percentage_error(y_train, y_train_pred)
mape_test = 100 * mean_absolute_percentage_error(y_test, y_test_pred)

# Report the search outcome, then the train/test metrics.
# best_score_ is in the scorer's units (mean CV score of the best candidate).
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)
print(f"Train set R² score: {r2_train:.4f}")
print(f"Train set MAPE: {mape_train:.2f}%")
print(f"Test set R² score: {r2_test:.4f}")
print(f"Test set MAPE: {mape_test:.2f}%")



Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 5/5] END bagging_temperature=0, boosting_type=Plain, border_count=64, colsample_bylevel=0.5, depth=6, grow_policy=Depthwise, iterations=50, l2_leaf_reg=5, learning_rate=0.01, subsample=0.8;, score=-1.160 total time=   2.8s
[CV 2/5] END bagging_temperature=0, boosting_type=Plain, border_count=64, colsample_bylevel=0.5, depth=6, grow_policy=Depthwise, iterations=50, l2_leaf_reg=5, learning_rate=0.01, subsample=0.8;, score=-1.041 total time=   3.8s
[CV 4/5] END bagging_temperature=0, boosting_type=Plain, border_count=64, colsample_bylevel=0.5, depth=6, grow_policy=Depthwise, iterations=50, l2_leaf_reg=5, learning_rate=0.01, subsample=0.8;, score=-1.032 total time=   4.1s
[CV 3/5] END bagging_temperature=0, boosting_type=Plain, border_count=64, colsample_bylevel=0.5, depth=6, grow_policy=Depthwise, iterations=50, l2_leaf_reg=5, learning_rate=0.01, subsample=0.8;, score=-1.009 total time=   4.2s
[CV 1/5] END bagging_tempera

In [4]:
# Tabulate the search's per-candidate CV results, order them by rank
# (ascending rank_test_score), and keep only the parameter dict plus the
# mean/std of the test score.
results = (
    pd.DataFrame(random_search.cv_results_)
    .sort_values(by='rank_test_score')
    .loc[:, ['params', 'mean_test_score', 'std_test_score']]
)

# Show the ten top-ranked candidates
print(results.head(10))

                                                                                                                                                                                                                             params  mean_test_score  std_test_score
137       {'subsample': 1.0, 'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 400, 'grow_policy': 'Depthwise', 'depth': 10, 'colsample_bylevel': 1.0, 'border_count': 128, 'boosting_type': 'Plain', 'bagging_temperature': 2}        -0.056507        0.004399
195        {'subsample': 1.0, 'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 400, 'grow_policy': 'Depthwise', 'depth': 10, 'colsample_bylevel': 0.5, 'border_count': 64, 'boosting_type': 'Plain', 'bagging_temperature': 0}        -0.057442        0.005404
80         {'subsample': 0.6, 'learning_rate': 0.2, 'l2_leaf_reg': 1, 'iterations': 400, 'grow_policy': 'Depthwise', 'depth': 10, 'colsample_bylevel': 0.7, 'border_count': 32, 'boosting_type': 'Plain', 'bagging_temper