In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, make_scorer, mean_absolute_percentage_error

# Lift pandas' display limits so wide frames and long cell values
# (e.g. parameter dicts) are shown in full.
for _option, _value in {
    "display.max_rows": None,            # show every row
    "display.max_columns": None,         # show every column
    "display.max_colwidth": None,        # never truncate cell text
    "display.expand_frame_repr": False,  # do not wrap wide frames across lines
}.items():
    pd.set_option(_option, _value)

In [17]:
# Load the two variants of the health-insurance regression data
# (presumably one-hot encoded vs. label encoded, judging by the file names).
df_OHE, df_LE = map(
    pd.read_csv,
    (
        '../DataSet/RegressionData/healthinsurance_OHE.csv',
        '../DataSet/RegressionData/healthinsurance_LE.csv',
    ),
)

In [None]:
# Tune a RandomForestRegressor on df_LE with a randomized hyper-parameter
# search scored by MAPE, then report train/test R² and MAPE of the best model.

# Features are every column except the target column 'claim'
X = df_LE.drop('claim', axis=1)

y = df_LE['claim']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model directly (no pipeline needed).
# n_jobs=1 keeps each forest single-threaded; parallelism is left to the
# search's own workers configured below.
rf = RandomForestRegressor(random_state=42, n_jobs=1)

# Candidate values the randomized search samples from.
# NOTE(review): the search on df_OHE in this notebook also samples
# n_estimators=500; align the two grids if the runs are meant to be
# directly comparable.
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': ['sqrt', 'log2', None, 0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.001, 0.01, 0.1],
}

# make_scorer expects the metric function itself; abs() of a function object
# raises TypeError. greater_is_better=False makes the scorer return the
# negated MAPE, so maximizing the score minimizes the error.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,       # Directly use the RandomForestRegressor
    param_distributions=param_dist,
    n_iter=200,         # Number of parameter settings sampled
    cv=5,               # 5-fold cross-validation
    n_jobs=18,          # Up to 18 parallel workers (n_jobs=-1 would use every core)
    random_state=42,    # For reproducibility
    verbose=3,          # Show intermediate progress
    scoring=mape_scorer,
)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Predictions on the training set
y_train_pred = random_search.best_estimator_.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100  # Convert to percentage

# Predictions on the test set
y_test_pred = random_search.best_estimator_.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100  # Convert to percentage

# Print results.
# best_score_ is the mean negated MAPE over the CV folds, as a fraction
# (not a percentage), so values closer to 0 are better.
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)
print("Train set R² score:", r2_train)
print(f"Train set MAPE: {mape_train:.2f}%")
print("Test set R² score:", r2_test)
print(f"Test set MAPE: {mape_test:.2f}%")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters found: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': None, 'max_depth': 30, 'ccp_alpha': 0.0, 'bootstrap': False}
Best cross-validation score: -0.036222122385397654
Train set R² score: 1.0
Train set MAPE: 0.00%
Test set R² score: 0.9556124553263627
Test set MAPE: 3.05%


In [19]:
# Get Top 10 parameters and scores
results = pd.DataFrame(random_search.cv_results_)
results = results.sort_values(by='rank_test_score')
results = results[['params', 'mean_test_score', 'std_test_score']]
results.head(10)

Unnamed: 0,params,mean_test_score,std_test_score
130,"{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': None, 'max_depth': 30, 'ccp_alpha': 0.0, 'bootstrap': False}",-0.036222,0.002755
67,"{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 1.0, 'max_depth': 50, 'ccp_alpha': 0.1, 'bootstrap': False}",-0.036372,0.002713
8,"{'n_estimators': 100, 'min_samples_split': 8, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': 0.7, 'max_depth': 40, 'ccp_alpha': 0.01, 'bootstrap': False}",-0.036404,0.002429
31,"{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': None, 'max_depth': None, 'ccp_alpha': 0.01, 'bootstrap': False}",-0.03646,0.002638
111,"{'n_estimators': 300, 'min_samples_split': 4, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 1.0, 'max_depth': 20, 'ccp_alpha': 0.001, 'bootstrap': False}",-0.036617,0.002601
163,"{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 'log2', 'max_depth': None, 'ccp_alpha': 0.1, 'bootstrap': False}",-0.037666,0.001694
94,"{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 'sqrt', 'max_depth': 35, 'ccp_alpha': 0.0, 'bootstrap': False}",-0.037889,0.001804
4,"{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': 'log2', 'max_depth': 45, 'ccp_alpha': 0.0, 'bootstrap': False}",-0.037894,0.002182
101,"{'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': 'log2', 'max_depth': None, 'ccp_alpha': 0.001, 'bootstrap': False}",-0.038625,0.001823
138,"{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.01, 'max_features': 'sqrt', 'max_depth': 50, 'ccp_alpha': 0.0, 'bootstrap': False}",-0.038928,0.001546


In [20]:
# Randomized hyper-parameter search for a random forest on df_OHE, scored by
# MAPE, followed by train/test evaluation (R² and MAPE) of the best model.

# Target is the 'claim' column; every other column is a feature
y = df_OHE['claim']
X = df_OHE.drop(columns='claim')

# Hold out 20% of the rows for testing; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Plain estimator, no pipeline. n_jobs=1 keeps each forest single-threaded;
# parallelism is configured on the search below.
rf = RandomForestRegressor(n_jobs=1, random_state=42)

# Candidate values the randomized search samples from.
# NOTE(review): the search on df_LE in this notebook stops at
# n_estimators=400, so the two search spaces are not identical.
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [None, *range(5, 51, 5)],
    'min_samples_split': list(range(2, 11, 2)),
    'min_samples_leaf': list(range(1, 6)),
    'max_features': ['sqrt', 'log2', None, 0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.001, 0.01, 0.1],
}

# greater_is_better=False makes the scorer return the negated MAPE, so the
# search maximizes its score while minimizing the error.
mape_scorer = make_scorer(
    mean_absolute_percentage_error,
    greater_is_better=False,
)

# 200 sampled settings x 5 folds, run on up to 18 parallel workers
random_search2 = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring=mape_scorer,
    n_iter=200,
    cv=5,
    n_jobs=18,
    verbose=3,
    random_state=42,
)
random_search2.fit(X_train, y_train)

# Predict with the best model on both splits
y_train_pred2 = random_search2.best_estimator_.predict(X_train)
y_test_pred2 = random_search2.best_estimator_.predict(X_test)

# R² on both splits
r2_train2 = r2_score(y_train, y_train_pred2)
r2_test2 = r2_score(y_test, y_test_pred2)

# MAPE is computed as a fraction; scale to a percentage
mape_train2 = 100 * mean_absolute_percentage_error(y_train, y_train_pred2)
mape_test2 = 100 * mean_absolute_percentage_error(y_test, y_test_pred2)

# Report. best_score_ is the mean negated MAPE over the folds (a fraction).
print("Best parameters found:", random_search2.best_params_)
print("Best cross-validation score:", random_search2.best_score_)
print("Train set R² score:", r2_train2)
print(f"Train set MAPE: {mape_train2:.2f}%")
print("Test set R² score:", r2_test2)
print(f"Test set MAPE: {mape_test2:.2f}%")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters found: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 0.5, 'max_depth': None, 'ccp_alpha': 0.0, 'bootstrap': False}
Best cross-validation score: -0.036635515396364074
Train set R² score: 1.0
Train set MAPE: 0.00%
Test set R² score: 0.9699415897119722
Test set MAPE: 3.52%


In [21]:
# Get Top 10 parameters and scores
results = pd.DataFrame(random_search2.cv_results_)
results = results.sort_values(by='rank_test_score')
results = results[['params', 'mean_test_score', 'std_test_score']]
print(results.head(10))

                                                                                                                                                                                 params  mean_test_score  std_test_score
123    {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 0.5, 'max_depth': None, 'ccp_alpha': 0.0, 'bootstrap': False}        -0.036636        0.001211
124    {'n_estimators': 200, 'min_samples_split': 6, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.01, 'max_features': 0.7, 'max_depth': 25, 'ccp_alpha': 0.01, 'bootstrap': False}        -0.042270        0.004697
14    {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.01, 'max_features': 1.0, 'max_depth': 45, 'ccp_alpha': 0.001, 'bootstrap': False}        -0.043596        0.002551
115      {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.1, 'max_features': 0.5, 'ma

## Key Takeaways

1. OHE does not really improve accuracy over LE (the best cross-validated MAPE is about 3.6% for both encodings); LE seems sufficient for tree-based models.

2. The number of estimators does not seem to be of much importance; smaller models should therefore work nearly as well.