In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Read the two pre-encoded variants of the health-insurance dataset.
# NOTE(review): the file suffixes presumably stand for label-encoded (LE) and
# one-hot-encoded (OHE) categorical features -- confirm against the
# preprocessing notebook that produced these CSVs.
df_LE, df_OHE = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../DataSet/RegressionData/healthinsurance_LE.csv',
        '../DataSet/RegressionData/healthinsurance_OHE.csv',
    )
)

In [3]:
# Random-forest hyperparameter search on the LE-encoded dataset.
# Features are every column except the regression target 'claim'.
X = df_LE.drop(columns='claim')
y = df_LE['claim']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base estimator (no pipeline needed). Each forest is deliberately trained
# single-threaded (n_jobs=1): parallelism is applied at the search level
# below, so nesting it here would only oversubscribe the CPU.
rf = RandomForestRegressor(random_state=42, n_jobs=1)

# Candidate values the randomized search samples from.
# NOTE(review): per the scikit-learn docs, max_features=None and
# max_features=1.0 both mean "use all features" for RandomForestRegressor, so
# those two entries look redundant. They are kept so the candidates drawn
# with random_state=42 (and the recorded results) stay the same.
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700],
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': ['sqrt', 'log2', None, 0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.001, 0.01, 0.1],
}

# Randomized search: 200 sampled settings x 5 folds = 1000 fits.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=200,        # number of parameter settings sampled
    cv=5,              # 5-fold cross-validation
    n_jobs=-1,         # -1 = all available cores, whatever the machine has
    random_state=42,   # reproducible candidate sampling
    verbose=3,         # show intermediate progress
    scoring='r2',
)

# Run the search on the training split only; the test split stays unseen.
random_search.fit(X_train, y_train)

# Score the refit best estimator on both splits. The train score is
# optimistic (the model was fitted on that data) and is reported only to
# gauge how much the model overfits relative to the test score.
y_train_pred = random_search.best_estimator_.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)

y_test_pred = random_search.best_estimator_.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)

# Print results
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)
print("Train set R² score:", r2_train)
print("Test set R² score:", r2_test)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters found: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0, 'max_features': 'log2', 'max_depth': 50, 'ccp_alpha': 0.01, 'bootstrap': False}
Best cross-validation score: 0.9742939956277962
Train set R² score: 0.9999999999427673
Test set R² score: 0.9729410804235603


In [11]:
# Rank every sampled configuration by its cross-validation rank and keep only
# the columns needed to compare them; the last line shows the ten best.
results = (
    pd.DataFrame(random_search.cv_results_)
    .sort_values(by='rank_test_score')
    .loc[:, ['params', 'mean_test_score', 'std_test_score']]
)
results.head(10)

Unnamed: 0,params,mean_test_score,std_test_score
40,"{'n_estimators': 700, 'min_samples_split': 2, ...",0.974294,0.004437
3,"{'n_estimators': 400, 'min_samples_split': 2, ...",0.974251,0.004838
55,"{'n_estimators': 500, 'min_samples_split': 4, ...",0.974123,0.004462
191,"{'n_estimators': 100, 'min_samples_split': 6, ...",0.973465,0.004356
49,"{'n_estimators': 300, 'min_samples_split': 10,...",0.97249,0.004324
39,"{'n_estimators': 200, 'min_samples_split': 2, ...",0.972424,0.004644
16,"{'n_estimators': 300, 'min_samples_split': 4, ...",0.972184,0.004832
41,"{'n_estimators': 600, 'min_samples_split': 6, ...",0.972165,0.004774
69,"{'n_estimators': 700, 'min_samples_split': 10,...",0.972117,0.004605
4,"{'n_estimators': 500, 'min_samples_split': 2, ...",0.97196,0.004805


In [4]:
# Random-forest hyperparameter search on the OHE-encoded dataset, using the
# same split settings and search space as the LE run so the two are comparable.
# Features are every column except the regression target 'claim'.
X = df_OHE.drop(columns='claim')
y = df_OHE['claim']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base estimator (no pipeline needed). Each forest is deliberately trained
# single-threaded (n_jobs=1): parallelism is applied at the search level
# below, so nesting it here would only oversubscribe the CPU.
rf = RandomForestRegressor(random_state=42, n_jobs=1)

# Candidate values the randomized search samples from.
# NOTE(review): per the scikit-learn docs, max_features=None and
# max_features=1.0 both mean "use all features" for RandomForestRegressor, so
# those two entries look redundant. They are kept so the candidates drawn
# with random_state=42 (and the recorded results) stay the same.
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700],
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': ['sqrt', 'log2', None, 0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.001, 0.01, 0.1],
}

# Randomized search: 200 sampled settings x 5 folds = 1000 fits.
random_search2 = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=200,        # number of parameter settings sampled
    cv=5,              # 5-fold cross-validation
    n_jobs=-1,         # -1 = all available cores, whatever the machine has
    random_state=42,   # reproducible candidate sampling
    verbose=3,         # show intermediate progress
    scoring='r2',
)

# Run the search on the training split only; the test split stays unseen.
random_search2.fit(X_train, y_train)

# Score the refit best estimator on both splits. The train score is
# optimistic (the model was fitted on that data) and is reported only to
# gauge how much the model overfits relative to the test score.
y_train_pred = random_search2.best_estimator_.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)

y_test_pred = random_search2.best_estimator_.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)

# Print results
print("Best parameters found:", random_search2.best_params_)
print("Best cross-validation score:", random_search2.best_score_)
print("Train set R² score:", r2_train)
print("Test set R² score:", r2_test)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best parameters found: {'n_estimators': 500, 'min_samples_split': 4, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1, 'max_features': 'sqrt', 'max_depth': None, 'ccp_alpha': 0.01, 'bootstrap': False}
Best cross-validation score: 0.9737331449864554
Train set R² score: 0.9989716551684622
Test set R² score: 0.9709558441218078


In [12]:
# Rank every sampled configuration by its cross-validation rank and keep only
# the columns needed to compare them; the last line shows the ten best.
results = (
    pd.DataFrame(random_search2.cv_results_)
    .sort_values(by='rank_test_score')
    .loc[:, ['params', 'mean_test_score', 'std_test_score']]
)
results.head(10)

Unnamed: 0,params,mean_test_score,std_test_score
55,"{'n_estimators': 500, 'min_samples_split': 4, ...",0.973733,0.004515
40,"{'n_estimators': 700, 'min_samples_split': 2, ...",0.970207,0.004252
162,"{'n_estimators': 300, 'min_samples_split': 6, ...",0.970033,0.00349
159,"{'n_estimators': 200, 'min_samples_split': 6, ...",0.970004,0.00362
167,"{'n_estimators': 200, 'min_samples_split': 2, ...",0.969914,0.003619
133,"{'n_estimators': 100, 'min_samples_split': 2, ...",0.969904,0.00379
165,"{'n_estimators': 100, 'min_samples_split': 8, ...",0.969467,0.003519
149,"{'n_estimators': 600, 'min_samples_split': 8, ...",0.969375,0.003618
10,"{'n_estimators': 100, 'min_samples_split': 10,...",0.968871,0.004252
110,"{'n_estimators': 300, 'min_samples_split': 4, ...",0.968383,0.003693


## Key Takeaways

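1. One-hot encoding (OHE) does not improve accuracy over label encoding (LE): the best test R² is about 0.971 with OHE versus about 0.973 with LE, so LE seems sufficient for tree-based models.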

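2. The number of estimators does not seem to be of much importance: the top-10 configurations range from 100 to 700 trees with nearly identical cross-validation scores. Smaller models should therefore work nearly as well.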