In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score , mean_absolute_error , mean_squared_error
from sklearn.linear_model import LinearRegression
import pandas as pd
import os

In [2]:
print(os.getcwd())
df = pd.read_csv("../data/medical_insurance.csv")

/home/parnian/IMT/parnian_lali/src


**Splitting dataset :**

In [3]:
# split data to features and target
X = df.drop(['expenses'], axis=1)
Y = df['expenses']

In [4]:
# split data to train and test in 80% and 20% proportion
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

**Encoding the categorical features :**

In [5]:
# encode data with OneHotEncoder
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_columns])

Xtrain_encoded = encoder.transform(X_train[categorical_columns])
Xtest_encoded = encoder.transform(X_test[categorical_columns])

encoded_columns = encoder.get_feature_names_out(categorical_columns)
Xtrain_encoded_df = pd.DataFrame(Xtrain_encoded, columns=encoded_columns, index=X_train.index)
Xtest_encoded_df = pd.DataFrame(Xtest_encoded, columns=encoded_columns, index=X_test.index)

X_train_encoded = X_train.drop(columns=categorical_columns).join(Xtrain_encoded_df)
X_test_encoded = X_test.drop(columns=categorical_columns).join(Xtest_encoded_df)

X_train_encoded.head()

Unnamed: 0,age,bmi,children,premium,gender_female,gender_male,discount_eligibility_no,discount_eligibility_yes,region_northeast,region_northwest,region_southeast,region_southwest
560,46,20.0,2,183.8768,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1285,47,24.3,0,170.6934,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1142,52,24.9,0,542.3598,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
969,39,34.3,5,171.9366,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
486,54,21.5,3,249.507,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


**Feature selection :**

In [6]:
# features with a variance lower than 0.05 will be removed
selector = VarianceThreshold(
    threshold=0.05
)

Xtrain_selected = selector.fit_transform(X_train_encoded)
Xtest_selected = selector.transform(X_test_encoded)

feature_names = X_train_encoded.columns
selected_feature_names = feature_names[selector.get_support()]

X_train_selected = pd.DataFrame(Xtrain_selected, columns=selected_feature_names, index=X_train_encoded.index)
X_test_selected = pd.DataFrame(Xtest_selected, columns=selected_feature_names, index=X_test_encoded.index)

**Now we are ready to train our data, we analys our data and preprcess it. <br>**


*RandomForestRegressor :*

In [7]:
# train model (RandomForestReg)
model_1 = RandomForestRegressor()
model_1.fit(X_train_selected, Y_train)
Y_1 = model_1.predict(X_test_selected)

# evaluate the model
print(f'r2_score: {r2_score(Y_test, Y_1)}')
print(f'mae: {mean_absolute_error(Y_test, Y_1)}')
print(f'mse: {mean_squared_error(Y_test, Y_1)}')
print(f"rmse: {mean_squared_error(Y_test, Y_1) ** 0.5}")

r2_score: 0.9942841298767067
mae: 197.4652992537306
mse: 887381.5987893515
rmse: 942.0093411369929


**Hyperparameter tuning :**

In [8]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}


In [9]:
rf = RandomForestRegressor(random_state=42)
grid_search_1 = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)


In [10]:
grid_search_1.fit(X_train_selected, Y_train)
best_model = grid_search_1.best_estimator_


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [11]:
Y_1_tuned = best_model.predict(X_test_selected)

# evaluate the model
print(f'r2_score: {r2_score(Y_test, Y_1_tuned)}')
print(f'mae: {mean_absolute_error(Y_test, Y_1_tuned)}')
print(f'mse: {mean_squared_error(Y_test, Y_1_tuned)}')
print(f"rmse: {mean_squared_error(Y_test, Y_1_tuned) ** 0.5}")

r2_score: 0.9935937648354657
mae: 194.46307733824375
mse: 994559.8972514189
rmse: 997.2762391892323


*LinearRegression :*

In [12]:
model_2 = LinearRegression()
model_2.fit(X_train_selected, Y_train)
Y_2 = model_2.predict(X_test_selected)

# evaluate the model
print(f'r2_score: {r2_score(Y_test, Y_2)}')
print(f'mae: {mean_absolute_error(Y_test, Y_2)}')
print(f'mse: {mean_squared_error(Y_test, Y_2)}')
print(f"rmse: {mean_squared_error(Y_test, Y_2) ** 0.5}")

r2_score: 0.870840659805395
mae: 3137.32560815503
mse: 20051824.01422882
rmse: 4477.926307369162


*AdaBoostRegressor :*

In [13]:
model_3 = AdaBoostRegressor()
model_3.fit(X_train_selected, Y_train)
Y_3 = model_3.predict(X_test_selected)

# evaluate the model
print(f'r2_score: {r2_score(Y_test, Y_3)}')
print(f'mae: {mean_absolute_error(Y_test, Y_3)}')
print(f'mse: {mean_squared_error(Y_test, Y_3)}')
print(f"rmse: {mean_squared_error(Y_test, Y_3) ** 0.5}")

r2_score: 0.9332734269410219
mae: 2716.280446343287
mse: 10359215.973349333
rmse: 3218.5735929677503


*Hyperparameter tuning :*

In [14]:
param_grid_2 = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}

In [15]:
ada = AdaBoostRegressor(random_state=42)

grid_search_2 = GridSearchCV(
    estimator=ada,
    param_grid=param_grid_2,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)
grid_search_2.fit(X_train_selected, Y_train)


Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [16]:
best_ada = grid_search_2.best_estimator_
Y_3_tuned = best_ada.predict(X_test_selected)

# evaluate the model
print(f'r2_score: {r2_score(Y_test, Y_3_tuned)}')
print(f'mae: {mean_absolute_error(Y_test, Y_3_tuned)}')
print(f'mse: {mean_squared_error(Y_test, Y_3_tuned)}')
print(f"rmse: {mean_squared_error(Y_test, Y_3_tuned) ** 0.5}")


r2_score: 0.9568988461146923
mae: 1780.32626320872
mse: 6691399.5628671115
rmse: 2586.773968259908


**Final analys**

RandomForestRegressor without Hyperparameter tuning with an R² ≈ 0.994 and RMSE ≈ 942 was the best model.

*Tuning Gains:*

Helped a lot in AdaBoostRegressor.

 but Barely helped (and even slightly hurt) RandomForestRegressor.