In [76]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [77]:
# read csv
df = pd.read_csv("insurance.csv")
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


#### data analysis

In [78]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [80]:
df.isnull().sum() #No missing data

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [81]:
df["age"].value_counts()

age
18    69
19    68
50    29
51    29
47    29
46    29
45    29
20    29
48    29
52    29
22    28
49    28
54    28
53    28
21    28
26    28
24    28
25    28
28    28
27    28
23    28
43    27
29    27
30    27
41    27
42    27
44    27
31    27
40    27
32    26
33    26
56    26
34    26
55    26
57    26
37    25
59    25
58    25
36    25
38    25
35    25
39    25
61    23
60    23
63    23
62    23
64    22
Name: count, dtype: int64

In [82]:
df["sex"].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [83]:
df["bmi"].value_counts()

bmi
32.300    13
28.310     9
30.495     8
30.875     8
31.350     8
          ..
46.200     1
23.800     1
44.770     1
32.120     1
30.970     1
Name: count, Length: 548, dtype: int64

In [84]:
df["children"].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [85]:
df["smoker"].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [86]:
df["region"].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

### preprocessing

In [87]:
sex_types = pd.get_dummies(df.sex,prefix="sex")
smoker_types = pd.get_dummies(df.smoker,prefix="smoker")
region_types = pd.get_dummies(df.region,prefix="region")

df = pd.concat([df,sex_types,smoker_types,region_types],axis=1)

In [88]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,female,27.9,0,yes,southwest,16884.924,True,False,False,True,False,False,False,True
1,18,male,33.77,1,no,southeast,1725.5523,False,True,True,False,False,False,True,False
2,28,male,33.0,3,no,southeast,4449.462,False,True,True,False,False,False,True,False
3,33,male,22.705,0,no,northwest,21984.47061,False,True,True,False,False,True,False,False
4,32,male,28.88,0,no,northwest,3866.8552,False,True,True,False,False,True,False,False


In [89]:
df.drop(["sex","smoker","region","smoker_no","sex_female"],axis = 1 , inplace = True)

### Train Test Split

In [90]:
y = df["charges"]
df.drop(["charges"],axis = 1, inplace = True)
X = df

In [91]:
X_train , X_test , y_train , y_test = train_test_split(X,y,random_state =42,
                                                      test_size = 0.25)

#### Standar Scaler

In [92]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Decison Tree Regressor

In [93]:
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)

# Evaluating the Performance of the Base Model
y_pred = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 43136043.26507364
R^2 Score: 0.7141247435102465


### Model Tuning

In [94]:
param_grid = {
    "max_depth": [3, 5,10,15,20],
    "min_samples_split": [2, 5, 10,15,20],
    "min_samples_leaf": [1, 2, 4,6,10,15]
}

grid_search = GridSearchCV(dt_regressor, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Params:", best_params)

# Creating and Training the Best Model
best_dt_regressor = DecisionTreeRegressor(random_state=42, **best_params)
best_dt_regressor.fit(X_train, y_train)

# Evaluating the Performance of the Best Model
y_pred_best = best_dt_regressor.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
print("Best Model Mean Squared Error:", mse_best)
print("Best Model R^2 Score:", r2_best)

Best Params: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best Model Mean Squared Error: 22054560.500134207
Best Model R^2 Score: 0.853837935459195


# Random Forest Regressor

In [95]:
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train,y_train)

# Evaluating the Performance of the Base Model
y_pred = rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 23656799.09614786
R^2 Score: 0.8432194286393065


### Model Tuning

In [105]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Parameter optimization using GridSearchCV
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Params:", best_params)

# Creating and Training the Best Model
best_rf_regressor = RandomForestRegressor(random_state=42, **best_params).fit(X_train, y_train)


# Evaluating the Performance of the Best Model
y_pred_best = best_rf_regressor.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
print("Best Model Mean Squared Error:", mse_best)
print("Best Model R^2 Score:", r2_best)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Params: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best Model Mean Squared Error: 20683936.6152792
Best Model R^2 Score: 0.8629214634088055


# Lasso

In [97]:
lasso_model = Lasso()
lasso_model.fit(X_train,y_train)

# Evaluating the Performance of the Base Model
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 35062579.00836874
R^2 Score: 0.7676299676904882


### Model Tuning

In [98]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 2, 5, 10]
}

# Parameter optimization using GridSearchCV
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
print("Best Params:", best_params)

# Creating and Training the Best Model
best_lasso = Lasso(random_state=42, **best_params)
best_lasso.fit(X_train, y_train)

# Evaluating the Performance of the Best Model
y_pred_best = best_lasso.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
print("Best Model Mean Squared Error:", mse_best)
print("Best Model R^2 Score:", r2_best)


Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best Params: {'alpha': 10}
Best Model Mean Squared Error: 35076653.07768448
Best Model R^2 Score: 0.7675366946902094


# Ridge

In [99]:
ridge_model = Ridge()
ridge_model.fit(X_train,y_train)

# Evaluating the Performance of the Base Model
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 35073113.120478414
R^2 Score: 0.7675601550286559


In [100]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 2, 5, 10]
}

# Parameter optimization using GridSearchCV
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
print("Best Params:", best_params)

# Creating and Training the Best Model
best_ridge = Lasso(random_state=42, **best_params)
best_ridge.fit(X_train, y_train)

# Evaluating the Performance of the Best Model
y_pred_best = best_ridge.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
print("Best Model Mean Squared Error:", mse_best)
print("Best Model R^2 Score:", r2_best)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best Params: {'alpha': 2}
Best Model Mean Squared Error: 35064101.09465813
Best Model R^2 Score: 0.7676198803765983


# ElasticNet

In [112]:
elastic_model = ElasticNet()
elastic_model.fit(X_train,y_train)

# Evaluating the Performance of the Base Model
y_pred = elastic_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)  

Mean Squared Error: 49048073.13425189
R^2 Score: 0.6749439812683173


### Model Tuning

In [113]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 2, 5, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# Parameter optimization using GridSearchCV
grid_search = GridSearchCV(estimator=elastic_model, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
print("Best Params:", best_params)

# Creating and Training the Best Model
best_elastic = ElasticNet(random_state=42, **best_params)
best_elastic.fit(X_train, y_train)

# Evaluating the Performance of the Best Model
y_pred_best = best_elastic.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
print("Best Model Mean Squared Error:", mse_best)
print("Best Model R^2 Score:", r2_best)

Fitting 5 folds for each of 35 candidates, totalling 175 fits
Best Params: {'alpha': 0.01, 'l1_ratio': 0.7}
Best Model Mean Squared Error: 35084283.452717274
Best Model R^2 Score: 0.7674861259487462


# Model Selection

In [114]:
scores = {
    'ridge': 0.7676198803765983,
    'lasso': 0.7675366946902094,
    'rf': 0.8629214634088055,
    'dt': 0.853837935459195,
    "EN" : 0.7674861259487462
}

# Sort the dictionary by R-squared scores
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

# Finding the highest R-squared score and corresponding model
highest_score_model, highest_score = sorted_scores[0]

print("Highest R² score:", highest_score)
print("Highest R² score model:", highest_score_model)

Highest R² score: 0.8629214634088055
Highest R² score model: rf


In [111]:
predicts = {
    "age" : 20,
    "bmi" : 26.5,
    "children" : 2,
    "sex_male" : 0,
    "smoker_yes" : 1,
    "region_northeast" : 1,
    "region_northwest" : 0,
    "region_southeast" : 0,
    "region_southwest" : 0
    
}

# Converting to DataFrame format
predict_data = pd.DataFrame([predicts])

# Predict
predict_charges = best_rf_regressor.predict(predict_data)
print("Predict Charges: ", predict_charges[0])

Predict Charges:  48531.173146011766
