In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [33]:
df.shape

(1338, 7)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [35]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


## Data Preprocessing
1. Checking and handling missing values(if any).
2. Checking and handling missing duplicates(if any).
3. Encode the categorical columns.

In [36]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [37]:
df[df.duplicated() == "True"]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges


In [38]:
### 1. There are 0 null values & 0 duplicates.

In [39]:
cat_cols = df.select_dtypes(include = "object").columns

In [40]:
cat_cols

Index(['sex', 'smoker', 'region'], dtype='object')

In [41]:
for col in cat_cols:
    print(df[col].value_counts())

sex
male      676
female    662
Name: count, dtype: int64
smoker
no     1064
yes     274
Name: count, dtype: int64
region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64


In [42]:
df_ = df.copy() ## copying the original dataset if we need it for further use.

In [43]:
## Spliting the dataset
X = df.drop("charges", axis = 1)
y = df["charges"]

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [45]:
preprocessor = ColumnTransformer(
    transformers=[
        ("oh_encoder", OneHotEncoder(drop="first"), cat_cols)
    ],
    remainder="passthrough"
)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [46]:
X_train

array([[ 1.   ,  0.   ,  1.   , ..., 24.   , 23.655,  0.   ],
       [ 0.   ,  0.   ,  0.   , ..., 28.   , 26.51 ,  2.   ],
       [ 1.   ,  0.   ,  0.   , ..., 51.   , 39.7  ,  1.   ],
       ...,
       [ 1.   ,  0.   ,  0.   , ..., 58.   , 25.175,  0.   ],
       [ 0.   ,  1.   ,  0.   , ..., 37.   , 47.6  ,  2.   ],
       [ 1.   ,  0.   ,  0.   , ..., 55.   , 29.9  ,  0.   ]])

In [47]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Training

In [51]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score

In [52]:
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "SVM" : SVR(),
    "KNN" : KNeighborsRegressor(), 
    "DecsionTree" : DecisionTreeRegressor(),
    "RandomForest" : RandomForestRegressor(),
    "Adaboost" : AdaBoostRegressor(),
    "Gradient" : GradientBoostingRegressor(),
    "XGBoost" : XGBRegressor()
}
models

{'LinearRegression': LinearRegression(),
 'Lasso': Lasso(),
 'Ridge': Ridge(),
 'SVM': SVR(),
 'KNN': KNeighborsRegressor(),
 'DecsionTree': DecisionTreeRegressor(),
 'RandomForest': RandomForestRegressor(),
 'Adaboost': AdaBoostRegressor(),
 'Gradient': GradientBoostingRegressor(),
 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,


In [55]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print(f"{list(models.keys())[i]} training_accuracy : {r2_score(y_train, y_pred_train) * 100}")
    print(f"{list(models.keys())[i]} testing_accuracy : {r2_score(y_test, y_pred_test) * 100}")
    print("-" * 30)

LinearRegression training_accuracy : 74.49555328228537
LinearRegression testing_accuracy : 76.72642952734357
------------------------------
Lasso training_accuracy : 74.49554309406788
Lasso testing_accuracy : 76.72565500810316
------------------------------
Ridge training_accuracy : 74.4954767556988
Ridge testing_accuracy : 76.72261942743437
------------------------------
SVM training_accuracy : -9.648380891733588
SVM testing_accuracy : -9.31399027291584
------------------------------
KNN training_accuracy : 85.82927246379867
KNN testing_accuracy : 79.75685156773892
------------------------------
DecsionTree training_accuracy : 99.87411422200097
DecsionTree testing_accuracy : 73.45980693140999
------------------------------
RandomForest training_accuracy : 97.53769491867628
RandomForest testing_accuracy : 85.08193187115147
------------------------------
Adaboost training_accuracy : 79.39081158677257
Adaboost testing_accuracy : 77.55112060192255
------------------------------
Gradient t

## GradientBoost model preforms best with 86.1% accuracy.