In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data_path="insurance.csv"

In [3]:
df=pd.read_csv(data_path)

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df["smoker"]=df["smoker"].map({"yes" : 1 , "no" : 0})

In [6]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


In [7]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


In [9]:
df.shape

(1338, 7)

In [10]:
for col in df.columns:
    print(df[col].value_counts())

age
18    69
19    68
50    29
51    29
47    29
46    29
45    29
20    29
48    29
52    29
22    28
49    28
54    28
53    28
21    28
26    28
24    28
25    28
28    28
27    28
23    28
43    27
29    27
30    27
41    27
42    27
44    27
31    27
40    27
32    26
33    26
56    26
34    26
55    26
57    26
37    25
59    25
58    25
36    25
38    25
35    25
39    25
61    23
60    23
63    23
62    23
64    22
Name: count, dtype: int64
sex
male      676
female    662
Name: count, dtype: int64
bmi
32.300    13
28.310     9
30.495     8
30.875     8
31.350     8
          ..
46.200     1
23.800     1
44.770     1
32.120     1
30.970     1
Name: count, Length: 548, dtype: int64
children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64
smoker
0    1064
1     274
Name: count, dtype: int64
region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64
charges
1639.56310     2
16884.92400    1
29330.98315    1
2

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X=df.drop("charges" , axis=1)
y=df["charges"]

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [14]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


In [15]:
X_train["sex"] = X_train["sex"].map({"male":1, "female":0})
X_test["sex"]  = X_test["sex"].map({"male":1, "female":0})

In [16]:
encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")

X_train_region = encoder.fit_transform(X_train[["region"]])
X_test_region  = encoder.transform(X_test[["region"]])

region_cols = encoder.get_feature_names_out(["region"])

X_train_region = pd.DataFrame(X_train_region, columns=region_cols, index=X_train.index)
X_test_region  = pd.DataFrame(X_test_region,  columns=region_cols, index=X_test.index)

X_train = pd.concat([X_train.drop("region", axis=1), X_train_region], axis=1)
X_test  = pd.concat([X_test.drop("region", axis=1),  X_test_region], axis=1)

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [18]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

In [19]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [20]:
mae=mean_absolute_error(y_test,y_pred)
mse=mean_squared_error(y_test,y_pred)
r2_scr=r2_score(y_test,y_pred)
print("mean absolute error :",mae)
print("mean squared error :",mse)
print("r2_score  :",r2_scr)

mean absolute error : 2669.699558937313
mean squared error : 36371143.4697191
r2_score  : 0.7589577258086093


In [21]:
param = {
    "criterion" : ["squared_error","friedman_mse","absolute_error","poisson"],
    "splitter": ["best","random"],
    "max_depth": [1,2,3,4,5,42,None],
    "max_features" : ["sqrt","log2",None]
}

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
grid=GridSearchCV(estimator=DecisionTreeRegressor(),param_grid=param,cv=5)

In [31]:
grid.fit(X_train_scaled,y_train)

In [32]:
y_pred=grid.predict(X_test_scaled)

In [35]:
mae=mean_absolute_error(y_test,y_pred)
mse=mean_squared_error(y_test,y_pred)
r2_scr=r2_score(y_test,y_pred)
print("mean absolute error :",mae)
print("mean squared error :",mse)
print("r2_score  :",r2_scr)

mean absolute error : 2114.2170311014925
mean squared error : 22687317.456798293
r2_score  : 0.8496444688454297
