
## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset over here

In [2]:
data=pd.read_csv("insurance.csv")

In [3]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Taking care of duplicate observations over here

In [4]:
data.duplicated().sum()

1

In [5]:
data.drop_duplicates(inplace=True)

In [6]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [7]:
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

## Filtering all the numerical features over here

In [8]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

age
bmi
children
charges


In [9]:
data[numerical_features]

Unnamed: 0,age,bmi,children,charges
0,19,27.900,0,16884.92400
1,18,33.770,1,1725.55230
2,28,33.000,3,4449.46200
3,33,22.705,0,21984.47061
4,32,28.880,0,3866.85520
...,...,...,...,...
1333,50,30.970,3,10600.54830
1334,18,31.920,0,2205.98080
1335,18,36.850,0,1629.83350
1336,21,25.800,0,2007.94500


## Filtering all the categorical features over here

In [10]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

sex
smoker
region


In [11]:
data[cat_features]

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest
...,...,...,...
1333,male,no,northwest
1334,female,no,northeast
1335,female,no,southeast
1336,female,no,southwest


## Encoding all the categorical features into numerical features over here

In [12]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [13]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,0,0,16884.92400
1,18,1,33.770,1,1,1,1725.55230
2,28,1,33.000,3,1,1,4449.46200
3,33,1,22.705,0,1,2,21984.47061
4,32,1,28.880,0,1,2,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,1,2,10600.54830
1334,18,0,31.920,0,1,3,2205.98080
1335,18,0,36.850,0,1,1,1629.83350
1336,21,0,25.800,0,1,0,2007.94500


## Creating the features and labels over here

In [14]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set to avoid the problem over fitting over here

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training dataset over here

In [22]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [23]:
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
regressor=CatBoostRegressor()
regressor.fit(X_train,y_train)

Learning rate set to 0.041377
0:	learn: 11516.4390214	total: 47.4ms	remaining: 47.4s
1:	learn: 11168.3337559	total: 48.7ms	remaining: 24.3s
2:	learn: 10835.8451195	total: 49.4ms	remaining: 16.4s
3:	learn: 10525.4985402	total: 50.4ms	remaining: 12.6s
4:	learn: 10205.7136950	total: 51.5ms	remaining: 10.2s
5:	learn: 9902.8680314	total: 52.5ms	remaining: 8.69s
6:	learn: 9625.7411266	total: 53.5ms	remaining: 7.59s
7:	learn: 9360.3968876	total: 54.5ms	remaining: 6.75s
8:	learn: 9104.2478448	total: 55.4ms	remaining: 6.1s
9:	learn: 8869.3796070	total: 56.4ms	remaining: 5.59s
10:	learn: 8647.2014755	total: 57ms	remaining: 5.13s
11:	learn: 8419.6069117	total: 58ms	remaining: 4.77s
12:	learn: 8228.7712686	total: 59ms	remaining: 4.48s
13:	learn: 8022.4736981	total: 59.5ms	remaining: 4.19s
14:	learn: 7838.4562728	total: 60.5ms	remaining: 3.97s
15:	learn: 7655.5902203	total: 61.5ms	remaining: 3.78s
16:	learn: 7464.8390410	total: 62.4ms	remaining: 3.61s
17:	learn: 7304.1192427	total: 63ms	remaining: 

<catboost.core.CatBoostRegressor at 0x7febd89a61d0>

## Evaluating the performance of the model on the testing dataset over here

In [24]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[ 2455.45  1633.96]
 [10172.18  8547.69]
 [ 9593.71  9290.14]
 [30336.92 32548.34]
 [ 8876.82  9644.25]
 [ 5986.08  2680.95]
 [ 2539.14  2198.19]
 [ 1143.34  1241.57]
 [ 3422.53  2710.83]
 [12761.17 12235.84]
 [ 9931.88  8280.62]
 [16968.4  17043.34]
 [14749.29 13974.46]
 [ 9831.73  8219.2 ]
 [ 6203.18  5472.45]
 [ 3045.2   2438.06]
 [ 3895.17  5267.82]
 [ 8272.08  3490.55]
 [ 6126.95  6640.54]
 [14587.85 14692.67]
 [ 2896.51  1622.19]
 [13971.6  13224.69]
 [ 2795.6   1256.3 ]
 [ 4816.23  2643.27]
 [ 4060.77  1674.63]
 [ 7583.34  4667.61]
 [ 4191.48  3732.63]
 [11755.83 11552.9 ]
 [ 4113.07  3756.62]
 [39428.31 37465.34]
 [ 7530.    8059.68]
 [47012.17 47462.89]
 [10626.04 10577.09]
 [11138.1  20630.28]
 [17029.32 14571.89]
 [14606.19 36580.28]
 [ 9409.93  8347.16]
 [37524.74 51194.56]
 [ 8616.61  8428.07]
 [ 3414.31  1880.49]
 [31393.19 33475.82]
 [ 3872.51  2867.12]
 [11591.49  4564.19]
 [48021.46 47496.49]
 [36187.8  36149.48]
 [ 8842.11  8125.78]
 [11418.74 19749.38]
 [ 7306.66  7

## Checking the metrics over here for predicting the continuos numerical values it means for regression problem statement over here

In [25]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8461016555908627

In [27]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Mean squared error:", mse)
print("Mean absolute error:", mae)
print("Root mean squared error:", rmse)


Mean squared error: 25890251.837512877
Mean absolute error: 2736.6141389358254
Root mean squared error: 5088.246440328227


## Printing the most accurate predictions made by the model over here

In [30]:
data_diff = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
data_diff['Difference'] = data_diff['Actual'] - data_diff['Predicted']
data_diff['Absolute Difference'] = abs(data_diff['Difference'])
data_diff_filtered = data_diff[data_diff['Absolute Difference'] < 100]
data_diff_filtered

Unnamed: 0,Actual,Predicted,Difference,Absolute Difference
7,1241.565,1143.335884,98.229116,98.229116
11,17043.3414,16968.40407,74.93733,74.93733
32,10577.087,10626.037893,-48.950893,48.950893
44,36149.4835,36187.801015,-38.317515,38.317515
74,13012.20865,13060.879265,-48.670615,48.670615
82,4571.41305,4604.368719,-32.955669,32.955669
204,39871.7043,39943.518714,-71.814414,71.814414
207,6875.961,6908.591399,-32.630399,32.630399
208,6600.361,6570.569573,29.791427,29.791427
214,6282.235,6345.340489,-63.105489,63.105489
