In [1]:
# Data Collection
import pandas as pd
dataset=pd.read_csv("insurance_pre.csv")
dataset


Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [2]:
# Data preprocessing - nominal to numerical data - one hot encoding method
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
indep=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
dep=dataset[['charges']]
dep

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [5]:
# split train and test set
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(indep,dep,test_size=0.3, random_state=0)
X_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
1163,18,28.215,0,0,0
196,39,32.800,0,0,0
438,52,46.750,5,0,0
183,44,26.410,0,0,0
1298,33,27.455,2,1,0
...,...,...,...,...,...
763,27,26.030,0,1,0
835,42,35.970,2,1,0
1216,40,25.080,0,1,0
559,19,35.530,0,1,0


In [6]:
# Standardisation-Pre processing method - rescales data to a consistent format-easier to understand and use
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)
X_test

array([[ 0.89459283, -0.08863026, -0.06746417,  1.02378711, -0.50466988],
       [ 0.53757957, -0.22180837, -0.06746417, -0.97676557, -0.50466988],
       [ 0.60898222,  1.57449152,  0.76341038,  1.02378711,  1.98149332],
       ...,
       [ 1.10880078,  1.20785059, -0.89833872,  1.02378711, -0.50466988],
       [ 1.75142463,  1.34905148, -0.06746417,  1.02378711, -0.50466988],
       [ 1.60861933, -0.92299913, -0.89833872, -0.97676557, -0.50466988]])

In [7]:
# model creation 
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
param_grid={
    'criterion':["squared_error", "friedman_mse", "absolute_error", "poisson"],
    'max_features':['sqrt','log2',None],
    'splitter':['best','random']
}
grid=GridSearchCV(DecisionTreeRegressor(),param_grid, refit=True, verbose=3,n_jobs=-1)
grid.fit(X_train,Y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [8]:
#print the best parameter after tunning
grid.best_params_

{'criterion': 'absolute_error', 'max_features': None, 'splitter': 'random'}

In [9]:
y_pred=grid.predict(X_test)
y_pred

array([ 9957.7216  ,  8930.93455 , 47269.854   , 12574.049   ,
        9264.797   , 11737.84884 ,  1615.7667  , 10560.4917  ,
        5934.3798  ,  5253.524   ,  7228.21565 , 10156.7832  ,
        7639.41745 ,  4571.41305 , 18246.4955  , 10600.5483  ,
       12142.5786  ,  3292.52985 ,  6455.86265 , 33307.5508  ,
       24869.8368  , 12646.207   ,  9625.92    , 37829.7242  ,
        1826.843   ,  4076.497   ,  3558.62025 ,  7418.522   ,
        3757.8448  ,  8027.968   ,  8252.2843  , 48673.5588  ,
       12981.3457  , 20781.48892 , 13747.87235 ,  3554.203   ,
        8733.22925 , 44585.45587 , 39125.33225 ,  1880.07    ,
        5266.3656  ,  2866.091   , 38245.59327 , 48673.5588  ,
       35585.576   ,  3579.8287  , 10600.5483  ,  6849.026   ,
        4719.52405 , 11830.6072  , 13126.67745 ,  4234.927   ,
       37829.7242  , 46661.4424  , 11856.4115  ,  2689.4954  ,
        2897.3235  ,  8835.26495 ,  7441.501   , 14119.62    ,
        1639.5631  , 47291.055   , 36910.60803 , 25900.

In [10]:
from sklearn.metrics import r2_score
r2=r2_score(Y_test,y_pred)
print(f"R2 score = {r2:.2f}")

R2 score = 0.71


In [11]:
re=grid.cv_results_
table=pd.DataFrame.from_dict(re)


In [12]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.012897,0.000544,0.0006,0.0012,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.675284,0.628823,0.561522,0.659261,0.578677,0.620714,0.044273,10
1,0.008013,0.003566,0.000565,0.001131,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.587236,0.564767,0.614312,0.483879,0.707751,0.591589,0.072595,19
2,0.007572,0.004086,0.000476,0.000952,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.722721,0.493811,0.624633,0.600974,0.628433,0.614114,0.073153,14
3,0.006018,0.004847,0.005707,0.00488,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.662777,0.678976,0.531214,0.568507,0.639157,0.616126,0.056806,13
4,0.009409,0.004958,0.003331,0.006661,squared_error,,best,"{'criterion': 'squared_error', 'max_features':...",0.717373,0.557054,0.790186,0.641832,0.659762,0.673241,0.077835,4
5,0.01258,0.006843,0.002724,0.003904,squared_error,,random,"{'criterion': 'squared_error', 'max_features':...",0.730887,0.639647,0.671911,0.674393,0.653806,0.674129,0.031076,3
6,0.012412,0.005177,0.003124,0.003863,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.71664,0.574998,0.650326,0.607925,0.547141,0.619406,0.05956,12
7,0.008496,0.003722,0.002024,0.004049,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.589865,0.48103,0.472613,0.457531,0.638768,0.527962,0.072576,24
8,0.008576,0.004405,0.00455,0.004591,friedman_mse,log2,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.670451,0.631281,0.66663,0.663506,0.612233,0.64882,0.023009,6
9,0.006051,0.004941,0.004035,0.004941,friedman_mse,log2,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.648213,0.555115,0.507743,0.595598,0.573687,0.576071,0.046259,20
