In [1]:
# import pandas library

import pandas as pd

In [2]:
# load dataset

dataset=pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
# Categorical input data
# Nominal - one hot encoding

dataset=pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
#split input data

independent=dataset[['age', 'bmi', 'children',  'sex_male','smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
# split output data

dependent=dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [6]:
# Split training and testing data

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(independent, dependent, test_size=0.30, random_state=0)

In [7]:
# Standardization

from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [10]:
# GridSearchCV Model Creation - Decision Tree

# criterion{“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, default=”squared_error”
# splitter{“best”, “random”}, default=”best”
# max_featuresint, float or {“auto”, “sqrt”, “log2”}, default=None

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid= { 'criterion':['squared_error', 'friedman_mse', 'absolute_error','poisson'],
               'splitter':['best','random'],
            'max_features':['auto', 'sqrt','log2']}

grid=GridSearchCV(DecisionTreeRegressor(), param_grid, refit=True, verbose=3, n_jobs=-1)

grid.fit(x_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




In [14]:
#Print hyper tuning parameter and r2_score

result=grid.cv_results_

# Evaluation Metrics

y_pred=grid.predict(x_test)

# r2_score

from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)
print("The r2_score with hyper tuning parameter {}".format(grid.best_params_),r_score)

The r2_score with hyper tuning parameter {'criterion': 'friedman_mse', 'max_features': 'auto', 'splitter': 'best'} 0.6959287354824849


In [16]:
# view in table format

table=pd.DataFrame.from_dict(result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.07245,0.028151,0.007864,0.003956,squared_error,auto,best,"{'criterion': 'squared_error', 'max_features':...",0.743249,0.538869,0.761895,0.644926,0.679458,0.673679,0.079569,2
1,0.022384,0.004077,0.009396,0.002496,squared_error,auto,random,"{'criterion': 'squared_error', 'max_features':...",0.689811,0.574291,0.715379,0.637421,0.610433,0.645467,0.051405,7
2,0.018188,0.003708,0.007795,0.000979,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.724092,0.566608,0.74257,0.460592,0.609691,0.62071,0.104138,14
3,0.016989,0.002279,0.009594,0.002869,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.686899,0.588631,0.634375,0.417167,0.625865,0.590587,0.092213,19
4,0.020187,0.002992,0.008595,0.0012,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.714473,0.53693,0.676795,0.566908,0.685623,0.636146,0.070531,9
5,0.016589,0.002497,0.01659,0.008471,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.750755,0.632041,0.595905,0.446955,0.59706,0.604543,0.097055,18
6,0.037176,0.00676,0.009595,0.002726,friedman_mse,auto,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.751913,0.562764,0.784073,0.621522,0.668368,0.677728,0.08158,1
7,0.029182,0.003865,0.019988,0.009731,friedman_mse,auto,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.56036,0.600597,0.56899,0.709646,0.657323,0.619383,0.056503,15
8,0.023784,0.00172,0.012594,0.002937,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.764627,0.601906,0.67655,0.622076,0.650915,0.663215,0.056683,3
9,0.022783,0.003428,0.012394,0.001854,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.721174,0.596159,0.630603,0.447648,0.555454,0.590208,0.089834,20


In [17]:
#Output

age=float(input("Age :"))
bmi=float(input("BMI :"))
children=int(input("Children :"))
sex_male=int(input("Sex_male 1 or 0 :"))
smoker_yes=int(input("Smoker 1 0r 0 :"))

Age :12
BMI :23
Children :3
Sex_male 1 or 0 :1
Smoker 1 0r 0 :1


In [19]:
#Output Prediction

Prediction=grid.predict([[age,bmi,children,sex_male,smoker_yes]])
print("Future Prediction {}".format(Prediction))

Future Prediction [63770.42801]
