In [1]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Reading the Dataset
dataset = pd.read_csv('insurance_pre.csv')

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
# As the above dataset contains sex and smoker columns as a categorical value 
# We need to convert that into numerical value using below one hot encoding method.
dataset=pd.get_dummies(dataset,drop_first=True) # Removing the repeated value column

In [5]:
dataset # Once the categorical columns converted into numerical column values

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
indep=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']] # separating the input values
dep=dataset['charges'] # separating the output values

In [7]:
# Consolidating the playable parameters probability and also considering the cross validation
# Considering different portions of a data set as a test and traint set
from sklearn.model_selection import GridSearchCV 
from sklearn.tree import DecisionTreeRegressor
param_grid = {'criterion':['mse','mae','friedman_mse'],
'max_features': ['auto','sqrt','log2'],
'splitter':['best','random']} # Giving all possible HTPs
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose = 3,n_jobs=-1) 
# refit = True means consider the best result model as a result model
# refit = False means consider the last combination model as a result model
# fitting the model for grid search
grid.fit(indep, dep) # Passing whole dataset input and output variable

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae', 'friedman_mse'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             verbose=3)

In [8]:
# print best parameter after tuning
print(grid.best_params_)
re=grid.cv_results_
print("The R_score value for best parameter {}:".format(grid.best_params_))

{'criterion': 'friedman_mse', 'max_features': 'auto', 'splitter': 'random'}
The R_score value for best parameter {'criterion': 'friedman_mse', 'max_features': 'auto', 'splitter': 'random'}:


In [9]:
re


{'mean_fit_time': array([0.59655838, 0.06474934, 0.05782366, 0.05010753, 0.04945016,
        0.2232389 , 0.17648354, 0.12910399, 0.08861856, 0.10378251,
        0.09483309, 0.12499661, 0.00937667, 0.00937548, 0.0194418 ,
        0.00479584, 0.00937638, 0.01001058]),
 'std_fit_time': array([8.92114949e-01, 2.23049663e-02, 8.40151852e-03, 7.06210375e-04,
        1.48430240e-03, 3.12128813e-01, 4.97199305e-02, 3.67601366e-02,
        2.77225148e-02, 7.50030263e-02, 1.52952970e-02, 1.25385293e-01,
        7.65601844e-03, 7.65504498e-03, 7.35655973e-03, 6.00825181e-03,
        7.65578457e-03, 8.25524895e-03]),
 'mean_score_time': array([0.00860682, 0.00865026, 0.00299864, 0.00619683, 0.00586662,
        0.00314069, 0.00586214, 0.00693674, 0.01593981, 0.00937471,
        0.01787338, 0.00937519, 0.00625057, 0.00937428, 0.00759587,
        0.01458173, 0.00625124, 0.0074512 ]),
 'std_score_time': array([0.00600465, 0.00610311, 0.00244838, 0.00097741, 0.00306656,
        0.00410312, 0.00587955, 

In [10]:
table=pd.DataFrame.from_dict(re) # converting the dictionary format into readable format

In [11]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.596558,0.892115,0.008607,0.006005,mse,auto,best,"{'criterion': 'mse', 'max_features': 'auto', '...",0.726843,0.611288,0.729247,0.729167,0.670979,0.693505,0.04675,7
1,0.064749,0.022305,0.00865,0.006103,mse,auto,random,"{'criterion': 'mse', 'max_features': 'auto', '...",0.692857,0.667965,0.742507,0.705941,0.665882,0.69503,0.028142,6
2,0.057824,0.008402,0.002999,0.002448,mse,sqrt,best,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.689479,0.536126,0.732611,0.630609,0.694386,0.656642,0.068522,12
3,0.050108,0.000706,0.006197,0.000977,mse,sqrt,random,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.641425,0.571571,0.749514,0.642886,0.618269,0.644733,0.058389,15
4,0.04945,0.001484,0.005867,0.003067,mse,log2,best,"{'criterion': 'mse', 'max_features': 'log2', '...",0.802628,0.638024,0.732854,0.642703,0.716899,0.706622,0.061323,2
5,0.223239,0.312129,0.003141,0.004103,mse,log2,random,"{'criterion': 'mse', 'max_features': 'log2', '...",0.719896,0.544745,0.662609,0.72491,0.68135,0.666702,0.065303,10
6,0.176484,0.04972,0.005862,0.00588,mae,auto,best,"{'criterion': 'mae', 'max_features': 'auto', '...",0.725953,0.56524,0.717025,0.723436,0.752898,0.696911,0.066968,5
7,0.129104,0.03676,0.006937,0.011538,mae,auto,random,"{'criterion': 'mae', 'max_features': 'auto', '...",0.655678,0.627556,0.735365,0.733465,0.732625,0.696938,0.046045,4
8,0.088619,0.027723,0.01594,0.017127,mae,sqrt,best,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.725227,0.444973,0.710124,0.726384,0.670398,0.655421,0.107158,13
9,0.103783,0.075003,0.009375,0.012499,mae,sqrt,random,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.671624,0.514189,0.700399,0.668121,0.616739,0.634214,0.065789,18


In [12]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age:33
BMI:1
Children:3
Sex Male 0 or 1:1
Smoker Yes 0 or 1:0


In [13]:
Future_Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
#change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[5253.524]


  "X does not have valid feature names, but"
