In [1]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Reading the Dataset
dataset = pd.read_csv('insurance_pre.csv')

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
# As the above dataset contains sex and smoker columns as a categorical value 
# We need to convert that into numerical value using below one hot encoding method.
dataset=pd.get_dummies(dataset,drop_first=True) # Removing the repeated value column

In [5]:
dataset # Once the categorical columns converted into numerical column values

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
indep=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']] # separating the input values
dep=dataset['charges'] # separating the output values

In [7]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(indep, dep, 
                                                    test_size = 1/3, random_state = 0)

In [8]:
# Data Preprocessing
# converting irrelavant numerical to a relevant numerical values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
# Consolidating the playable parameters probability and also considering the cross validation
# Considering different portions of a data set as a test and traint set
from sklearn.model_selection import GridSearchCV 
from sklearn.tree import DecisionTreeRegressor
param_grid = {'criterion':['mse','mae','friedman_mse'],
'max_features': ['auto','sqrt','log2'],
'splitter':['best','random']} # Giving all possible HTPs
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose = 3,n_jobs=-1) 
# refit = True means consider the best result model as a result model
# refit = False means consider the last combination model as a result model
# fitting the model for grid search
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae', 'friedman_mse'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             verbose=3)

In [10]:
# print best parameter after tuning
print(grid.best_params_)
re=grid.cv_results_
print("The R_score value for best parameter {}:".format(grid.best_params_))

{'criterion': 'friedman_mse', 'max_features': 'auto', 'splitter': 'random'}
The R_score value for best parameter {'criterion': 'friedman_mse', 'max_features': 'auto', 'splitter': 'random'}:


In [11]:
re


{'mean_fit_time': array([0.02187371, 0.0156251 , 0.00624943, 0.00937428, 0.01250072,
        0.00937347, 0.0374979 , 0.02812419, 0.01875067, 0.01249914,
        0.01562424, 0.01562386, 0.00937424, 0.00625024, 0.00625005,
        0.00312567, 0.00312591, 0.00625024]),
 'std_fit_time': array([7.65414951e-03, 2.34863479e-06, 7.65395465e-03, 1.24990463e-02,
        6.25035763e-03, 7.65340974e-03, 7.65374063e-03, 6.24995248e-03,
        6.24883207e-03, 6.24957096e-03, 8.44957597e-07, 1.53331139e-06,
        7.65403274e-03, 7.65494768e-03, 7.65471386e-03, 6.25133514e-03,
        6.25181198e-03, 7.65494758e-03]),
 'mean_score_time': array([0.00312543, 0.        , 0.0062501 , 0.00312529, 0.        ,
        0.00312567, 0.        , 0.00312552, 0.00312443, 0.00312552,
        0.        , 0.00625072, 0.        , 0.        , 0.00312419,
        0.        , 0.        , 0.        ]),
 'std_score_time': array([0.00625086, 0.        , 0.00765477, 0.00625057, 0.        ,
        0.00625134, 0.        , 

In [12]:
table=pd.DataFrame.from_dict(re) # converting the dictionary format into readable format

In [13]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.021874,0.00765415,0.003125,0.006251,mse,auto,best,"{'criterion': 'mse', 'max_features': 'auto', '...",0.679653,0.477308,0.744481,0.610653,0.664095,0.635238,0.089762,5
1,0.015625,2.348635e-06,0.0,0.0,mse,auto,random,"{'criterion': 'mse', 'max_features': 'auto', '...",0.647768,0.57792,0.679514,0.586931,0.636579,0.625742,0.038177,7
2,0.006249,0.007653955,0.00625,0.007655,mse,sqrt,best,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.595681,0.594904,0.634687,0.578947,0.481748,0.577193,0.051135,11
3,0.009374,0.01249905,0.003125,0.006251,mse,sqrt,random,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.684014,0.476552,0.605182,0.531057,0.52821,0.565003,0.07226,13
4,0.012501,0.006250358,0.0,0.0,mse,log2,best,"{'criterion': 'mse', 'max_features': 'log2', '...",0.665982,0.579686,0.672155,0.649538,0.621956,0.637863,0.033884,4
5,0.009373,0.00765341,0.003126,0.006251,mse,log2,random,"{'criterion': 'mse', 'max_features': 'log2', '...",0.515702,0.471248,0.570036,0.62268,0.545436,0.54502,0.050909,17
6,0.037498,0.007653741,0.0,0.0,mae,auto,best,"{'criterion': 'mae', 'max_features': 'auto', '...",0.656754,0.528775,0.694702,0.599682,0.651828,0.626348,0.057407,6
7,0.028124,0.006249952,0.003126,0.006251,mae,auto,random,"{'criterion': 'mae', 'max_features': 'auto', '...",0.593244,0.688882,0.694482,0.592617,0.660251,0.645895,0.044778,3
8,0.018751,0.006248832,0.003124,0.006249,mae,sqrt,best,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.580485,0.713543,0.66239,0.509062,0.578005,0.608697,0.071471,9
9,0.012499,0.006249571,0.003126,0.006251,mae,sqrt,random,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.661435,0.597916,0.607871,0.556298,0.569109,0.598526,0.036589,10


In [14]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age:56
BMI:23
Children:1
Sex Male 0 or 1:1
Smoker Yes 0 or 1:1


In [15]:
Future_Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
#change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[15230.32405]
