In [1]:
#importing the libraries
import numpy as np #for array and matrices
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#Reading the Dataset
dataset = pd.read_csv("insurance_pre.csv")

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
dataset = pd.get_dummies(dataset, drop_first = True)

In [5]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [6]:
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
dependent = dataset[['charges']]

In [7]:
#split into training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size = 0.30, random_state = 0)

In [8]:
#Standardisation
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
#to calculate mean and variance
x_train = sc.fit_transform(x_train) #transform: to assign the transform values to x_train
x_test = sc.transform(x_test)

In [9]:
#GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
param_grid = {'criterion':['squared_error', 'friedman_mse', 'poisson' ], 'max_features':['auto', 'sqrt', 'log2'], 'splitter':['best', 'random']}

grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose = 3, n_jobs = -1) 
#fitting the model for grid search
grid.fit(x_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\User\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError

In [10]:
#print best parameter after tuning
#print(grid.best_params_)
result = grid.cv_results_

print("The R_score value for best parameter {}:".format(grid.best_params_))

The R_score value for best parameter {'criterion': 'poisson', 'max_features': 'log2', 'splitter': 'best'}:


In [11]:
table = pd.DataFrame.from_dict(result)

In [12]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0,0.0,0.0,0.0,squared_error,auto,best,"{'criterion': 'squared_error', 'max_features':...",,,,,,,,13
1,0.000673,0.000829,0.0,0.0,squared_error,auto,random,"{'criterion': 'squared_error', 'max_features':...",,,,,,,,13
2,0.010143,0.004923,0.000937,0.001874,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.629809,0.643403,0.67712,0.550342,0.513482,0.602831,0.061043,8
3,0.004745,0.004459,0.002342,0.004685,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.633907,0.616141,0.606731,0.546516,0.581001,0.596859,0.030423,9
4,0.010859,0.005619,0.001003,0.002005,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.727329,0.574621,0.511974,0.639831,0.670535,0.624858,0.074937,5
5,0.00089,0.001375,0.00448,0.004539,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.601663,0.659991,0.693138,0.505075,0.664783,0.62493,0.066897,4
6,0.001168,0.002337,0.0,0.0,friedman_mse,auto,best,"{'criterion': 'friedman_mse', 'max_features': ...",,,,,,,,13
7,0.001182,0.001405,0.0,0.0,friedman_mse,auto,random,"{'criterion': 'friedman_mse', 'max_features': ...",,,,,,,,13
8,0.006323,0.000663,0.001514,0.001874,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.687619,0.58363,0.672452,0.670807,0.619121,0.646726,0.039133,3
9,0.005058,0.002875,0.003963,0.002244,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.656904,0.501064,0.538192,0.581883,0.639801,0.583569,0.059007,11


In [13]:
age_input = int(input("Age: "))
bmi_input = float(input("BMI: "))
children_input = int(input("Children: "))
sex_male_input = int(input("Sex Male 0 or 1: "))
smoker_yes_input = int(input("Smoker yes 0 or 1: "))

Age: 19
BMI: 27.9
Children: 0
Sex Male 0 or 1: 0
Smoker yes 0 or 1: 1


In [14]:
new_data = [[age_input, bmi_input, children_input, sex_male_input, smoker_yes_input]]

In [16]:
new_data_standardized = sc.transform(new_data)



In [18]:
Future_Prediction = grid.predict(new_data_standardized)
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[16884.924]
