In [1]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Reading the Dataset
dataset = pd.read_csv('insurance_pre.csv')

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
# As the above dataset contains sex and smoker columns as a categorical value 
# We need to convert that into numerical value using below one hot encoding method.
dataset=pd.get_dummies(dataset,drop_first=True) # Removing the repeated value column

In [5]:
dataset # Once the categorical columns converted into numerical column values

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
indep=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']] # separating the input values
dep=dataset['charges'] # separating the output values

In [7]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(indep, dep, 
                                                    test_size = 1/3, random_state = 0)

In [8]:
# Data Preprocessing
# converting irrelavant numerical to a relevant numerical values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
# Consolidating the playable parameters probability and also considering the cross validation
# Considering different portions of a data set as a test and traint set
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestRegressor
param_grid = {'criterion':['mse','mae'],
'max_features': ['auto','sqrt','log2'],
'n_estimators':[10,100]} # Giving all possible HTPs
grid = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose = 3,n_jobs=-1) 
# refit = True means consider the best result model as a result model
# refit = False means consider the last combination model as a result model
# fitting the model for grid search
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 100]},
             verbose=3)

In [10]:
# print best parameter after tuning
print(grid.best_params_)
re=grid.cv_results_
print("The R_score value for best parameter {}:".format(grid.best_params_))

{'criterion': 'mae', 'max_features': 'log2', 'n_estimators': 100}
The R_score value for best parameter {'criterion': 'mae', 'max_features': 'log2', 'n_estimators': 100}:


In [11]:
re


{'mean_fit_time': array([0.05937343, 0.35936208, 0.03124881, 0.29998794, 0.04062467,
        0.28748932, 0.17499332, 1.60931826, 0.09999614, 0.98121719,
        0.10624619, 1.03653336]),
 'std_fit_time': array([1.16916221e-02, 2.42062999e-02, 5.84003864e-07, 1.16929089e-02,
        7.65292321e-03, 7.65335129e-03, 6.24868903e-03, 4.19247996e-02,
        7.65440269e-03, 1.82225588e-02, 6.24940420e-03, 5.01474914e-02]),
 'mean_score_time': array([0.        , 0.02187643, 0.0093749 , 0.02500081, 0.0031239 ,
        0.02500124, 0.00312533, 0.02187538, 0.00624981, 0.0218751 ,
        0.00625057, 0.02500172]),
 'std_score_time': array([0.        , 0.00765358, 0.00765458, 0.00765551, 0.00624781,
        0.00765577, 0.00625067, 0.00765405, 0.00765442, 0.00765351,
        0.00765536, 0.00765362]),
 'param_criterion': masked_array(data=['mse', 'mse', 'mse', 'mse', 'mse', 'mse', 'mae', 'mae',
                    'mae', 'mae', 'mae', 'mae'],
              mask=[False, False, False, False, False, Fal

In [12]:
table=pd.DataFrame.from_dict(re) # converting the dictionary format into readable format

In [13]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.059373,0.01169162,0.0,0.0,mse,auto,10,"{'criterion': 'mse', 'max_features': 'auto', '...",0.770927,0.731436,0.829589,0.80902,0.750547,0.778304,0.036318,9
1,0.359362,0.0242063,0.021876,0.007654,mse,auto,100,"{'criterion': 'mse', 'max_features': 'auto', '...",0.803984,0.747931,0.833916,0.810307,0.764771,0.792182,0.031366,5
2,0.031249,5.840039e-07,0.009375,0.007655,mse,sqrt,10,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.785398,0.76918,0.795484,0.815086,0.721456,0.777321,0.03165,10
3,0.299988,0.01169291,0.025001,0.007656,mse,sqrt,100,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.802309,0.76985,0.838034,0.827214,0.76384,0.80025,0.029694,3
4,0.040625,0.007652923,0.003124,0.006248,mse,log2,10,"{'criterion': 'mse', 'max_features': 'log2', '...",0.811838,0.74245,0.824952,0.77589,0.758247,0.782675,0.031301,8
5,0.287489,0.007653351,0.025001,0.007656,mse,log2,100,"{'criterion': 'mse', 'max_features': 'log2', '...",0.804669,0.770489,0.836053,0.828968,0.758067,0.799649,0.03095,4
6,0.174993,0.006248689,0.003125,0.006251,mae,auto,10,"{'criterion': 'mae', 'max_features': 'auto', '...",0.769252,0.755641,0.807703,0.801138,0.738911,0.774529,0.026313,11
7,1.609318,0.0419248,0.021875,0.007654,mae,auto,100,"{'criterion': 'mae', 'max_features': 'auto', '...",0.781457,0.769767,0.816909,0.808149,0.759021,0.787061,0.022146,7
8,0.099996,0.007654403,0.00625,0.007654,mae,sqrt,10,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.787194,0.732083,0.782124,0.799725,0.737965,0.767818,0.027446,12
9,0.981217,0.01822256,0.021875,0.007654,mae,sqrt,100,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.808449,0.769715,0.839186,0.824586,0.763085,0.801004,0.029954,2


In [14]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age:22
BMI:11
Children:10
Sex Male 0 or 1:1
Smoker Yes 0 or 1:1


In [15]:
Future_Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
#change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[45486.8681582]
