In [1]:
# import pandas library

import pandas as pd

In [2]:
# load dataset

dataset=pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
# Categorical input data
# Nominal - one hot encoding

dataset=pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
#split input data

independent=dataset[['age', 'bmi', 'children',  'sex_male','smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
# split output data

dependent=dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [6]:
# Split training and testing data

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(independent, dependent, test_size=0.30, random_state=0)

In [7]:
# Standardization

from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [9]:
# GridSearchCVModel Creation - Random Forest

# n_estimators:int, default=100
# max_features{“sqrt”, “log2”, None}, int or float, default=1.0
# criterion{“squared_error”, “absolute_error”, “friedman_mse”, “poisson”}, default=”squared_error”

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid ={'criterion':['squared_error', 'friedman_mse','absolute_error','poisson'], 
              'max_features':['sqrt','log2',None],
              'n_estimators':[100,200]}

grid=GridSearchCV(RandomForestRegressor(), param_grid, refit=True, verbose=3, n_jobs=-1)

grid.fit(x_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [10]:
#Print r2_score with hyper tuning parameter

result=grid.cv_results_

# Evaluation Metrics

y_pred=grid.predict(x_test)

# r2_score

from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)
print("The r2_score with hyper tuning parameter {}".format(grid.best_params_),r_score)

The r2_score with hyper tuning parameter {'criterion': 'squared_error', 'max_features': 'sqrt', 'n_estimators': 100} 0.8669939654818107


In [11]:
# View in table format

table=pd.DataFrame.from_dict(result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.471177,0.035322,0.075956,0.018034,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.866991,0.800385,0.811587,0.829947,0.769332,0.815648,0.032358,1
1,2.881666,0.026818,0.115205,0.009904,squared_error,sqrt,200,"{'criterion': 'squared_error', 'max_features':...",0.864559,0.795339,0.807342,0.83094,0.769583,0.813553,0.032273,5
2,1.438854,0.052389,0.064661,0.003333,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.865855,0.786223,0.807498,0.833995,0.767405,0.812195,0.034806,11
3,2.865328,0.053416,0.122581,0.005233,squared_error,log2,200,"{'criterion': 'squared_error', 'max_features':...",0.864273,0.790886,0.811488,0.827562,0.769345,0.812711,0.032366,10
4,1.81266,0.032658,0.066827,0.008556,squared_error,,100,"{'criterion': 'squared_error', 'max_features':...",0.860279,0.758627,0.817053,0.797009,0.769193,0.800432,0.036313,22
5,3.554523,0.053867,0.117384,0.009848,squared_error,,200,"{'criterion': 'squared_error', 'max_features':...",0.866505,0.767709,0.818856,0.805343,0.770569,0.805797,0.036188,17
6,1.438675,0.040604,0.063528,0.009838,friedman_mse,sqrt,100,"{'criterion': 'friedman_mse', 'max_features': ...",0.86504,0.787598,0.79905,0.826538,0.77012,0.809669,0.03321,16
7,2.898233,0.028363,0.119872,0.014531,friedman_mse,sqrt,200,"{'criterion': 'friedman_mse', 'max_features': ...",0.865896,0.788537,0.810644,0.826857,0.772196,0.812826,0.032431,8
8,1.494496,0.049637,0.062222,0.006016,friedman_mse,log2,100,"{'criterion': 'friedman_mse', 'max_features': ...",0.864281,0.78747,0.814369,0.828444,0.774689,0.813851,0.031579,3
9,2.933463,0.04373,0.129022,0.012227,friedman_mse,log2,200,"{'criterion': 'friedman_mse', 'max_features': ...",0.865623,0.787865,0.808614,0.83034,0.774667,0.813422,0.032216,6


In [12]:
#Output from User


age=float(input("Age :"))
bmi=float(input("BMI :"))
children=int(input("Children :"))
sex_male=int(input("Sex_male 1 or 0 :"))
smoker_yes=int(input("Smoker 1 0r 0 :"))

Age :2
BMI :2
Children :3
Sex_male 1 or 0 :1
Smoker 1 0r 0 :0


In [14]:
#Output Prediction

Prediction=grid.predict([[age,bmi,children,sex_male,smoker_yes]])
print("Future_Prediction {}".format(Prediction))

Future_Prediction [18693.0983874]
