In [3]:
# import pandas library

import pandas as pd

In [4]:
# load dataset

dataset=pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [5]:
# Categorical input data
# Nominal - one hot encoding

dataset=pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
#split input data

independent=dataset[['age', 'bmi', 'children',  'sex_male','smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [7]:
# split output data

dependent=dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [8]:
# Split training and testing data

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(independent, dependent, test_size=0.30, random_state=0)

In [9]:
# Standardization

from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [10]:
# Model Creation - SVM
# kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’
# C: float, default=1.0

# gridsearch and cross validation Model  Creation

# class sklearn.model_selection.GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVR

#Python dictionaries - are used to store data values in key:value pairs

param_grid={'kernel':['linear','rbf','poly','sigmoid'],
             'C':[10,100,1000,2000,3000],
             'gamma':['auto','scale']}

grid=GridSearchCV(SVR(), param_grid, refit=True, verbose=3, n_jobs=-1)


grid.fit(x_train,y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


  y = column_or_1d(y, warn=True)


In [11]:
# Print hyper tuning parameter and r2_score


#cv_results_ - sklearn attributes
# convert the table into dictionary fornat
# A dictionary with keys- column headers and values as columns

result=grid.cv_results_

# Evaluation Metrics

y_pred=grid.predict(x_test)

# r2_score

# best_params_ - sklearn attributes
# parameter gave the best results only if refit is specified

# format method in python - formats the specified value and insert them inside the string placeholders
# format placeholders is defined using {}


from sklearn.metrics import r2_score
r_score=r2_score(y_test,y_pred)
print("The r2_score with hyper tuning parameter : {} " .format(grid.best_params_) ,r_score)


The r2_score with hyper tuning parameter : {'C': 3000, 'gamma': 'scale', 'kernel': 'poly'}  0.859893008449439


In [12]:
# view in Table Format

# pandas data frame is a 2D two dimensional data structure like two dimensional array
# dataframe - a table with rows and columns

# from_dict() - create a pandas dataframe from python dictionary

table=pd.DataFrame.from_dict(result)
table

#rank test_score - 1 gives the best result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.341444,0.080847,0.040399,0.006399,10,auto,linear,"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",0.377969,0.479601,0.317872,0.337979,0.324422,0.367569,0.059777,25
1,0.214905,0.012822,0.111462,0.003526,10,auto,rbf,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",0.004055,0.013366,-0.103821,-0.095119,-0.101604,-0.056625,0.053504,35
2,0.171255,0.005454,0.041895,0.011468,10,auto,poly,"{'C': 10, 'gamma': 'auto', 'kernel': 'poly'}",0.056274,0.069532,-0.045601,-0.025079,-0.049592,0.001107,0.051309,32
3,0.23328,0.018434,0.046052,0.005785,10,auto,sigmoid,"{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}",0.049905,0.075905,-0.046585,-0.041004,-0.046507,-0.001657,0.053391,34
4,0.178913,0.010636,0.030078,0.004245,10,scale,linear,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",0.377969,0.479601,0.317872,0.337979,0.324422,0.367569,0.059777,25
5,0.216184,0.008779,0.111048,0.010427,10,scale,rbf,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.004126,0.013244,-0.103775,-0.095165,-0.101602,-0.056634,0.053486,36
6,0.169146,0.004118,0.037775,0.006705,10,scale,poly,"{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}",0.054964,0.071297,-0.046513,-0.024157,-0.049652,0.001188,0.051594,31
7,0.27628,0.015425,0.05586,0.013222,10,scale,sigmoid,"{'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}",0.049644,0.076323,-0.046798,-0.040824,-0.046521,-0.001635,0.053474,33
8,0.257133,0.030198,0.032718,0.003088,100,auto,linear,"{'C': 100, 'gamma': 'auto', 'kernel': 'linear'}",0.584474,0.655818,0.55338,0.576939,0.546881,0.583498,0.038787,19
9,0.203727,0.006909,0.110855,0.006327,100,auto,rbf,"{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}",0.300573,0.339474,0.173708,0.217991,0.183375,0.243024,0.065733,29


In [13]:
#Output from EndUser

age=float(input("Age :"))
bmi=float(input("BMI :"))
children=int(input("Children :"))
sex_male=int(input("Sex_male 1 or 0 :"))
smoker_yes=int(input("Smoker 1 0r 0 :"))

Age :23
BMI :34
Children :2
Sex_male 1 or 0 :0
Smoker 1 0r 0 :0


In [14]:
#Output Prediction

Prediction=grid.predict([[age,bmi,children,sex_male,smoker_yes]])
print("Future Prediction {} ".format(Prediction))

Future Prediction [672636.24550213] 
