In [1]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Reading the Dataset
dataset = pd.read_csv('insurance_pre.csv')

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [5]:
# As the above dataset contains sex and smoker columns as a categorical value 
# We need to convert that into numerical value using below one hot encoding method.
dataset=pd.get_dummies(dataset,drop_first=True) # Removing the repeated value column

In [6]:
dataset # Once the categorical columns converted into numerical column values

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [7]:
indep=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']] # separating the input values
dep=dataset['charges'] # separating the output values

In [9]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(indep, dep, 
                                                    test_size = 1/3, random_state = 0)

In [10]:
# Data Preprocessing
# converting irrelavant numerical to a relevant numerical values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
# Consolidating the playable parameters probability and also considering the cross validation
# Considering different portions of a data set as a test and traint set
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVR
param_grid = {'kernel':['rbf','poly','sigmoid','linear'],
'C':[10,100,1000,2000,3000],'gamma':['auto','scale']} # Giving all possible HTPs
grid = GridSearchCV(SVR(), param_grid, refit = True, verbose = 3,n_jobs=-1) 
# refit = True means consider the best result model as a result model
# refit = False means consider the last combination model as a result model
# fitting the model for grid search
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid={'C': [10, 100, 1000, 2000, 3000],
                         'gamma': ['auto', 'scale'],
                         'kernel': ['rbf', 'poly', 'sigmoid', 'linear']},
             verbose=3)

In [13]:
# print best parameter after tuning
print(grid.best_params_)
re=grid.cv_results_
print("The R_score value for best parameter {}:".format(grid.best_params_))

{'C': 3000, 'gamma': 'scale', 'kernel': 'poly'}
The R_score value for best parameter {'C': 3000, 'gamma': 'scale', 'kernel': 'poly'}:


In [14]:
re


{'mean_fit_time': array([0.21347809, 0.08088751, 0.07510772, 0.06199389, 0.0769371 ,
        0.05136504, 0.07726593, 0.05319548, 0.06883736, 0.04687319,
        0.10540743, 0.12935567, 0.1041101 , 0.06479616, 0.07187266,
        0.06249833, 0.0781208 , 0.07499785, 0.07812161, 0.07282271,
        0.10208368, 0.0922689 , 0.08120108, 0.08405218, 0.13378386,
        0.20230427, 0.08437276, 0.13124304, 0.10937209, 0.20624204,
        0.11249557, 0.10312085, 0.11874504, 0.36561179, 0.09687195,
        0.11562123, 0.10312214, 0.23124099, 0.1031208 , 0.10624595]),
 'std_fit_time': array([9.21302084e-02, 8.37333522e-03, 6.97529411e-03, 8.82632222e-03,
        4.43030470e-03, 7.70929889e-03, 9.24865551e-03, 2.49620238e-03,
        6.37389662e-03, 2.37271410e-06, 3.86175624e-02, 1.26977988e-02,
        1.76249520e-02, 1.53783960e-02, 7.65469470e-03, 1.67233256e-06,
        2.00385107e-06, 1.16923608e-02, 9.88234395e-03, 8.60714399e-03,
        2.94518277e-02, 4.17699082e-03, 7.32078281e-03, 5.544

In [15]:
table=pd.DataFrame.from_dict(re) # converting the dictionary format into readable format

In [16]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.213478,0.09213,0.046428,0.016041,10,auto,rbf,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",-0.004176,0.022594,-0.118956,-0.082926,-0.103473,-0.057387,0.056205,35
1,0.080888,0.008373,0.025106,0.009019,10,auto,poly,"{'C': 10, 'gamma': 'auto', 'kernel': 'poly'}",0.04742,0.077536,-0.060527,-0.009476,-0.050823,0.000826,0.054025,32
2,0.075108,0.006975,0.012822,0.006441,10,auto,sigmoid,"{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}",0.044787,0.081689,-0.072355,-0.027541,-0.05147,-0.004978,0.058648,34
3,0.061994,0.008826,0.010572,0.003019,10,auto,linear,"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",0.387624,0.461268,0.288301,0.34054,0.297825,0.355112,0.063693,25
4,0.076937,0.00443,0.033492,0.009155,10,scale,rbf,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",-0.003956,0.022453,-0.119035,-0.082925,-0.10351,-0.057395,0.05623,36
5,0.051365,0.007709,0.012798,0.00254,10,scale,poly,"{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}",0.043648,0.07978,-0.059229,-0.009498,-0.050317,0.000877,0.053658,31
6,0.077266,0.009249,0.014816,0.000908,10,scale,sigmoid,"{'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}",0.043946,0.08223,-0.072132,-0.027546,-0.051337,-0.004968,0.058595,33
7,0.053195,0.002496,0.007423,0.004158,10,scale,linear,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",0.387624,0.461268,0.288301,0.34054,0.297825,0.355112,0.063693,25
8,0.068837,0.006374,0.02729,0.009787,100,auto,rbf,"{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}",0.303414,0.319385,0.155546,0.208414,0.161756,0.229703,0.069348,29
9,0.046873,2e-06,0.012499,0.00625,100,auto,poly,"{'C': 100, 'gamma': 'auto', 'kernel': 'poly'}",0.542212,0.566743,0.471172,0.537557,0.413719,0.506281,0.056081,22


In [17]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age:25
BMI:15
Children:3
Sex Male 0 or 1:0
Smoker Yes 0 or 1:1


In [18]:
Future_Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
#change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[2324356.48119273]
