In [22]:
# import library 
import pandas as pd
import numpy as np
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.feature_selection import RFE

In [23]:
#load and read the dataset
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [24]:
dataset.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
charges     0
dtype: int64

In [25]:
# to find number of rows and col's
no_of_rows_and_col=dataset.shape
print("Number of rows and columns:",no_of_rows_and_col)
# Here the number of rows = 1338 and the columns = 6

Number of rows and columns: (1338, 6)


In [26]:
#to list the col names
dataset.columns
# here 'age', 'sex', 'bmi', 'children', 'smoker' - are input
#'charges' - are output

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [41]:
# # preprocessing to convert categorical value to numerical 

# dataset.sex[dataset.sex == 'male']=1
# dataset.sex[dataset.sex == 'female']=2

# dataset.smoker[dataset.smoker == 'yes']=1
# dataset.smoker[dataset.smoker == 'no']=0
# dataset

In [43]:
dataset = pd.get_dummies(dataset, drop_first = True)
dataset = dataset.astype(int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27,0,16884,0,1
1,18,33,1,1725,1,0
2,28,33,3,4449,1,0
3,33,22,0,21984,1,0
4,32,28,0,3866,1,0
...,...,...,...,...,...,...
1333,50,30,3,10600,1,0
1334,18,31,0,2205,0,0
1335,18,36,0,1629,0,0
1336,21,25,0,2007,0,0


In [44]:
# assign input and output value
independent = dataset[['age', 'sex_male', 'bmi', 'children', 'smoker_yes']]
print(independent)
dependent = dataset[['charges']]
print(dependent)

      age  sex_male  bmi  children  smoker_yes
0      19         0   27         0           1
1      18         1   33         1           0
2      28         1   33         3           0
3      33         1   22         0           0
4      32         1   28         0           0
...   ...       ...  ...       ...         ...
1333   50         1   30         3           0
1334   18         0   31         0           0
1335   18         0   36         0           0
1336   21         0   25         0           0
1337   61         0   29         0           1

[1338 rows x 5 columns]
      charges
0       16884
1        1725
2        4449
3       21984
4        3866
...       ...
1333    10600
1334     2205
1335     1629
1336     2007
1337    29141

[1338 rows x 1 columns]


In [45]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27,0,16884,0,1
1,18,33,1,1725,1,0
2,28,33,3,4449,1,0
3,33,22,0,21984,1,0
4,32,28,0,3866,1,0
...,...,...,...,...,...,...
1333,50,30,3,10600,1,0
1334,18,31,0,2205,0,0
1335,18,36,0,1629,0,0
1336,21,25,0,2007,0,0


In [46]:
# spliting training and testing data

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(independent,dependent,test_size=0.30,random_state=0)
X_train,X_test,Y_train,Y_test

(      age  sex_male  bmi  children  smoker_yes
 1163   18         0   28         0           0
 196    39         0   32         0           0
 438    52         0   46         5           0
 183    44         0   26         0           0
 1298   33         1   27         2           0
 ...   ...       ...  ...       ...         ...
 763    27         1   26         0           0
 835    42         1   35         2           0
 1216   40         1   25         0           0
 559    19         1   35         0           0
 684    33         0   18         1           0
 
 [936 rows x 5 columns],
       age  sex_male  bmi  children  smoker_yes
 578    52         1   30         1           0
 610    47         0   29         1           0
 569    48         1   40         2           1
 1034   61         1   38         0           0
 198    51         0   18         0           0
 ...   ...       ...  ...       ...         ...
 1261   28         1   37         1           0
 494    21   

In [113]:

# creating a func for select k best
def selectKbest(indep_X, dep_Y,n):
    # Refer - https://scikit-learn.org/stable/modules/feature_selection.htmlhttps://scikit-learn.org/stable/modules/feature_selection.html
    from sklearn.feature_selection import SelectKBest,chi2
    test = SelectKBest(score_func = chi2, k=n) # parameters are chi2 and k value
    fit1=test.fit(indep_X,dep_Y)# fitting input and output to the model
    selectkbest= fit1.transform(indep_X)  # # Transform indep_X to select only for k value features
    return selectkbest

In [114]:

# creating the function for std scalar and input output split
def split_scalar(indep_X,dep_Y):
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    X_train,X_test,Y_train,Y_test = train_test_split(indep_X,dep_Y,test_size = 0.25, random_state = 0)
    sc = StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,Y_train,Y_test

In [115]:
# creating a function for r2 and testing
def r2_prediction(regressor, X_test, Y_test):
    test_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2 = r2_score(Y_test, test_pred)
    return r2


In [116]:

# function for linear reg
def linear_reg(X_train,Y_train,X_test):
    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(X_train,Y_train)
    r2 = r2_prediction(regressor,X_test,Y_test)
    return r2
# func for svm
def svm_reg(X_train,Y_train,X_test):
    from sklearn.svm import SVR
    regressor = SVR(kernel ='linear')
    regressor.fit(X_train,Y_train)
    r2 = r2_prediction(regressor,X_test,Y_test)
    return r2
# func for decision tree
def decisiontree_reg(X_train,Y_train,X_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(random_state= 0)
    regressor.fit(X_train,Y_train)
    r2 =r2_prediction(regressor,X_test,Y_test)
    return r2
# func for randomforest
def randomforest_reg(X_train,Y_train,X_test):
    from sklearn.ensemble import RandomForestRegressor
    regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
    regressor.fit(X_train,Y_train)
    r2 = r2_prediction(regressor, X_test,Y_test)
    return r2

In [117]:
# creating func fot df to store the value
def selectK_reg(r2score_lin,r2score_svm,r2score_dt,r2score_rf):
    df = pd.DataFrame(index =['ChiSquare'],columns=['Linear','SVM','DecisionTree','RandomForest'])
    df
    for number,idex in enumerate(df.index):
        df['Linear'][idex] = r2score_lin[number]
        df['SVM'][idex] = r2score_svm[number]
        df['DecisionTree'][idex]= r2score_dt[number]
        df['RandomForest'][idex]= r2score_rf[number]
    return df

In [118]:
indep_X=df2.drop('charges', axis =1)
dep_Y =df2['charges']

In [119]:
kbest=selectKbest(indep_X,dep_Y,5)
r2score_lin=[]
r2score_svm=[]
r2score_dt=[]
r2score_rf=[]

In [120]:
X_train,X_test,Y_train,Y_test=split_scalar(kbest,dep_Y)
for i in kbest:
    r2_lin=linear_reg(X_train,Y_train,X_test)
    r2score_lin.append(r2_lin)
    
    r2_svm=svm_reg(X_train,Y_train,X_test)
    r2score_svm.append(r2_svm)
    
    r2_dT=decisiontree_reg(X_train,Y_train,X_test)
    r2score_dt.append(r2_dT)
    
    r2_rf=randomforest_reg(X_train,Y_train,X_test)
    r2score_rf.append(r2_rf)


In [121]:
result = selectK_reg(r2score_lin,r2score_svm,r2score_dt,r2score_rf)
result# 5

Unnamed: 0,Linear,SVM,DecisionTree,RandomForest
ChiSquare,0.794807,-0.010792,0.742552,0.850445


In [122]:
#Model creation
from sklearn.ensemble import RandomForestRegressor 
# regressor=RandomForestRegressor(n_estimators=50,criterion='poisson')
regressor=RandomForestRegressor(n_estimators=100,criterion='absolute_error',max_features='log2')
regressor=regressor.fit(X_train,Y_train)

In [123]:
# testing
test_pred= regressor.predict(X_test)
test_pred

array([10568.5  ,  9313.175, 43879.235, 13033.76 ,  9752.77 , 12610.305,
        2315.545, 12409.41 ,  7510.505,  5042.78 ,  6482.56 , 11468.035,
        8834.95 ,  5935.905, 23992.655, 11058.015, 13251.795,  3660.285,
        6766.865, 34363.085, 25139.52 , 15282.195, 11178.075, 25913.03 ,
        3114.4  ,  7180.775,  5484.845,  8664.67 ,  3934.215, 11829.61 ,
        8889.51 , 46269.15 , 14769.795, 12147.495, 18213.78 ,  4074.025,
        9714.635, 36251.645, 38611.795,  2299.325,  6448.25 ,  3162.855,
       22058.77 , 45306.255, 36358.045,  4514.465, 11485.045,  6497.445,
        7396.715, 12485.785,  3583.885, 10266.28 , 26615.115, 46683.215,
       11734.245, 10288.695,  4220.035,  9510.405,  9950.595, 14749.935,
        1628.69 , 43818.53 , 15770.975, 19028.8  , 10898.315, 10167.295,
       34476.515, 38101.16 ,  5247.39 ,  8985.99 , 14597.725, 12013.895,
       19112.085, 14267.515, 13341.465, 13120.165, 10340.51 , 17698.935,
       20891.62 , 44558.5  , 11524.845, 45954.82 , 

In [124]:
#Evalution Metrics
from sklearn.metrics import r2_score
r2 = r2_score(Y_test,test_pred)
r2
# r2 should be nearer to 1 here it is not performing good for this dataset

0.8815759272813894

In [125]:
# Saving the model
import pickle
filename= "RandomForestRegressor_insurance.sav"

In [126]:
pickle.dump(regressor,open(filename,'wb'))
load_model=pickle.load(open("RandomForestRegressor_insurance.sav",'rb'))
result=load_model.predict([[12,1,45,3,1]])
result

array([44940.585])

# Deployement - Phase 2

In [127]:
import pickle
load_model=pickle.load(open("RandomForestRegressor_insurance.sav",'rb'))
result=load_model.predict([[32,2,15,2,1]])
result

array([43282.53])

In [128]:
import pickle
load_model=pickle.load(open("RandomForestRegressor_insurance.sav",'rb'))
result=load_model.predict([[0,0,0,0,0]])
result

array([6902.405])