## Multiple Linear Regression Model For Predicting Insurance Amount Based Upon Few Health Features

In [60]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



# Read CSV

In [61]:
df = pd.read_csv('D:\Personal\ML\MachineLearning\MultipleLinearRegression\insurance.csv')
print ('\n number of rows in dataframe are ' , df.shape[0])
print ('\n number of columns or features in dataframe are ' , df.shape[1])



 number of rows in dataframe are  1338

 number of columns or features in dataframe are  7


# EDA

In [62]:
# see few records of data frame 
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [63]:
# seperating out independant variables
X = df.iloc[:,:-1]
X


Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [64]:
y = df.iloc[:,6]
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

# Feature Engineering 

In [67]:
# convert categorical values into numerical 
male_or_female = pd.get_dummies(X['sex'],drop_first=False)
male_or_female

Unnamed: 0,female,male
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
1333,0,1
1334,1,0
1335,1,0
1336,1,0


In [68]:
smoker_or_nonsmoker = pd.get_dummies(X['smoker'],drop_first=False)
smoker_or_nonsmoker

Unnamed: 0,no,yes
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
1333,1,0
1334,1,0
1335,1,0
1336,1,0


In [70]:
region_ = pd.get_dummies(X['region'],drop_first=False)
region_

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [72]:
# drop sex categorical column from dataset 
X=X.drop('sex',axis=1)
X[0:3]

KeyError: "['sex'] not found in axis"

In [73]:
# similarly drop smoker and region
X = X.drop('smoker',axis = 1)
X[0:3]

Unnamed: 0,age,bmi,children,region
0,19,27.9,0,southwest
1,18,33.77,1,southeast
2,28,33.0,3,southeast


In [74]:
# drop region column
X = X.drop('region', axis = 1)
X[0:3]

Unnamed: 0,age,bmi,children
0,19,27.9,0
1,18,33.77,1
2,28,33.0,3


In [75]:
# concat the numerical forms of these categorical columns
X=pd.concat([X,male_or_female],axis=1)
X[0:3]

Unnamed: 0,age,bmi,children,female,male
0,19,27.9,0,1,0
1,18,33.77,1,0,1
2,28,33.0,3,0,1


In [76]:
X=pd.concat([X,smoker_or_nonsmoker],axis=1)
X[0:3]

Unnamed: 0,age,bmi,children,female,male,no,yes
0,19,27.9,0,1,0,0,1
1,18,33.77,1,0,1,1,0
2,28,33.0,3,0,1,1,0


In [77]:
X=pd.concat([X,region_],axis=1)
X[0:3]

Unnamed: 0,age,bmi,children,female,male,no,yes,northeast,northwest,southeast,southwest
0,19,27.9,0,1,0,0,1,0,0,0,1
1,18,33.77,1,0,1,1,0,0,0,1,0
2,28,33.0,3,0,1,1,0,0,0,1,0


In [78]:
# independent_feature = ['age','bmi','children','female','male','no','yes','northeast','northwest','southeast','southwest']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
model=scaler.fit(X)
X=model.transform(X)
print(X)

[[0.02173913 0.3212268  0.         ... 0.         0.         1.        ]
 [0.         0.47914985 0.2        ... 0.         1.         0.        ]
 [0.2173913  0.45843422 0.6        ... 0.         1.         0.        ]
 ...
 [0.         0.56201238 0.         ... 0.         1.         0.        ]
 [0.06521739 0.26472962 0.         ... 0.         0.         1.        ]
 [0.93478261 0.35270379 0.         ... 1.         0.         0.        ]]


# Train Test Split

In [79]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Model creation & Training

In [81]:
# now create the model and train it over training data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [82]:
# now predict the values for the test data
y_pred = model.predict(X_test)
y_pred

array([1.11699271e+04, 9.48670909e+03, 3.81811231e+04, 1.62663133e+04,
       6.91464801e+03, 3.96348488e+03, 1.57939691e+03, 1.43852566e+04,
       9.01257970e+03, 7.50846068e+03, 4.49176728e+03, 1.02795839e+04,
       8.80129751e+03, 3.79802013e+03, 2.79262010e+04, 1.07151158e+04,
       1.12889756e+04, 6.10501768e+03, 8.24104117e+03, 2.71445089e+04,
       3.36440910e+04, 1.43551043e+04, 1.17372043e+04, 3.21374335e+04,
       4.17005913e+03, 9.25496051e+03, 1.08433751e+03, 9.80417085e+03,
       3.77104596e+03, 1.04318587e+04, 9.00931722e+03, 4.00749509e+04,
       1.56889543e+04, 1.38794545e+04, 2.47597127e+04, 5.16638285e+03,
       1.26109277e+04, 3.07691018e+04, 3.35498325e+04, 3.67154946e+03,
       3.97568613e+03, 3.98729942e+03, 3.05285774e+04, 3.95053023e+04,
       2.78105036e+04, 5.09258923e+03, 1.06042481e+04, 7.82952256e+03,
       3.59255553e+03, 1.02128745e+04, 5.72038147e+03, 3.42627499e+03,
       3.30210242e+04, 3.84738218e+04, 1.60534782e+04, 7.16491905e+03,
      

# Model score

In [84]:
from sklearn.metrics import r2_score # r2_score is used to compare the values which are predicted by model and actual values
                                        # so that we can find out accuracy of our model
score=r2_score(y_test,y_pred)
score

0.7999876970680433