In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
label_sex = {
    'male': 1,
    'female': 0
}

label_smoker ={
    'yes' : 1,
    'no' : 0
}

label_region ={
    'northwest' : 0,
    'southwest' : 1,
    'southeast' : 2,
    'northeast' : 3 
}

df['sex'] = df['sex'].map(label_sex)
df['smoker'] = df['smoker'].map(label_smoker)
df['region'] = df['region'].map(label_region)

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,0,21984.47061
4,32,1,28.88,0,0,0,3866.8552


In [5]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = df[['age', 'bmi', 'region','sex', 'smoker', 'children']]
  
data_vif = pd.DataFrame()
data_vif["feature"] = X.columns
  
data_vif["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
  
print(data_vif)

    feature       VIF
0       age  7.588734
1       bmi  9.938670
2    region  2.789478
3       sex  2.002652
4    smoker  1.261195
5  children  1.799980


In [6]:
df = df.drop(df.columns[0], axis=1)
df = df.drop(df.columns[1], axis=1)
df.head()

Unnamed: 0,sex,children,smoker,region,charges
0,0,0,1,1,16884.924
1,1,1,0,2,1725.5523
2,1,3,0,2,4449.462
3,1,0,0,0,21984.47061
4,1,0,0,0,3866.8552


In [7]:
new_data = df
new_data.head()

Unnamed: 0,sex,children,smoker,region,charges
0,0,0,1,1,16884.924
1,1,1,0,2,1725.5523
2,1,3,0,2,4449.462
3,1,0,0,0,21984.47061
4,1,0,0,0,3866.8552


In [8]:
X = new_data.iloc[:, :-1].values
y = new_data.iloc[:, 4].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

y = y.reshape(len(y), 1)
y.shape

(1338, 1)

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

ytp = np.concatenate((y_test, y_pred))
ytp

array([ 3227.1211    , 36307.7983    ,  7371.772     ,  8125.7845    ,
       14571.8908    ,  7077.1894    , 13607.36875   , 20149.3229    ,
        9630.397     , 21098.55405   , 40419.0191    ,  4686.3887    ,
       13555.0049    ,  8835.26495   , 10806.839     ,  4433.9159    ,
        4399.731     , 46255.1125    , 25382.297     ,  6799.458     ,
       10928.849     ,  2154.361     ,  6272.4772    , 38709.176     ,
       11163.568     , 17178.6824    , 33471.97189   ,  1131.5066    ,
        4751.07      , 10577.087     , 19798.05455   , 10156.7832    ,
       21677.28345   , 43578.9394    , 13747.87235   ,  2196.4732    ,
       11674.13      ,  7954.517     , 21232.18226   , 27941.28758   ,
        7624.63      , 10825.2537    , 18963.17192   , 17878.90068   ,
        2150.469     , 32108.66282   ,  2741.948     , 18310.742     ,
        5261.46945   ,  8988.15875   , 27000.98473   , 16232.847     ,
        6358.77645   , 10796.35025   ,  7537.1639    ,  1708.92575   ,
      

In [10]:
r2_lr = r2_score(y_test, y_pred)
print(r2_lr)

0.5990868652569483
