The purposes of this exercise to look into different features to observe their relationship, and plot a multiple linear regression based on several features of individual such as age, physical/family condition and location against their existing medical expense to be used for predicting future medical expenses of individuals that help medical insurance to make decision on charging the premium.

## Problem statement: Predicting future medical expenses of individuals based on the given dataset to help medical insurance to make decision on charging the premium.   

In [123]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
 


In [124]:
data=pd.read_csv("insurance.csv")

In [125]:
data.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
5,31,female,25.7,0,no,southeast,3756.62
6,46,female,33.4,1,no,southeast,8240.59
7,37,female,27.7,3,no,northwest,7281.51
8,37,male,29.8,2,no,northeast,6406.41
9,60,female,25.8,0,no,northwest,28923.14


In [126]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [127]:
data.duplicated().sum()

1

In [128]:
data.sex.replace({'male':0,'female':1},inplace=True)

In [129]:
data.smoker.replace({'yes':1,'no':0},inplace=True)

In [130]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
sex,1338.0,0.494768,0.50016,0.0,0.0,0.0,1.0,1.0
bmi,1338.0,30.665471,6.098382,16.0,26.3,30.4,34.7,53.1
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
smoker,1338.0,0.204783,0.403694,0.0,0.0,0.0,0.0,1.0
expenses,1338.0,13270.422414,12110.01124,1121.87,4740.2875,9382.03,16639.915,63770.43


In [131]:
data.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [132]:
data.region.replace({'southwest':1,'southeast':2,'northwest':3,'northeast':4},inplace=True)

In [133]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,1,27.9,0,1,1,16884.92
1,18,0,33.8,1,0,2,1725.55
2,28,0,33.0,3,0,2,4449.46
3,33,0,22.7,0,0,3,21984.47
4,32,0,28.9,0,0,3,3866.86
...,...,...,...,...,...,...,...
1333,50,0,31.0,3,0,3,10600.55
1334,18,1,31.9,0,0,4,2205.98
1335,18,1,36.9,0,0,2,1629.83
1336,21,1,25.8,0,0,1,2007.95


In [134]:
data.corr()['expenses']

age         0.299008
sex        -0.057292
bmi         0.198576
children    0.067998
smoker      0.787251
region      0.006208
expenses    1.000000
Name: expenses, dtype: float64

In [135]:
data.drop(columns=['sex','region'],inplace=True)

In [136]:
data

Unnamed: 0,age,bmi,children,smoker,expenses
0,19,27.9,0,1,16884.92
1,18,33.8,1,0,1725.55
2,28,33.0,3,0,4449.46
3,33,22.7,0,0,21984.47
4,32,28.9,0,0,3866.86
...,...,...,...,...,...
1333,50,31.0,3,0,10600.55
1334,18,31.9,0,0,2205.98
1335,18,36.9,0,0,1629.83
1336,21,25.8,0,0,2007.95


In [137]:

#Script to find the outliers
for col_name in data.select_dtypes(include=np.number).columns[:-1]:
    print(col_name)
    q1 = data[col_name].quantile(0.25)
    q3 = data[col_name].quantile(0.75)
    iqr = q3 - q1
    
    low = q1-1.5*iqr
    high = q3+1.5*iqr
    print("Outliers = ",data.loc[(data[col_name] < low) | (data[col_name] > high), col_name])

age
Outliers =  Series([], Name: age, dtype: int64)
bmi
Outliers =  116     49.1
286     48.1
401     47.5
543     47.4
847     50.4
860     47.6
1047    52.6
1088    47.7
1317    53.1
Name: bmi, dtype: float64
children
Outliers =  Series([], Name: children, dtype: int64)
smoker
Outliers =  0       1
11      1
14      1
19      1
23      1
       ..
1313    1
1314    1
1321    1
1323    1
1337    1
Name: smoker, Length: 274, dtype: int64


In [138]:
y=data['expenses']
type(y)

pandas.core.series.Series

In [139]:
data


Unnamed: 0,age,bmi,children,smoker,expenses
0,19,27.9,0,1,16884.92
1,18,33.8,1,0,1725.55
2,28,33.0,3,0,4449.46
3,33,22.7,0,0,21984.47
4,32,28.9,0,0,3866.86
...,...,...,...,...,...
1333,50,31.0,3,0,10600.55
1334,18,31.9,0,0,2205.98
1335,18,36.9,0,0,1629.83
1336,21,25.8,0,0,2007.95


In [152]:
data_x=data.copy()
x=data_x.drop(columns='expenses')

In [153]:
scale=StandardScaler()

In [154]:
data=pd.DataFrame(scale.fit_transform(data),columns=data.columns)

In [155]:
y=data['expenses']

In [156]:
x_col=data.drop(columns='expenses').columns

In [158]:
x=data[x_col]

In [164]:
def linear_best_fit(X,y):
    model = LinearRegression()    
    model.fit(X,y)
    y_pred = model.predict(X)
   #error metrics 
    print("MSE : ",mean_squared_error(y,y_pred))
    print("MAE : ",mean_absolute_error(y,y_pred))
    print("RMSE : ",np.sqrt(mean_squared_error(y,y_pred)))
    print("MAPE : ",mean_absolute_percentage_error(y,y_pred))
    print("r2_score: ",r2_score(y,y_pred))
    #n, p = X_train.shape[0], X_train.shape[1]
    #adjr2 = 1-(((1-r2)*(n-1))/(n-p-1))

def mean_absolute_percentage_error(y, y_pred): 
    y, y_pred = np.array(y), np.array(y_pred)
    return np.mean(np.abs((y - y_pred) / (y+0.01))) * 100



In [165]:
linear_best_fit(x,y)

MSE :  0.25029015374295766
MAE :  0.3451687375280481
RMSE :  0.5002900696025833
MAPE :  189.64386086942622
r2_score:  0.7497098462570424


In [None]:
r2_score(y,y_pred)