# Importing the libraries

In [4]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# importing the sklearn library for model training
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [5]:
# Read the insurance dataset from a CSV file located relative to the working directory.
df = pd.read_csv("insurance.csv")

In [6]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


## Preparing the X and Y variable
- X holds the independent variables (features) and Y is the dependent variable (target).

In [7]:
# Feature frame: every column except the target. `columns=` already selects
# the column axis, so no separate `axis` argument is needed.
x = df.drop(columns=["expenses"])

In [8]:
# Confirm the target column is no longer part of the feature frame.
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest


- We have removed the dependent variable

In [9]:
# List the distinct values of each low-cardinality column so we know which
# labels need encoding before modelling.
for column in ("sex", "smoker", "region", "children"):
    print(f"Categories in '{column}' variable:     ", end=" ")
    print(df[column].unique())

Categories in 'sex' variable:      ['female' 'male']
Categories in 'smoker' variable:      ['yes' 'no']
Categories in 'region' variable:      ['southwest' 'southeast' 'northwest' 'northeast']
Categories in 'children' variable:      [0 1 3 2 5 4]


In [10]:
# Target vector: the 'expenses' column that the regression will predict.
y = df["expenses"]

In [11]:
# Display the target series (pandas abbreviates the long output).
y

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1338, dtype: float64

## Data preprocessing 
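- Using manual integer (label) encoding of the categorical columns. Note that this is not one-hot encoding: each category is mapped to a single integer code.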

In [12]:
# Replace each categorical label with an integer code, column by column,
# in a single pass over the frame.
# NOTE(review): the region codes 0-3 impose an arbitrary ordering that a
# linear model treats as a numeric scale; consider pd.get_dummies if that
# is not intended.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3},
}
x = x.replace(category_codes)

In [13]:
# Verify that sex, smoker and region now hold integer codes.
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,0,0
1,18,0,33.8,1,1,1
2,28,0,33.0,3,1,1
3,33,0,22.7,0,1,2
4,32,0,28.9,0,1,2


- We have treated all the categorical features

#### Splitting the data into train and test

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs. train_test_split is imported in the
# notebook's import cell together with the other dependencies.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=2
)

In [15]:
# Sanity-check the split sizes: full feature frame, training part, test part.
print(*(frame.shape for frame in (x, X_train, X_test)))

(1338, 6) (1070, 6) (268, 6)


#### Now we will be using the linear regression model

In [16]:
# Linear regression estimator created with scikit-learn's default settings.
Linear = LinearRegression()

In [17]:
# Fit on the training split only; the test split is kept aside for evaluation.
Linear.fit(X_train,Y_train)

#### Model Evaluation
1. Prediction on the training data

In [18]:
# Predict on the rows the model was fitted on, to measure in-sample fit.
training_data_prediction = Linear.predict(X_train)

2. Checking the metrics on train data 

In [19]:
# Score the training predictions: R^2, mean absolute error and mean squared
# error, each computed against the training targets.
r2_train = r2_score(Y_train, training_data_prediction)
MAE = mean_absolute_error(Y_train, training_data_prediction)
MSE = mean_squared_error(Y_train, training_data_prediction)
for label, value in (('r2 score ', r2_train), ('MAE ', MAE), ('MSE ', MSE)):
    print(label, value)

r2 score  0.7520097757615719
MAE  4139.766847951047
MSE  36101588.78531806


3. Prediction on the test data

In [20]:
# Predict on the held-out rows, which the model did not see during fitting.
test_data_prediction = Linear.predict(X_test)

4. Checking the metrics on test data

In [21]:
# Score the held-out predictions with the same three metrics.
# Note: MAE and MSE are rebound here, so after this cell they hold the
# test-set values rather than the training-set ones.
r2_test = r2_score(Y_test, test_data_prediction)
MAE = mean_absolute_error(Y_test, test_data_prediction)
MSE = mean_squared_error(Y_test, test_data_prediction)
for label, value in (('r2 score ', r2_test), ('MAE ', MAE), ('MSE ', MSE)):
    print(label, value)

r2 score  0.7445469182584412
MAE  4285.60426571056
MSE  38364137.44093548
