# Multiple linear regression

Predicting insurance charges from age, sex, BMI, number of children, smoker status, and region.

## Importing the libraries

In [39]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
warnings.simplefilter("ignore")
import matplotlib.pyplot as plt
%matplotlib inline

## Importing the dataset

In [41]:
# Read the insurance records; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="insurance.csv")

In [42]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Changing the type from object to category

In [5]:
# Inspect the column dtypes before any conversion.
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [6]:
# Store the three text columns as pandas categoricals instead of generic objects.
categorical_cols = ["sex", "smoker", "region"]
data[categorical_cols] = data[categorical_cols].astype("category")

In [7]:
# Re-check the dtypes to confirm sex, smoker and region are now `category`.
data.dtypes

age            int64
sex         category
bmi          float64
children       int64
smoker      category
region      category
charges      float64
dtype: object

## Label-encoding the categorical columns

In [44]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. A separate encoder is
# fitted per column and kept in `encoders`, so the original labels can be
# recovered later with `encoders[col].inverse_transform(...)` (a single shared
# encoder would only remember the last column it was fitted on).
# NOTE(review): `region` has more than two levels, so its integer codes impose
# an arbitrary ordering on the linear model fitted later in this notebook;
# consider one-hot encoding it instead.
encoders = {}
for col in ["sex", "smoker", "region"]:
    label = LabelEncoder()
    data[col] = label.fit_transform(data[col])
    encoders[col] = label

# Show the dtypes so the switch to integer codes is visible.
data.dtypes

age           int64
sex           int32
bmi         float64
children      int64
smoker        int32
region        int32
charges     float64
dtype: object

## Separating the features and the target

In [9]:
# Feature matrix: every column except the target, `charges`.
x = data.drop("charges", axis=1)

In [10]:
# Display the feature matrix (pandas truncates the rendering for long frames).
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,3
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,3


In [11]:
# Target vector: the `charges` column as a plain NumPy array.
y = data["charges"].values

In [12]:
# Display the target array.
y

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

## Importing the model

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Splitting the data into training and testing

In [59]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=22,
)

In [60]:
# (rows, feature columns) of the training features.
x_train.shape

(1070, 6)

In [61]:
# (rows, feature columns) of the test features.
x_test.shape

(268, 6)

In [62]:
# Number of training targets; should match the row count of x_train.
y_train.shape

(1070,)

In [63]:
# Number of test targets; should match the row count of x_test.
y_test.shape

(268,)

## Fitting the model on the training data

In [64]:
# Create the scikit-learn linear regression estimator with its default settings.
lm = LinearRegression()

In [20]:
# Estimate the regression coefficients from the training split only.
lm.fit(X=x_train, y=y_train)

LinearRegression()

## Predicting the charges

In [21]:
# Predict charges for the test features.
y_pred = lm.predict(X=x_test)

In [22]:
# Build the comparison table from the test-set features.
# Selecting with a list of labels raises KeyError on a misspelled name, whereas
# re-wrapping the frame in pd.DataFrame(..., columns=...) would silently create
# an all-NaN column. The .copy() gives `check` its own data, so columns added
# to it later never touch `x_test`.
# `region` is not included in this table.
check = x_test[["age", "sex", "bmi", "children", "smoker"]].copy()
check

Unnamed: 0,age,sex,bmi,children,smoker
578,52,1,30.200,1,0
610,47,0,29.370,1,0
569,48,1,40.565,2,1
1034,61,1,38.380,0,0
198,51,0,18.050,0,0
...,...,...,...,...,...
1084,62,0,30.495,2,0
726,41,1,28.405,1,0
1132,57,1,40.280,0,0
725,30,0,39.050,3,1


In [23]:
# Attach the true charges for the test rows. y_test is a NumPy array, so the
# values are assigned by position rather than aligned on the index.
check["Charges_Actual"] = y_test
check

Unnamed: 0,age,sex,bmi,children,smoker,Charges_Actual
578,52,1,30.200,1,0,9724.53000
610,47,0,29.370,1,0,8547.69130
569,48,1,40.565,2,1,45702.02235
1034,61,1,38.380,0,0,12950.07120
198,51,0,18.050,0,0,9644.25250
...,...,...,...,...,...,...
1084,62,0,30.495,2,0,15019.76005
726,41,1,28.405,1,0,6664.68595
1132,57,1,40.280,0,0,20709.02034
725,30,0,39.050,3,1,40932.42950


In [24]:
# Add the model's predictions next to the actual values for a row-by-row comparison.
check["Charges_Predicted"] = y_pred
check

Unnamed: 0,age,sex,bmi,children,smoker,Charges_Actual,Charges_Predicted
578,52,1,30.200,1,0,9724.53000,11017.230479
610,47,0,29.370,1,0,8547.69130,9787.530168
569,48,1,40.565,2,1,45702.02235,37994.782118
1034,61,1,38.380,0,0,12950.07120,16122.778407
198,51,0,18.050,0,0,9644.25250,6930.759230
...,...,...,...,...,...,...,...
1084,62,0,30.495,2,0,15019.76005,14699.098767
726,41,1,28.405,1,0,6664.68595,8210.854549
1132,57,1,40.280,0,0,20709.02034,16019.284542
725,30,0,39.050,3,1,40932.42950,33104.944084


## Evaluating the model (R² score)

In [25]:
# For a regressor, score() reports the coefficient of determination (R²) on the test split.
lm.score(X=x_test, y=y_test)

0.7998747145449959