In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('archive/insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
## Changing the values of the smoker column from the yes or no to 1 or 0
df['smoker'] = df['smoker'].replace({'yes': 1, 'no': 0})

  df['smoker'] = df['smoker'].replace({'yes': 1, 'no': 0})


In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


In [6]:
x = df.iloc[ : , : -1].values
y = df.iloc[ : , -1].values

## Checking if there is missing value or not

In [7]:
missing_values = df.isnull().sum()
print("Missing values in the dataset:")
print(missing_values)

Missing values in the dataset:
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


## Classifying Data

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers =[('encoder',OneHotEncoder(),[1,5])],remainder='passthrough')
x = np.array(ct.fit_transform(x))

In [9]:
testX = pd.DataFrame(x)

In [10]:
testX.head(10) ## first 2 column for the age next 2 for the smoker next 4 for the region 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.0,0.0,0.0,0.0,1.0,19,27.9,0,1
1,0.0,1.0,0.0,0.0,1.0,0.0,18,33.77,1,0
2,0.0,1.0,0.0,0.0,1.0,0.0,28,33.0,3,0
3,0.0,1.0,0.0,1.0,0.0,0.0,33,22.705,0,0
4,0.0,1.0,0.0,1.0,0.0,0.0,32,28.88,0,0
5,1.0,0.0,0.0,0.0,1.0,0.0,31,25.74,0,0
6,1.0,0.0,0.0,0.0,1.0,0.0,46,33.44,1,0
7,1.0,0.0,0.0,1.0,0.0,0.0,37,27.74,3,0
8,0.0,1.0,1.0,0.0,0.0,0.0,37,29.83,2,0
9,1.0,0.0,0.0,1.0,0.0,0.0,60,25.84,0,0


## Training 

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 2 )

In [12]:
print("Trainig Data..")
print("The shape of training features: ", x_train.shape)
print("The shape of training labels: ", y_train.shape)

Trainig Data..
The shape of training features:  (1003, 10)
The shape of training labels:  (1003,)


In [13]:
print("Testing Data..")
print("The shape of testing features: ", x_test.shape)
print("The shape of testing labels: ", y_test.shape)

Testing Data..
The shape of testing features:  (335, 10)
The shape of testing labels:  (335,)


## Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [15]:
training_acc = np.ceil(regressor.score(x_train, y_train) * 100)
print("The Training Accuracy = ", training_acc, "%")

The Training Accuracy =  76.0 %


In [16]:
y_pred = regressor.predict(x_test)

In [17]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 1.98e+03  2.40e+03]
 [ 1.21e+04  1.13e+04]
 [ 1.05e+04  9.62e+03]
 [ 2.36e+03  1.72e+03]
 [ 8.34e+03  8.89e+03]
 [ 1.13e+04  5.66e+03]
 [ 3.39e+03  1.26e+04]
 [ 1.36e+03  2.20e+03]
 [ 1.21e+04  1.43e+04]
 [ 9.63e+03  7.21e+03]
 [ 1.15e+04  1.22e+04]
 [ 5.21e+03  2.22e+03]
 [ 3.00e+04  1.94e+04]
 [-3.03e+02  1.12e+03]
 [ 1.30e+04  2.64e+04]
 [ 1.32e+04  1.29e+04]
 [ 4.25e+03  3.65e+03]
 [ 7.94e+03  5.33e+03]
 [ 2.93e+04  1.90e+04]
 [ 3.04e+03  2.20e+03]
 [ 1.24e+04  1.18e+04]
 [ 2.13e+03  2.60e+03]
 [ 3.40e+04  4.02e+04]
 [ 3.22e+04  2.23e+04]
 [ 3.03e+04  3.75e+04]
 [ 8.53e+03  1.06e+04]
 [ 2.55e+03  3.21e+03]
 [ 1.53e+04  8.94e+03]
 [ 6.41e+03  3.58e+03]
 [ 2.09e+03  3.18e+03]
 [ 9.69e+03  5.40e+03]
 [ 5.83e+03  3.96e+03]
 [ 4.27e+03  4.93e+03]
 [ 4.84e+03  4.24e+03]
 [ 9.70e+03  1.45e+04]
 [ 4.89e+03  1.63e+03]
 [ 3.00e+04  3.88e+04]
 [ 6.40e+03  2.04e+04]
 [ 2.75e+04  3.51e+04]
 [ 1.39e+04  2.70e+04]
 [ 3.29e+02  2.12e+03]
 [ 2.79e+04  3.45e+04]
 [ 7.81e+03  7.42e+03]
 [ 1.32e+03

In [18]:
testing_acc = np.ceil(regressor.score(x_test, y_test) * 100)
print("The Testing_acc Accuracy = ", testing_acc, "%")

The Testing_acc Accuracy =  75.0 %


In [19]:
print(regressor.coef_)
print(regressor.intercept_)

[  103.9   -103.9    569.61   147.83  -261.04  -456.4    247.37   333.41
   609.75 23746.35]
-12128.986167337522
