# Predicting Medical Costs by Health Insurance

<b>Imports</b>

In [1]:
%matplotlib inline
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error

<b>Data Analysis</b>

In [2]:
# Location of the insurance dataset. Set the INSURANCE_CSV environment variable
# to point at your own copy; the fallback is the original author's absolute
# path, which is specific to that machine.
INSURANCE_CSV = os.environ.get(
    'INSURANCE_CSV',
    '/Users/juveriashaik/Desktop/DATA ANALYTICS/insurance.csv',
)
df = pd.read_csv(INSURANCE_CSV)
df.head()  # preview the first five rows

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(1338, 7)

In [4]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
# Print each column's dtype and non-null count; the object-dtype columns
# are the ones that get one-hot encoded later in the notebook.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


<b>Encoding the categorical features as numerical values</b>

In [6]:
# Pull out the object-dtype (categorical) columns: sex, smoker and region.
df1 = df.select_dtypes(include = [object])

# Learn the category levels, then expand every level into its own 0/1 column.
# No level is dropped, so each categorical feature contributes one indicator
# per level. The encoder output is densified with .toarray() so it can be
# wrapped in a DataFrame afterwards.
oHE = OneHotEncoder().fit(df1)
encoded = oHE.transform(df1).toarray()

In [7]:
# Now we concatenate the one-hot columns with the remaining numeric columns.
encoded_data = pd.DataFrame(encoded)

# Drop the categorical columns into a new frame instead of mutating `df` in
# place: the in-place drop raised KeyError when this cell was re-run, and it
# left `df` without the object columns that the encoding cell selects.
numeric_data = df.drop(columns = ['sex','smoker','region'])

# ignore_index=True renumbers the columns 0..N-1, so the result has uniform
# integer labels with the target (charges) as the last column.
df1 = pd.concat([encoded_data, numeric_data], axis = 1, ignore_index = True)

In [8]:
# Preview the combined frame: columns 0-7 are the one-hot indicators,
# 8-10 are age, bmi and children, and 11 is charges.
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19,27.9,0,16884.924
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18,33.77,1,1725.5523
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28,33.0,3,4449.462
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,33,22.705,0,21984.47061
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,32,28.88,0,3866.8552


<b>Implementing linear regression</b>

In [9]:
# The last column holds the target (charges); every column before it is a feature.
target_pos = df1.shape[1] - 1
X = df1.iloc[:, :target_pos]
y = df1.iloc[:, target_pos]

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7
)
print(X_train.shape, X_test.shape, sep="\n")


(1003, 11)
(335, 11)


In [10]:
# Fit a linear regression model on the training split.
lr = LinearRegression()
lr.fit(X_train,y_train)


<b>Getting the coefficients and Intercept</b>

In [12]:
# Fitted weights (one per column of X, in column order) followed by the intercept.
print(lr.coef_, lr.intercept_, sep="\n")

[    97.18113355    -97.18113355 -11905.32547314  11905.32547314
    667.24158209     15.86338029   -233.63531263   -449.46964974
    251.90247991    353.38540435    465.2280675 ]
-887.6917181755853


<b>Model Evaluation</b>

In [14]:
# Score the fitted model on the held-out test split.
y_pred = lr.predict(X_test)

# Coefficient of determination (R^2).
print(r2_score(y_test,y_pred))
# Root-mean-squared error, computed as sqrt(MSE): the `squared=False` argument
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
print(np.sqrt(mean_squared_error(y_test,y_pred)))

0.7509741262661104
6080.977967616992
