In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df = pd.read_csv("medical_cost.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.shape

(1338, 7)

In [4]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [9]:
d = df.select_dtypes(include=['object'])
d.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [13]:
ohe = OneHotEncoder()
encoded_label = ohe.fit_transform(d).toarray()

In [14]:
encoded_label

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [17]:
df_encoded = pd.DataFrame(encoded_label)

In [18]:
df.drop(columns=d.columns,inplace=True)

In [19]:
df1 = pd.concat([df_encoded,df],ignore_index=True, axis=1)

In [21]:
df1.shape

(1338, 12)

In [26]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19,27.9,0,16884.924
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18,33.77,1,1725.5523
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28,33.0,3,4449.462
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,33,22.705,0,21984.47061
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,32,28.88,0,3866.8552


In [27]:
X = df1.iloc[:,:-1]
y = df1.iloc[:,-1]

In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1, random_state=42)

In [30]:
lm = LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

In [31]:
lm.coef_

array([   132.69566935,   -132.69566935, -11965.00470289,  11965.00470289,
          499.12914183,    206.61806452,   -337.05703406,   -368.69017229,
          259.23260146,    327.3528809 ,    443.28426899])

In [32]:
lm.intercept_

-277.1558755358783

In [34]:
y_pred = lm.predict(X_test)

print(f"r2 score is {r2_score(y_test,y_pred)}")
print(f"mean_squared_error is {mean_squared_error(y_test,y_pred)}")

r2 score is 0.7660412422965013
mean_squared_error is 32275850.350935332
