In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error, mean_squared_error

In [13]:
# Read the insurance dataset from the relative data directory and print its
# first five rows as plain text.
df = pd.read_csv(filepath_or_buffer='../data-sets/insurance.csv')
print(df.head(n=5))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [15]:
def map_to_replace(columns, frame=None):
    """Integer-encode text/object columns of a DataFrame in place.

    Every distinct non-missing value of a column is assigned an integer code
    in order of first appearance (0, 1, 2, ...), and the column is replaced
    by those codes. Missing values stay missing and do not consume a code.
    Columns whose dtype is neither ``object`` nor a string dtype are skipped.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to consider for encoding.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used, so
        existing calls of the form ``map_to_replace(cols)`` keep working.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook's global frame.
        frame = df

    for col in columns:
        series = frame[col]
        # Accept legacy ``object`` columns as well as pandas' dedicated string
        # dtype; a bare ``dtype == 'object'`` test would skip the latter.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue

        # Key the mapping by the raw values (not ``str(value)``) so that
        # non-string objects in an object column still find their code
        # instead of silently turning into NaN.
        codes = {
            value: code
            for code, value in enumerate(series.dropna().unique())
        }
        frame[col] = series.map(codes)
            
# Encode the three text columns as integer codes, then preview the frame.
categorical_columns = np.array(['sex', 'smoker', 'region'])
map_to_replace(categorical_columns)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,0,0,16884.924
1,18,1,33.77,1,1,1,1725.5523
2,28,1,33.0,3,1,1,4449.462
3,33,1,22.705,0,1,2,21984.47061
4,32,1,28.88,0,1,2,3866.8552


In [16]:
# Pairwise Pearson correlation between every column of the encoded frame.
df.corr(method='pearson')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
age,1.0,-0.020856,0.109272,0.042469,0.025019,-0.002127,0.299008
sex,-0.020856,1.0,0.046371,0.017163,-0.076185,-0.004588,0.057292
bmi,0.109272,0.046371,1.0,0.012759,-0.00375,-0.157566,0.198341
children,0.042469,0.017163,0.012759,1.0,-0.007673,-0.016569,0.067998
smoker,0.025019,-0.076185,-0.00375,-0.007673,1.0,-0.002181,-0.787251
region,-0.002127,-0.004588,-0.157566,-0.016569,-0.002181,1.0,0.006208
charges,0.299008,0.057292,0.198341,0.067998,-0.787251,0.006208,1.0


In [18]:
# Remove 'sex', 'children' and 'region' -- the three columns with the smallest
# absolute correlation to `charges` in the matrix printed by the previous cell
# (presumably the reason they are dropped).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: a second execution no longer raises KeyError
# for columns that are already gone. NOTE: errors='ignore' also hides typos in
# the list below, so keep the names in sync with the frame.
df = df.drop(columns=['sex', 'children', 'region'], errors='ignore')
df.corr()

Unnamed: 0,age,bmi,smoker,charges
age,1.0,0.109272,0.025019,0.299008
bmi,0.109272,1.0,-0.00375,0.198341
smoker,0.025019,-0.00375,1.0,-0.787251
charges,0.299008,0.198341,-0.787251,1.0


In [20]:
# Separate the target column (`charges`) from the remaining predictor columns
# and print both shapes.
y = df['charges']
X = df.drop(columns='charges')
print(X.shape, y.shape)

(1338, 3) (1338,)


In [21]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Fit a linear regression on the training split and print the learned
# coefficients followed by the intercept.
model = LinearRegression().fit(x_train, y_train)
print(model.coef_, model.intercept_)

[   259.41020462    326.45023215 -23675.37184666] 11967.56951207214


In [29]:
# Predict charges for the held-out test rows.
y_pred = model.predict(X=x_test)

In [30]:
# Score the held-out predictions with mean squared error, root mean squared
# error and R^2, then print them in the order R^2, MSE, RMSE.
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
RMSE = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2Score = r2_score(y_true=y_test, y_pred=y_pred)

print(r2Score, MSE, RMSE)

0.7776932310583374 34512843.8802279 5874.763304187489
