In [425]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [426]:
# Execute the companion notebook inside this kernel. The `LinearRegression` and
# `PolynomialRegression` classes used further down are not imported in the import
# cell, so they are presumably defined by this notebook — TODO confirm.
%run LinearRegression.ipynb

In [427]:
# Load the raw insurance table (path is relative to the working directory)
# and preview the first rows.
data = pd.read_csv('data/insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [428]:
# Inspect dtypes and non-null counts per column to spot missing values
# and the object (string) columns that need encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [429]:
# Encode the two binary categorical columns as 0/1 integers.
#
# `Series.map` is used instead of `Series.replace`: replacing strings with ints
# in an object column only yields an integer dtype through pandas' implicit
# downcasting, which recent pandas versions deprecate. `map` builds the integer
# column directly. `DataFrame.assign` returns a new frame, so `data` itself is
# left untouched (same as the previous `data.copy()`).
data_replaced = data.assign(
    sex=data['sex'].map({'male': 1, 'female': 0}),
    smoker=data['smoker'].map({'yes': 1, 'no': 0}),
)

# `map` turns any label missing from the dictionaries into NaN, so fail loudly
# here rather than feeding NaNs into the models further down.
assert data_replaced[['sex', 'smoker']].notna().all().all(), \
    "unexpected category in 'sex' or 'smoker'"

In [430]:
# One-hot encode `region`. No level is dropped, so every region gets its own
# indicator column.
# NOTE(review): keeping all levels makes the indicators sum to 1 on every row,
# which is collinear with an intercept term if the model fits one — confirm
# whether one level should be dropped.
data_got_dummies = pd.get_dummies(data_replaced, columns=['region'])

In [431]:
# Show the encoded frame to verify the new region indicator columns.
data_got_dummies

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,False,False,False,True
1,18,1,33.770,1,0,1725.55230,False,False,True,False
2,28,1,33.000,3,0,4449.46200,False,False,True,False
3,33,1,22.705,0,0,21984.47061,False,True,False,False
4,32,1,28.880,0,0,3866.85520,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,False,True,False,False
1334,18,0,31.920,0,0,2205.98080,True,False,False,False
1335,18,0,36.850,0,0,1629.83350,False,False,True,False
1336,21,0,25.800,0,0,2007.94500,False,False,False,True


In [432]:
# Rescale `age` and `bmi` with scikit-learn's Normalizer.
#
# NOTE(review): Normalizer works per ROW, not per column. Each (age, bmi) pair
# is divided by its own L2 norm, so the two outputs satisfy age² + bmi² = 1 and
# only encode the age/bmi ratio. If per-feature scaling was intended, a
# column-wise scaler is needed instead — confirm the intent.
#
# `columns` is a flat list: a nested list (`[['age', 'bmi']]`) would build a
# MultiIndex header instead of plain labels. The index is taken from the source
# frame so the rows stay aligned with `data_got_dummies`.
normaliezed_columns = pd.DataFrame(
    Normalizer().fit_transform(data_got_dummies[['age', 'bmi']]),
    columns=['age', 'bmi'],
    index=data_got_dummies.index,
)
normaliezed_columns

Unnamed: 0,age,bmi
0,0.562877,0.826541
1,0.470371,0.882469
2,0.646977,0.762509
3,0.823838,0.566825
4,0.742371,0.669989
...,...,...
1333,0.850131,0.526571
1334,0.491194,0.871050
1335,0.438904,0.898534
1336,0.631271,0.775562


In [433]:
# Start from an independent copy of the encoded frame, then overwrite the raw
# `age` / `bmi` columns with their rescaled values. The values are written
# positionally (first column -> age, second -> bmi).
final_data = data_got_dummies.copy()
final_data[['age', 'bmi']] = normaliezed_columns.to_numpy()

In [434]:
# Build the feature matrix: everything except the target, with the boolean
# region indicators cast to 0/1 integers.
#
# An explicit `astype` on the bool columns replaces the earlier frame-wide
# `replace({True: 1, False: 0})`. It touches only the columns that need
# converting and does not depend on `replace` inferring an integer result dtype.
dummy_columns = final_data.select_dtypes(include='bool').columns
features = (
    final_data
    .drop(columns=['charges'])
    .astype({column: 'int64' for column in dummy_columns})
)

In [435]:
# Show the final feature matrix; every column should be numeric at this point.
features

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,0.562877,0,0.826541,0,1,0,0,0,1
1,0.470371,1,0.882469,1,0,0,0,1,0
2,0.646977,1,0.762509,3,0,0,0,1,0
3,0.823838,1,0.566825,0,0,0,1,0,0
4,0.742371,1,0.669989,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,0.850131,1,0.526571,3,0,0,1,0,0
1334,0.491194,0,0.871050,0,0,1,0,0,0
1335,0.438904,0,0.898534,0,0,0,0,1,0
1336,0.631271,0,0.775562,0,0,0,0,0,1


In [436]:
# Regression target: the insurance charges column (left unscaled).
target = final_data['charges']

## Train Test Split

In [437]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

In [438]:
# Fresh, unfitted linear model.
# NOTE(review): `LinearRegression` is not imported from scikit-learn in the import
# cell, so this is presumably the class defined by `%run LinearRegression.ipynb`
# — confirm.
lr = LinearRegression()

In [439]:
# Fit on the training split. The pandas objects are passed as-is here, whereas
# the polynomial model later in this notebook is given `.values` arrays.
lr.fit(x_train, y_train)

In [440]:
# Predict on the held-out rows and report R² for the linear model.
y_predicted = lr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_predicted)

0.6409104441389879

# Polynomial regression (degree 2)

In [441]:
# Re-executes the same notebook that was already run near the top of this file.
# NOTE(review): on a clean "Restart & Run All" this repeat looks redundant unless
# LinearRegression.ipynb was edited in between — confirm it is still needed.
%run LinearRegression.ipynb



In [442]:
# Fresh degree-2 polynomial model.
# NOTE(review): `PolynomialRegression` is not imported anywhere above; presumably
# it is defined by `%run LinearRegression.ipynb` — confirm.
plr = PolynomialRegression(degree=2)

In [443]:
# Fit the polynomial model on plain NumPy arrays rather than pandas objects.
plr.fit(x_train.to_numpy(), y_train.to_numpy())

x shape is (936, 9)
(936, 1)
(936, 9)
(936, 9)


In [444]:
# Predict charges for the held-out rows with the polynomial model.
poly_predicted = plr.predict(x_test.to_numpy())

x shape is (402, 9)
(402, 1)
(402, 9)
(402, 9)


In [413]:
# R² of the polynomial model on the 30% hold-out split.
r2_score(y_true=y_test.to_numpy(), y_pred=poly_predicted)

0.6963273719157055

# Test size: 20%

In [446]:
# Repeat the experiment with a 20% hold-out split (same seed). This overwrites
# the 30% split variables defined earlier.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [447]:
# New unfitted linear model for the 20% split; rebinding `lr` discards the
# model fitted on the 30% split.
lr = LinearRegression()

In [448]:
# Fit the linear model on the training rows of the current (20% hold-out) split.
lr.fit(x_train, y_train)

In [449]:
# Predict on the 20% hold-out rows and report R² for the linear model.
y_predicted = lr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_predicted)

0.6439064415746245

# Polynomial regression (degree 2)

In [450]:
# Re-executes the same notebook that was already run earlier in this file.
# NOTE(review): on a clean "Restart & Run All" this repeat looks redundant unless
# LinearRegression.ipynb was edited in between — confirm it is still needed.
%run LinearRegression.ipynb



In [451]:
# New degree-2 polynomial model for the 20% split; rebinding `plr` discards the
# model fitted on the 30% split.
plr = PolynomialRegression(degree=2)

In [452]:
# Fit the polynomial model on plain NumPy arrays from the current split.
plr.fit(x_train.to_numpy(), y_train.to_numpy())

x shape is (1070, 9)
(1070, 1)
(1070, 9)
(1070, 9)


In [453]:
# Predict charges for the 20% hold-out rows with the polynomial model.
poly_predicted = plr.predict(x_test.to_numpy())

x shape is (268, 9)
(268, 1)
(268, 9)
(268, 9)


In [454]:
# R² of the polynomial model on the 20% hold-out split.
r2_score(y_true=y_test.to_numpy(), y_pred=poly_predicted)

0.6967723644847328