In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
import joblib
from sklearn.metrics import mean_squared_error, r2_score

# Show every column when a DataFrame is displayed (no column truncation).
# Uses the public `pd.set_option` entry point rather than the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [2]:
from sklearn.ensemble import RandomForestRegressor
from scipy import stats

In [3]:
# Read the test split from ../raw/test.csv (path is relative to the notebook's working directory).
data = pd.read_csv("../raw/test.csv")

In [4]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,index,PatientID,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,574,575,53.0,female,27.7,88,Yes,0,No,northwest,8026.67
1,661,662,43.0,female,28.9,84,Yes,1,No,northwest,9249.5
2,458,459,34.0,female,40.6,98,No,1,No,northwest,6373.56
3,1023,1024,51.0,female,41.3,98,No,0,No,northeast,17878.9
4,958,959,28.0,female,25.1,103,No,0,No,northwest,14254.61


In [5]:
# Inspect dtypes to see which columns are categorical (object) and which are numeric.
data.dtypes

index              int64
PatientID          int64
age              float64
gender            object
bmi              float64
bloodpressure      int64
diabetic          object
children           int64
smoker            object
region            object
claim            float64
dtype: object

In [6]:
# Remove the 'index' and 'PatientID' columns before modelling.
# The frame is rebound to the result rather than mutated in place.
data = data.drop(columns=['index', 'PatientID'])

In [7]:
# Number of rows in the test set.
len(data)

268

The following preprocessing steps need to be applied:

- Missing-value imputation
- k-1 one-hot encoding
- Scaling

## Missing values

### Categorical values

In [8]:
# Impute missing regions with 'southeast' (presumably the training-set mode — TODO confirm).
# The result is assigned back to the column: a chained `data[col].fillna(..., inplace=True)`
# acts on an intermediate Series and does not update `data` under pandas Copy-on-Write.
data['region'] = data['region'].fillna('southeast')

### Numerical values

In [9]:
# Impute missing ages with 38.0 (presumably the training-set median/mean — TODO confirm).
# The result is assigned back to the column: a chained `data[col].fillna(..., inplace=True)`
# acts on an intermediate Series and does not update `data` under pandas Copy-on-Write.
data['age'] = data['age'].fillna(38.0)

In [10]:
## Feature transformation

In [11]:
# NOTE(review): the Yeo-Johnson transform of 'bloodpressure' is disabled here. Confirm that the
# saved scaler and model were fitted without it; if so, this cell can be removed.
# data['bloodpressure'] = stats.yeojohnson(data['bloodpressure'], lmbda=-3.3474597314221075)

## k-1 one-hot encoding

In [12]:
# Categorical (object-dtype) columns that will be one-hot encoded.
cat_vars = ['gender', 'diabetic', 'smoker', 'region']

In [13]:
# k-1 one-hot encoding: build the dummy columns for every categorical variable
# (first level dropped, column names prefixed with the variable name) and append
# them to the frame with a single concat.
# NOTE(review): encoding the test set on its own assumes it contains the same category
# levels as the training data, otherwise the columns will not match the scaler — confirm.
data = pd.concat(
    [data] + [pd.get_dummies(data[col], drop_first=True, prefix=col) for col in cat_vars],
    axis=1,
)

In [14]:
# The original categorical columns are no longer needed once their dummy columns exist.
data = data.drop(columns=cat_vars)

In [15]:
# Check the frame after encoding: only numeric and dummy columns should remain.
data.head()

Unnamed: 0,age,bmi,bloodpressure,children,claim,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
0,53.0,27.7,88,0,8026.67,0,1,0,1,0,0
1,43.0,28.9,84,1,9249.5,0,1,0,1,0,0
2,34.0,40.6,98,1,6373.56,0,0,0,1,0,0
3,51.0,41.3,98,0,17878.9,0,0,0,0,0,0
4,28.0,25.1,103,0,14254.61,0,0,0,1,0,0


In [16]:
# Confirm that no object-dtype columns are left after encoding.
data.dtypes

age                 float64
bmi                 float64
bloodpressure         int64
children              int64
claim               float64
gender_male           uint8
diabetic_Yes          uint8
smoker_Yes            uint8
region_northwest      uint8
region_southeast      uint8
region_southwest      uint8
dtype: object

In [17]:
# Separate the target ('claim') from the feature columns.
x = data.drop(columns=["claim"])
y = data.loc[:, "claim"]

In [18]:
# Preview the target values.
y.head()

0     8026.67
1     9249.50
2     6373.56
3    17878.90
4    14254.61
Name: claim, dtype: float64

In [19]:
# Preview the feature matrix before scaling.
x.head()

Unnamed: 0,age,bmi,bloodpressure,children,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
0,53.0,27.7,88,0,0,1,0,1,0,0
1,43.0,28.9,84,1,0,1,0,1,0,0
2,34.0,40.6,98,1,0,0,0,1,0,0
3,51.0,41.3,98,0,0,0,0,0,0,0
4,28.0,25.1,103,0,0,0,0,1,0,0


## Scaling

In [20]:
# Load the previously saved scaler (presumably a MinMaxScaler fitted on the training data,
# judging by the file name — confirm). joblib.load unpickles the file, so only load
# artifacts from a trusted source.
scaler = joblib.load("../models/minmax_scaler.joblib")

In [21]:
# Scale the features with the pre-fitted scaler.
# The input is rebuilt from `data` (all columns except the target) so that re-running this
# cell never scales values that were already scaled, and the original row index is kept so
# that `x` stays aligned with `y`.
features = data.drop(columns=["claim"])
x = pd.DataFrame(scaler.transform(features), columns=features.columns, index=features.index)

In [22]:
# Preview the scaled feature matrix.
x.head()

Unnamed: 0,age,bmi,bloodpressure,children,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
0,0.833333,0.319672,0.133333,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.595238,0.352459,0.066667,0.2,0.0,1.0,0.0,1.0,0.0,0.0
2,0.380952,0.672131,0.3,0.2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.785714,0.691257,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.238095,0.248634,0.383333,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Prediction

In [23]:
# Load the saved model. joblib.load unpickles the file, so only load artifacts
# from a trusted source.
model = joblib.load("../models/model.joblib")

In [24]:
# Re-display the scaled features as a last check before prediction.
x.head()

Unnamed: 0,age,bmi,bloodpressure,children,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
0,0.833333,0.319672,0.133333,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.595238,0.352459,0.066667,0.2,0.0,1.0,0.0,1.0,0.0,0.0
2,0.380952,0.672131,0.3,0.2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.785714,0.691257,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.238095,0.248634,0.383333,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
# Predict on the scaled test features. The following cells apply np.exp to these values,
# so the model presumably outputs log(claim) — TODO confirm against the training notebook.
pred = model.predict(x)

In [26]:
# Number of predictions (after exponentiating the model output).
len(np.exp(pred))

268

In [27]:
# Number of true target values; should equal the number of predictions.
len(y.values)

268

In [28]:
# Spot-check rows 20-29: exponentiated predictions next to the true claims.
np.exp(pred)[20:30], y[20:30]

(array([ 4573.84938394, 12735.80115635, 37820.02984304,  5613.87171603,
         6420.51467641, 12046.76466518, 40009.41150282,  7806.45256542,
         6235.21505569, 20621.087086  ]),
 20    13770.10
 21    24227.34
 22    39727.61
 23     7147.47
 24     5630.46
 25    20177.67
 26    43254.42
 27    36580.28
 28     7633.72
 29    21978.68
 Name: claim, dtype: float64)

In [30]:
# Evaluate on the test set (the data comes from test.csv, so metrics are labelled "test").
# The model output is exponentiated before scoring (presumably the model was trained on
# log(claim) — TODO confirm), so all metrics are expressed in claim units.
pred_claim = np.exp(pred)

mse_value = mean_squared_error(y, pred_claim)
mse = int(mse_value)
print('test mse: {}'.format(mse))

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False),
# because the `squared` argument is deprecated in recent scikit-learn releases.
rmse = int(np.sqrt(mse_value))
print('test rmse: {}'.format(rmse))

r2 = r2_score(y, pred_claim)
print('test r2: {}'.format(r2))
print()

train mse: 36517171
train rmse: 6042
train r2: 0.734257112684457

