In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

pd.pandas.set_option("display.max_columns", None)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
import joblib

In [4]:
data = pd.read_csv("../raw/insurance_data.csv")

In [5]:
data.head()

Unnamed: 0,index,PatientID,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,0,1,39.0,male,23.2,91,Yes,0,No,southeast,1121.87
1,1,2,24.0,male,30.1,87,No,0,No,southeast,1131.51
2,2,3,,male,33.3,82,Yes,0,No,southeast,1135.94
3,3,4,,male,33.7,80,No,0,No,northwest,1136.4
4,4,5,,male,34.1,100,No,0,No,northwest,1137.01


In [6]:
data.shape

(1340, 11)

In [7]:
train, test = train_test_split(data,test_size=0.2,random_state=0)

In [8]:
train.head()

Unnamed: 0,index,PatientID,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
362,362,363,40.0,female,27.5,98,Yes,1,No,southwest,5003.85
483,483,484,48.0,male,42.4,92,Yes,5,No,southwest,6666.24
866,866,867,59.0,female,33.5,97,No,2,No,northwest,12269.69
625,625,626,26.0,male,27.4,92,No,3,No,northeast,8606.22
194,194,195,57.0,female,26.6,83,No,0,No,northeast,3046.06


In [9]:
test.head()

Unnamed: 0,index,PatientID,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
574,574,575,53.0,female,27.7,88,Yes,0,No,northwest,8026.67
661,661,662,43.0,female,28.9,84,Yes,1,No,northwest,9249.5
458,458,459,34.0,female,40.6,98,No,1,No,northwest,6373.56
1023,1023,1024,51.0,female,41.3,98,No,0,No,northeast,17878.9
958,958,959,28.0,female,25.1,103,No,0,No,northwest,14254.61


In [10]:
train.to_csv('../raw/train.csv', index=False)
test.to_csv('../raw/test.csv', index=False)

In [11]:
train.drop(['index', 'PatientID'], axis=1, inplace=True)
test.drop(['index', 'PatientID'], axis=1, inplace=True)

In [12]:
train.head()

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
362,40.0,female,27.5,98,Yes,1,No,southwest,5003.85
483,48.0,male,42.4,92,Yes,5,No,southwest,6666.24
866,59.0,female,33.5,97,No,2,No,northwest,12269.69
625,26.0,male,27.4,92,No,3,No,northeast,8606.22
194,57.0,female,26.6,83,No,0,No,northeast,3046.06


In [13]:
y_train = train["claim"]
X_train = train.drop("claim", axis=1)

In [14]:
y_test = test["claim"]
X_test = test.drop("claim", axis=1)

In [15]:
X_train.shape, X_test.shape

((1072, 8), (268, 8))

# Feature Engineering

1. Target
2. Missing value
3. Non-gaussian distributed variables
4. k-1 one hot encoding
5. Put the variables in a similar scale

In [16]:
cat_vars = ['gender', 'diabetic', 'smoker', 'region']
num_vars = ['age', 'bmi', 'bloodpressure', 'children']

## Target

In [17]:
y_train = np.log(y_train)
y_test = np.log(y_test)

## Missing value

The column which is categorical and has na value is **region** and the one which is numerical and has na value is **age**

In [18]:
cat_na = ['region']
num_na = ['age']

In [19]:
cat_na, num_na

(['region'], ['age'])

In [20]:
X_train[cat_na].isnull().sum() / len(data) * 100

region    0.149254
dtype: float64

In [21]:
X_train[num_na].isnull().sum() / len(data) * 100

age    0.298507
dtype: float64

Since the amount of the missing value is under 1% of the total value, so let's replace these missing value with  most frequent value for categorical column and numerical value

### categorical values

In [22]:
for var in cat_na:
    mode = X_train[var].mode()[0]
    print(var, mode)
    
    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)

region southeast


In [23]:
X_train[cat_vars].isnull().sum()

gender      0
diabetic    0
smoker      0
region      0
dtype: int64

### numerical values

In [24]:
for var in num_na:
    mean = round(X_train[var].mean())
    print(var, mean)
    
    X_train[var].fillna(mean, inplace=True)
    X_test[var].fillna(mean, inplace=True)
    

age 38


In [25]:
X_train[num_vars].isnull().sum()

age              0
bmi              0
bloodpressure    0
children         0
dtype: int64

## Non-gaussian distributed values

As we can see from the EDA is that only blood pressure feature can be improved by using yeo johnson feature transformation

In [26]:
# transform_feat = ['bloodpressure']

In [27]:
# for var in transform_feat:
    
#     X_train[var], param = stats.yeojohnson(X_train[var])
    
#     X_test[var] = stats.yeojohnson(X_test[var], lmbda=param)
    
#     print(param)

## k-1 one hot encoding

In [28]:
cat_vars

['gender', 'diabetic', 'smoker', 'region']

In [29]:
for var in cat_vars:
    
    X_train = pd.concat([X_train, pd.get_dummies(X_train[var],
                                                drop_first=True, prefix=var)], axis=1)
    X_test = pd.concat([X_test, pd.get_dummies(X_test[var],
                                                drop_first=True, prefix=var)], axis=1)

In [30]:
X_train.drop(labels=cat_vars, axis=1, inplace=True)
X_test.drop(labels=cat_vars, axis=1, inplace=True)

In [31]:
X_train.isnull().sum()

age                 0
bmi                 0
bloodpressure       0
children            0
gender_male         0
diabetic_Yes        0
smoker_Yes          0
region_northwest    0
region_southeast    0
region_southwest    0
dtype: int64

In [32]:
X_test.isnull().sum()

age                 0
bmi                 0
bloodpressure       0
children            0
gender_male         0
diabetic_Yes        0
smoker_Yes          0
region_northwest    0
region_southeast    0
region_southwest    0
dtype: int64

In [33]:
X_train.head()

Unnamed: 0,age,bmi,bloodpressure,children,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
362,40.0,27.5,98,1,0,1,0,0,0,1
483,48.0,42.4,92,5,1,1,0,0,0,1
866,59.0,33.5,97,2,0,0,0,1,0,0
625,26.0,27.4,92,3,1,0,0,0,0,0
194,57.0,26.6,83,0,0,0,0,0,0,0


In [34]:
X_test.head()

Unnamed: 0,age,bmi,bloodpressure,children,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
574,53.0,27.7,88,0,0,1,0,1,0,0
661,43.0,28.9,84,1,0,1,0,1,0,0
458,34.0,40.6,98,1,0,0,0,1,0,0
1023,51.0,41.3,98,0,0,0,0,0,0,0
958,28.0,25.1,103,0,0,0,0,1,0,0


## feature scaling

In [35]:
variables = X_train.columns.tolist()

In [36]:
variables

['age',
 'bmi',
 'bloodpressure',
 'children',
 'gender_male',
 'diabetic_Yes',
 'smoker_Yes',
 'region_northwest',
 'region_southeast',
 'region_southwest']

In [37]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=variables)
X_test = pd.DataFrame(scaler.transform(X_test), columns=variables)

In [38]:
X_train

Unnamed: 0,age,bmi,bloodpressure,children,gender_male,diabetic_Yes,smoker_Yes,region_northwest,region_southeast,region_southwest
0,0.523810,0.314208,0.300000,0.2,0.0,1.0,0.0,0.0,0.0,1.0
1,0.714286,0.721311,0.200000,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2,0.976190,0.478142,0.283333,0.4,0.0,0.0,0.0,1.0,0.0,0.0
3,0.190476,0.311475,0.200000,0.6,1.0,0.0,0.0,0.0,0.0,0.0
4,0.928571,0.289617,0.050000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1067,0.476190,0.480874,0.000000,0.2,1.0,1.0,0.0,1.0,0.0,0.0
1068,0.357143,0.431694,0.250000,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1069,0.380952,0.571038,0.850000,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1070,0.857143,0.508197,0.050000,0.2,0.0,0.0,0.0,1.0,0.0,0.0


In [39]:
X_train.to_csv("../processed/xtrain.csv", index=False)
X_test.to_csv("../processed/xtest.csv", index=False)

y_train.to_csv("../processed/ytrain.csv",index=False)
y_test.to_csv("../processed/ytest.csv", index=False)

In [40]:
joblib.dump(scaler, '../models/minmax_scaler.joblib')

['../models/minmax_scaler.joblib']