# Exploratory

In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import bokeh as bk

In [2]:
# Pin the numeric column dtypes explicitly rather than relying on inference.
column_dtypes = dict(
    age=np.int64,
    bmi=np.float64,
    children=np.int32,
    charges=np.float64,
)

# Read the raw insurance data as a dask DataFrame and preview the first rows.
ddf = dd.read_csv('data/insurance.csv', dtype=column_dtypes)
ddf.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Prepare the frame for modelling: drop `region` and turn the two string
# columns into 0/1 integer flags (smoker: 'yes' -> 1, sex: 'male' -> 1,
# anything else -> 0).
# Vectorised comparisons replace the row-wise `apply(..., axis=1)` lambdas:
# same values and int64 dtype, but no per-row Python call and no `meta` hint.
# NOTE: this cell rebinds `ddf`, so run it once per fresh kernel (a second run
# would no longer find `region`).
ddf = ddf.drop('region', axis=1)
ddf['smoker'] = (ddf['smoker'] == 'yes').astype('int64')
ddf['sex'] = (ddf['sex'] == 'male').astype('int64')

In [4]:
# Count missing values per column; nothing is evaluated until .compute() is called.
missing_per_column = ddf.isnull().sum()
missing_per_column.compute()

age         0
sex         0
bmi         0
children    0
smoker      0
charges     0
dtype: int64

In [5]:
# Materialise the encoded dask frame as pandas, then persist it without the index column.
cleaned = ddf.compute()
cleaned.to_csv('data/insurance_clean.csv', index=False)

In [6]:
# Preview the first five rows after encoding to confirm the new 0/1 columns.
ddf.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,0,27.9,0,1,16884.924
1,18,1,33.77,1,0,1725.5523
2,28,1,33.0,3,0,4449.462
3,33,1,22.705,0,0,21984.47061
4,32,1,28.88,0,0,3866.8552


# Prediction

In [7]:
from sklearn_ultimate import sklearn_models_ultimate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Materialise the dask frame as pandas, then separate the target from the features.
df = ddf.compute()
y = df['charges']
x = df.drop(columns=['charges'])

# sklearn_models_ultimate is a project helper; judging by the output below it
# compares several regressors and returns a dict of models keyed by name plus
# a score table (presumably the models are already fitted - confirm in the helper).
# NOTE(review): no random seed is visible here - confirm the helper seeds itself
# if the scores need to be reproducible.
models, score_table = sklearn_models_ultimate(x, y)
score_table

Best Mean Squared Error: Gradient Boosting
Best R2 Score: Gradient Boosting
Best Mean Absolute Error: Gradient Boosting
Best Mean Absolute Percentage Error: Gradient Boosting
Best Median Absolute Error: Decision Tree
Best Explained Variance Score: Gradient Boosting
Best Max error: Gradient Boosting


Unnamed: 0,Model,Mean Squared Error,R2 Score,Mean Absolute Error,Mean Absolute Percentage ErrorE,Median Absolute Error,Explained Variance Score,Max error
0,Linear Regression,33979260.0,0.78113,4213.484798,0.480356,2756.331638,0.781471,22819.888106
1,Decision Tree,46059850.0,0.703316,3246.95052,0.370064,547.969775,0.707077,22439.12182
2,Random Forest,21544010.0,0.861229,2473.659957,0.305269,938.82886,0.862133,21734.941039
3,Gradient Boosting,18858980.0,0.878524,2424.226885,0.3013,1424.752687,0.878821,20822.340745
4,Support Vector Machine,166462300.0,-0.07223,8590.133629,1.118548,5326.40632,0.004651,54164.480807
5,K Neighbors,124634000.0,0.197198,7418.718703,0.866796,4872.375425,0.207486,53044.44659


In [8]:
# Save best model
# Gradient Boosting wins most metrics in the score table above, so that entry
# of `models` is the one pickled.
from pathlib import Path
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import pickle

model_path = Path('models/gradient_boosting.pkl')
# Create the output directory first so a fresh checkout without `models/`
# does not fail with FileNotFoundError after all the training work is done.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pickle.dump(models['Gradient Boosting'], f)