## Model Training

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
insurance = pd.read_csv('./expenses.csv')

In [3]:
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']

X = insurance[features]

In [5]:
y = insurance['charges']

y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [6]:
#split data 75/25, random state = 0 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.25,
                                                    random_state = 0
                                                   )
np.sum(y_test)

4500314.176879

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [8]:
#Machine learning pipeline

#Age, BMI, Childern - std 
#sex, smoker region - onehot encoding & dummy variable

In [9]:
#sex, smoker region
imp_ohe = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                       OneHotEncoder(sparse=False))
#Age, BMI, Childern
imp_std = make_pipeline(SimpleImputer(), 
                       StandardScaler())

In [10]:
preprocessor = make_column_transformer((imp_ohe, ['sex', 'smoker', 'region']), 
                       (imp_std, ['age', 'bmi', 'children']), 
                       remainder = 'passthrough')

In [11]:
np.set_printoptions(edgeitems = 6, suppress = True) 
# onehot encoding ('sex', 'smoker', 'region') /standardization (age', 'bmi', 'children') 

preprocessor.fit_transform(X_train).round(3)

array([[ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -0.515, -0.181, -0.064],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         1.549, -1.393, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
        -1.44 , -0.982, -0.064],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -1.369, -1.011, -0.892],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,
        -0.942, -1.363, -0.892],
       [ 0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         1.051, -0.902,  1.593],
       ...,
       [ 0.   ,  1.   ,  0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -1.511, -1.469, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -0.871, -0.753, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
         0.197,  0.843,  0.765],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.

In [12]:
preprocessor.fit_transform(X_train).round(3)

array([[ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -0.515, -0.181, -0.064],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         1.549, -1.393, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
        -1.44 , -0.982, -0.064],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -1.369, -1.011, -0.892],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,
        -0.942, -1.363, -0.892],
       [ 0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         1.051, -0.902,  1.593],
       ...,
       [ 0.   ,  1.   ,  0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -1.511, -1.469, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -0.871, -0.753, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
         0.197,  0.843,  0.765],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.

In [13]:
from sklearn.linear_model import ElasticNet
Elsnet = ElasticNet(alpha = 0.1, random_state = 0)

In [14]:
pipe = make_pipeline(preprocessor, Elsnet) 
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['sex', 'smoker', 'region']),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                

In [15]:
y_pred = pipe.predict(X_test)

In [16]:
from sklearn import set_config
set_config(display='diagram')

pipe

  self._df_columns[remainder_columns].tolist()


## Performance when alpha = 0.1 without Cross-Validation

In [17]:
#R-square
from sklearn.metrics import r2_score
r2_score(y_test, y_pred).round(3)

0.778

In [18]:
#RMSE
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred).round(3)

34976507.497

In [19]:
#MAPE 
(np.mean(np.abs((y_test - y_pred)/y_test))*100).round(3)

48.574

In [20]:
#PR 
sum(y_pred)/sum(y_test)

0.9988838507351012

## Performance when alpha changes, without Cross-Validation

In [21]:
#Change alpha from 0 to 1 (0, 0.1, 0.2, 0.3, ... 1.0)
from sklearn.model_selection import cross_val_score

In [22]:
for k in np.arange(0, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k, random_state = 0)
    y_pred = pipe.predict(X_test)
    rsquare = r2_score(y_test, y_pred).round(3)
    rmse = mean_squared_error(y_test, y_pred).round(3)
    mape = (np.mean(np.abs((y_test - y_pred)/y_test))*100).round(3)
    pr = (sum(y_pred)/sum(y_test)).round(3)
    
    
    print('alpha =',k.round(2) , '; R-square:', rsquare, '; RMSE:', rmse, '; MAPE:', mape, '; Prediction ratio:', pr)

alpha = 0.0 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.1 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.2 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.3 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.4 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.5 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.6 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.7 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.8 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.9 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 1.0 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction r

## Performance when alpha changes and Cross-Validation = 10

In [23]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, shuffle=True, random_state = 0)


In [24]:
for k in np.arange(0.1, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k, random_state = 0)
    pipe = make_pipeline(preprocessor, Elsnet) 
    score = cross_val_score(pipe, X, y, cv=cv, scoring='r2').mean().round(3)
    
    print('k=',k.round(3), ';mean R-squared:', score.mean().round(3))

k= 0.1 ;mean R-squared: 0.727
k= 0.2 ;mean R-squared: 0.704
k= 0.3 ;mean R-squared: 0.676
k= 0.4 ;mean R-squared: 0.647
k= 0.5 ;mean R-squared: 0.618
k= 0.6 ;mean R-squared: 0.59
k= 0.7 ;mean R-squared: 0.565
k= 0.8 ;mean R-squared: 0.541
k= 0.9 ;mean R-squared: 0.518
k= 1.0 ;mean R-squared: 0.498


In [25]:
for k in np.arange(0.1, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k, random_state = 0)
    pipe = make_pipeline(preprocessor, Elsnet) 
    score = cross_val_score(pipe, X, y, cv=cv, scoring='neg_mean_squared_error').mean().round(3)
    
    print('k=',k.round(3), ';mean RMSE:', score.mean().round(3))

k= 0.1 ;mean RMSE: -38881165.144
k= 0.2 ;mean RMSE: -42492224.196
k= 0.3 ;mean RMSE: -46770879.186
k= 0.4 ;mean RMSE: -51165917.953
k= 0.5 ;mean RMSE: -55435369.962
k= 0.6 ;mean RMSE: -59477380.156
k= 0.7 ;mean RMSE: -63256857.119
k= 0.8 ;mean RMSE: -66769707.744
k= 0.9 ;mean RMSE: -70026921.686
k= 1.0 ;mean RMSE: -73045656.196


In [26]:
#MAPE

In [27]:
#PR 