## Model Training

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [71]:
insurance = pd.read_csv('./expenses.csv')

In [72]:
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [73]:
features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']

X = insurance[features]

In [74]:
y = insurance['charges']

y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [75]:
#split data 75/25, random state = 0 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.25,
                                                    random_state = 0
                                                   )
np.sum(y_test)

4500314.176879

In [76]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [77]:
#Machine learning pipeline

#Age, BMI, Childern - std 
#sex, smoker region - onehot encoding & dummy variable

In [78]:
#sex, smoker region
imp_ohe = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                       OneHotEncoder(sparse=False))
#Age, BMI, Childern
imp_std = make_pipeline(SimpleImputer(), 
                       StandardScaler())

In [79]:
preprocessor = make_column_transformer((imp_ohe, ['sex', 'smoker', 'region']), 
                       (imp_std, ['age', 'bmi', 'children']), 
                       remainder = 'passthrough')

In [80]:
np.set_printoptions(edgeitems = 6, suppress = True) 
# onehot encoding ('sex', 'smoker', 'region') /standardization (age', 'bmi', 'children') 

preprocessor.fit_transform(X_train).round(3)

array([[ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -0.515, -0.181, -0.064],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         1.549, -1.393, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
        -1.44 , -0.982, -0.064],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -1.369, -1.011, -0.892],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,
        -0.942, -1.363, -0.892],
       [ 0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         1.051, -0.902,  1.593],
       ...,
       [ 0.   ,  1.   ,  0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -1.511, -1.469, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -0.871, -0.753, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
         0.197,  0.843,  0.765],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.

In [81]:
preprocessor.fit_transform(X_train).round(3)

array([[ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -0.515, -0.181, -0.064],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         1.549, -1.393, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
        -1.44 , -0.982, -0.064],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,
        -1.369, -1.011, -0.892],
       [ 1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,
        -0.942, -1.363, -0.892],
       [ 0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         1.051, -0.902,  1.593],
       ...,
       [ 0.   ,  1.   ,  0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -1.511, -1.469, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
        -0.871, -0.753, -0.892],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
         0.197,  0.843,  0.765],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.

In [82]:
from sklearn.linear_model import ElasticNet
Elsnet = ElasticNet(alpha = 0.1, random_state = 0)

In [83]:
pipe = make_pipeline(preprocessor, Elsnet) 
pipe.fit(X_train, y_train)

  self._df_columns[remainder_columns].tolist()


In [85]:
y_pred = pipe.predict(X_test)

In [86]:
from sklearn import set_config
set_config(display='diagram')

pipe

  self._df_columns[remainder_columns].tolist()


## Performance when alpha = 0.1 without Cross-Validation

In [87]:
#R-square
from sklearn.metrics import r2_score
r2_score(y_test, y_pred).round(3)

0.778

In [88]:
import math

In [93]:
#RMSE
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
(math.sqrt(mse)).round(2)

AttributeError: 'float' object has no attribute 'round'

In [19]:
#MAPE 
(np.mean(np.abs((y_test - y_pred)/y_test))*100).round(3)

48.574

In [20]:
#PR 
sum(y_pred)/sum(y_test)

0.9988838507351012

## Performance when alpha changes, without Cross-Validation

In [21]:
#Change alpha from 0 to 1 (0, 0.1, 0.2, 0.3, ... 1.0)
from sklearn.model_selection import cross_val_score

In [22]:
for k in np.arange(0, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k, random_state = 0)
    y_pred = pipe.predict(X_test)
    rsquare = r2_score(y_test, y_pred).round(3)
    rmse = mean_squared_error(y_test, y_pred).round(3)
    mape = (np.mean(np.abs((y_test - y_pred)/y_test))*100).round(3)
    pr = (sum(y_pred)/sum(y_test)).round(3)
    
    
    print('alpha =',k.round(2) , '; R-square:', rsquare, '; RMSE:', rmse, '; MAPE:', mape, '; Prediction ratio:', pr)

alpha = 0.0 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.1 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.2 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.3 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.4 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.5 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.6 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.7 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.8 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 0.9 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction ratio: 0.999
alpha = 1.0 ; R-square: 0.778 ; RMSE: 34976507.497 ; MAPE: 48.574 ; Prediction r

## Performance when alpha changes and Cross-Validation = 10

In [23]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, shuffle=True, random_state = 0)


In [24]:
for k in np.arange(0.1, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k,l1_ratio = ,  random_state = 0)
    pipe = make_pipeline(preprocessor, Elsnet) 
    score = cross_val_score(pipe, X, y, cv=cv, scoring='r2').mean().round(3)
    
    print('k=',k.round(3), ';mean R-squared:', score.mean().round(3))

k= 0.1 ;mean R-squared: 0.727
k= 0.2 ;mean R-squared: 0.704
k= 0.3 ;mean R-squared: 0.676
k= 0.4 ;mean R-squared: 0.647
k= 0.5 ;mean R-squared: 0.618
k= 0.6 ;mean R-squared: 0.59
k= 0.7 ;mean R-squared: 0.565
k= 0.8 ;mean R-squared: 0.541
k= 0.9 ;mean R-squared: 0.518
k= 1.0 ;mean R-squared: 0.498


In [64]:
for k in np.arange(0.1, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k, random_state = 0)
    pipe = make_pipeline(preprocessor, Elsnet) 
    score = cross_val_score(pipe, X, y, cv=cv, scoring='neg_root_mean_squared_error').mean().round(3)
    
    print('k=',k.round(3), ';mean RMSE:', score.mean().round(3))

k= 0.1 ;mean RMSE: -6213.312
k= 0.2 ;mean RMSE: -6490.413
k= 0.3 ;mean RMSE: -6804.749
k= 0.4 ;mean RMSE: -7113.987
k= 0.5 ;mean RMSE: -7402.698
k= 0.6 ;mean RMSE: -7666.514
k= 0.7 ;mean RMSE: -7905.596
k= 0.8 ;mean RMSE: -8121.765
k= 0.9 ;mean RMSE: -8317.377
k= 1.0 ;mean RMSE: -8494.789


In [51]:
#MAPE
from sklearn.model_selection import cross_val_predict

for k in np.arange(0.1, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k, random_state = 0)
    pipe = make_pipeline(preprocessor, Elsnet) 
    y_pred = cross_val_predict(pipe, X, y, cv=cv)

    MAPE = (np.mean(np.abs((y - y_pred)/y_test))*100).round(3)
    #print(y_pred)
    #print(y)
    print('k=',k.round(3), ';MAPE:', MAPE.round(3))

k= 0.1 ;MAPE: 47.412
k= 0.2 ;MAPE: 54.549
k= 0.3 ;MAPE: 61.471
k= 0.4 ;MAPE: 67.571
k= 0.5 ;MAPE: 72.771
k= 0.6 ;MAPE: 77.29
k= 0.7 ;MAPE: 81.307
k= 0.8 ;MAPE: 84.896
k= 0.9 ;MAPE: 88.136
k= 1.0 ;MAPE: 91.064


In [45]:
#PR 

for k in np.arange(0.1, 1.01, 0.1): 
    Elsnet = ElasticNet(alpha = k, random_state = 0)
    pipe = make_pipeline(preprocessor, Elsnet) 
    y_pred = cross_val_predict(pipe, X, y, cv=cv)
    PR = (sum(y_pred)/sum(y)).round(3)
  
    print('k=',k.round(3), ';PR:', PR.round(3))

k= 0.1 ;PR: 1.0
k= 0.2 ;PR: 1.0
k= 0.3 ;PR: 1.0
k= 0.4 ;PR: 1.0
k= 0.5 ;PR: 1.0
k= 0.6 ;PR: 1.0
k= 0.7 ;PR: 1.0
k= 0.8 ;PR: 1.0
k= 0.9 ;PR: 1.0
k= 1.0 ;PR: 1.0


## Performance when alpha, lambda changes and Cross-Validation = 10

In [57]:
#selecting the best model 
from sklearn.model_selection import GridSearchCV 

In [58]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(sparse=False))]),
                                    ['sex', 'smoker', 'region']),
                                   ('pipeline-2',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['age', 'bmi', 'children'])])),
  ('elasticnet', ElasticNet(random_state=0))],
 'verbose': False

In [60]:
param_grid = { 'elasticnet__alpha': arange(0, 1, 0.1),
              'elasticnet__l1_ratio': arange(0, 1, 0.01)
}

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99])

In [65]:
#RMSE
gs = GridSearchCV(pipe, param_grid = param_grid, cv = cv, scoring='neg_root_mean_squared_error', n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)

{'elasticnet__alpha': 0.0, 'elasticnet__l1_ratio': 0.0}
-6202.206619992716


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [70]:
#R2
gs = GridSearchCV(pipe, param_grid = param_grid, cv = cv, scoring='r2', n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)

{'elasticnet__alpha': 0.30000000000000004, 'elasticnet__l1_ratio': 0.99}
0.7186805826249072


In [None]:
#MAPE
gs = GridSearchCV(pipe, param_grid = param_grid, cv = cv, scoring='r2', n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)

In [None]:
#PR
gs = GridSearchCV(pipe, param_grid = param_grid, cv = cv, scoring='r2', n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)