In [20]:
import json
import os
import sys
import warnings

import numpy as np
import pandas as pd
from datetime import datetime
from pprint import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides the estimator convergence
# warnings discussed in the "Machine Learning" section — consider narrowing it.
warnings.filterwarnings('ignore')

In [21]:
# Make ../src importable from this notebook (the `model` module imported in
# the next cell is presumably located there — confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
src_dir = os.path.join('..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [22]:
import importlib
import model
# Re-execute the already-imported `model` module so that edits made to its
# source since the first import are picked up without restarting the kernel.
importlib.reload(model)

# Pull the project helpers into the notebook namespace *after* the reload so
# the names bind to the freshly reloaded definitions.
from model import get_model_params, timer, measure_prediction_time, apply_ml_model, save_model_parameters, save_model_metrics

# set model parameters and capture data

In [23]:
# Project data folders, relative to the notebook's working directory.
inputs = os.path.join('..', 'data', '03_processed')
models_reports = os.path.join('..', 'data', '04_models')
model_outputs = os.path.join('..', 'data', '05_model_output')
reports = os.path.join('..', 'data', '06_reporting')


def _read_processed(file_name):
    """Read one CSV from the processed-data folder, using its 'id' column as index."""
    return pd.read_csv(os.path.join(inputs, file_name), index_col='id')


X_train = _read_processed('X_train.csv')
X_train_onehot = _read_processed('X_train_onehot.csv')
y_train = _read_processed('y_train.csv')

data_list = [X_train, X_train_onehot, y_train]

# Sanity check: print the (rows, columns) shape of every loaded frame.
for df in data_list:
    print(df.shape)

(354, 14)
(354, 14)
(354, 1)


In [24]:
# Preview the first five rows of the one-hot encoded training features.
X_train_onehot.head(5)

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,if_anomaly
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,1
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,1
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,1
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,1
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,1


# Machine Learning

Note on convergence warnings (the solver may report that it did not converge): see https://stackoverflow.com/questions/20681864/lasso-on-sklearn-does-not-converge

In [25]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression

In [26]:
# Results of every experiment in this notebook, keyed by model_type.
ml_dict = {}
# scikit-learn scorer name handed to apply_ml_model / get_model_params.
scoring = 'neg_mean_squared_error'

# Specify the hyperparameter space.
# The `model__` prefix presumably targets a pipeline step named "model" that is
# built inside apply_ml_model — confirm against src/model.py.
parameters = {
    'model__alpha': np.linspace(0.2, 1, 5),
    'model__l1_ratio': np.linspace(0, 1, 5),
    'model__random_state': [42],
}

# The estimator is created with its defaults; in particular `tol` is NOT
# overridden here (scikit-learn's default is 1e-4). Pass tol/max_iter to the
# constructor if the convergence warnings need to be addressed.
ml_model = ElasticNet()

# NOTE(review): c_space is not referenced by any visible cell; it looks like a
# leftover from a LogisticRegression experiment. Kept in case a later cell reads it.
c_space = np.logspace(-5, 1, 5)

# The original cell first assigned 'log' and then immediately overwrote it
# with None; only the effective value is kept.
# NOTE(review): this flag is not passed to apply_ml_model in the visible cells.
do_transform_label = None

## test with different preprocessing steps
There are two different feature sets: `X_train_onehot` was one-hot encoded, while `X_train` was ordinal encoded. The former is aimed at linear regression models; the latter is generally used for tree-based models.

The `columns` argument lets me choose which column groups to use. For instance, I might exclude the collinear variables identified by the VIF function in notebook 5, which is useful for linear regression models.

```python
treat_collinearity = False, do_build_polynomals=False, do_treat_skewness=False
```

In [27]:
# Baseline experiment: every one-hot encoded column, no polynomial features,
# no skewness treatment.
model_type = 'reg'
ml_dict[model_type] = {}
columns = X_train_onehot.columns

# Run the search and time it (apply_ml_model returns the fitted search object
# plus the training and prediction timings).
clf, train_time, prediction_time = apply_ml_model(
    X_train_onehot, y_train, columns, ml_model, parameters, scoring,
    do_build_polynomals=False,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'), scaler=StandardScaler(),
)
ml_dict[model_type]['train_time'] = train_time
ml_dict[model_type]['prediction_time'] = prediction_time

# Record the winning hyperparameters and their score.
best_params, best_score = get_model_params(clf, scoring)
ml_dict[model_type]['best_params'] = best_params
ml_dict[model_type]['best_score'] = best_score
pprint(ml_dict)

# Persist the fitted search and the collected metrics.
save_model_parameters(models_reports, model_type, clf)
save_model_metrics(model_outputs, model_type, ml_dict)

test type: False
{'reg': {'best_params': {'model__alpha': 0.2,
                         'model__l1_ratio': 0.75,
                         'model__random_state': 42},
         'best_score': 12.325632143994682,
         'prediction_time': 0.0002005,
         'train_time': 1.704025}}


```python
treat_collinearity = True, do_build_polynomals=False, do_treat_skewness=False,
```

In [28]:
# Experiment intended to drop collinear columns before fitting.
model_type = 'reg_nocol'
ml_dict[model_type] = {}

# FIXME(review): the reduced column list below is commented out, so this run
# still receives the full `columns` set defined for the 'reg' experiment. The
# recorded output shows the same best_params and best_score as 'reg', i.e. no
# collinearity treatment is actually applied here.
# columns_nocol = dfs_dict['X_train_oh_nocol'].columns.to_list()

clf, train_time, prediction_time = apply_ml_model(
    X_train_onehot, y_train, columns, ml_model, parameters, scoring,
    do_build_polynomals=False,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'), scaler=StandardScaler(),
)
ml_dict[model_type]['train_time'] = train_time
ml_dict[model_type]['prediction_time'] = prediction_time

# Record the winning hyperparameters and their score.
best_params, best_score = get_model_params(clf, scoring)
ml_dict[model_type]['best_params'] = best_params
ml_dict[model_type]['best_score'] = best_score
pprint(ml_dict)

# Persist the fitted search and the collected metrics.
save_model_parameters(models_reports, model_type, clf)
save_model_metrics(model_outputs, model_type, ml_dict)

test type: False
{'reg': {'best_params': {'model__alpha': 0.2,
                         'model__l1_ratio': 0.75,
                         'model__random_state': 42},
         'best_score': 12.325632143994682,
         'prediction_time': 0.0002005,
         'train_time': 1.704025},
 'reg_nocol': {'best_params': {'model__alpha': 0.2,
                               'model__l1_ratio': 0.75,
                               'model__random_state': 42},
               'best_score': 12.325632143994682,
               'prediction_time': 0.0003001,
               'train_time': 1.940997}}


I might use the alternative encoding just to demonstrate the impact on the score.