In [4]:
import json
import os
import sys
import warnings

import numpy as np
import pandas as pd
from datetime import datetime
from pprint import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')

In [5]:
# Make the project's ../src folder importable (path is relative to the
# current working directory). Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
src_path = os.path.join('..', 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

In [6]:
import importlib
import utils, model
importlib.reload(utils)
importlib.reload(model)

from utils import capture_data, build_data_dict
from model import timer, measure_prediction_time, apply_ml_model, save_model_parameters, save_model_metrics
from model import get_folders

import params
importlib.reload(params)
from params import ProjectParameters

# set model parameters and capture data

In [7]:
# Read the target type and scoring metric from a single ProjectParameters
# instance so both settings come from the same configuration object.
project_params = ProjectParameters()
target_type = project_params.target_type
scoring = project_params.scoring
print('SELECTED TARGET AND SCORING ARE:', target_type, ',', scoring)

# Project folders; `inputs` is where the prepared tables are loaded from,
# `models_reports` and `model_outputs` are handed to the save helpers below.
inputs, models_reports, model_outputs, reports = get_folders()

# Names of the prepared training tables to load into a dictionary.
data_list = ['X_train', 'X_train_oh', 'X_train_oh_nocol', 'y_train']
dfs_dict = build_data_dict(inputs, data_list)

# Sanity check: print the shape of every loaded table.
for table_name in data_list:
    print(dfs_dict[table_name].shape)

SELECTED TARGET AND SCORING ARE: binary , accuracy
loading data into dictionary
(32534, 13)
(32534, 96)
(32534, 88)
(32534, 1)


In [8]:
# Replace the target table by a plain list of the values in its 'y' column.
# Guarded so the cell is idempotent: on a second run 'y_train' is already a
# list, and indexing it with 'y' would raise a TypeError.
if not isinstance(dfs_dict['y_train'], list):
    dfs_dict['y_train'] = dfs_dict['y_train']['y'].to_list()

In [9]:
# Preview the first rows of the 'X_train' table.
dfs_dict['X_train'].head()

Unnamed: 0_level_0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,39.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,2174.0,0.0,40.0,0.0
1,50.0,1.0,0.0,13.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,13.0,0.0
2,38.0,2.0,1.0,9.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0
3,53.0,2.0,2.0,7.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,40.0,0.0
4,28.0,2.0,0.0,13.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,40.0,1.0


# Machine Learning

convergence warning: https://stackoverflow.com/questions/20681864/lasso-on-sklearn-does-not-converge

In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression

In [11]:
# Label for this model family; used as the inner key of ml_dict and passed to
# the save helpers at the end of the notebook.
model_type = 'reg'
# Collects timings, best parameters and best score, keyed by run key and model_type.
ml_dict = {}

In [12]:
def get_model_params(classifier):
    """Return the best hyperparameters and best score of a fitted search object.

    Parameters
    ----------
    classifier : object
        Fitted search object exposing ``best_params_`` and ``best_score_``.

    Returns
    -------
    tuple
        ``(best_params, best_score)``. When the notebook-level ``target_type``
        is ``'regression'`` the score is returned with its sign flipped
        (presumably because the regression scorer is a negated error metric;
        confirm against the configured ``scoring``).

    Notes
    -----
    ``classifier`` is not modified. Negating ``best_score_`` on the object
    itself would flip the sign again on every repeated call (for example when
    a cell is re-run), so the sign change is applied to a local copy only.
    """
    best_score = classifier.best_score_
    if target_type == 'regression':
        best_score = -best_score
    return classifier.best_params_, best_score

In [13]:
# Specify the hyperparameter space and the estimator for the configured target type.
# Grid keys carry the 'model__' prefix (scikit-learn's '<step>__<param>' form);
# presumably the pipeline built inside apply_ml_model names its estimator step
# 'model' (confirm there).
if target_type == 'regression':
    parameters = {
        'model__alpha': np.linspace(0.2, 1, 5),
        'model__l1_ratio': np.linspace(0, 1, 5),
        'model__random_state': [42],
    }
    ml_model = ElasticNet()
    # set tol, default is 1e-4
    do_transform_label = 'log'
elif target_type == 'binary':
    c_space = np.logspace(-5, 1, 5)
    parameters = {
        'model__C': c_space,
        'model__penalty': ['l2'],
        'model__random_state': [42],
    }
    ml_model = LogisticRegression()
    do_transform_label = None
else:
    # Fail here with a clear message instead of leaving `parameters` and
    # `ml_model` undefined (or stale from an earlier run) for the cells below.
    raise ValueError(
        "unsupported target_type {!r}; expected 'regression' or 'binary'".format(target_type)
    )

df_x = dfs_dict['X_train']
df_y = dfs_dict['y_train']
# Key under which this run's results are stored in ml_dict.
key = 'standard'

print('running with key:', key)
ml_dict[key] = {model_type: {}}

running with key: standard


### test with different preprocessing steps

treat_collinearity = False, do_build_polynomals=False, do_treat_skewness=True, smote=True

In [14]:
# Run the search with skewness treatment and smote switched on and with
# collinearity treatment and polynomial features switched off.
clf, fit_seconds, predict_seconds = apply_ml_model(
    dfs_dict, df_y, ml_model, parameters, scoring,
    encoding='one-hot',
    treat_collinearity=False,
    do_build_polynomals=False,
    do_treat_skewness=True,
    imputation=Imputer(strategy='median'),
    scaler=StandardScaler(),
    smote=True,
    testing=False,
)

# Store timings and the best search result under the current run key.
run_metrics = ml_dict[key][model_type]
run_metrics['train_time'] = fit_seconds
run_metrics['prediction_time'] = predict_seconds
run_metrics['best_params'], run_metrics['best_score'] = get_model_params(clf)
pprint(ml_dict)

test type: False
{'standard': {'reg': {'best_params': {'model__C': 10.0,
                                      'model__penalty': 'l2',
                                      'model__random_state': 42},
                      'best_score': 0.8016538598669616,
                      'prediction_time': 0.0001999,
                      'train_time': 173.713724}}}


treat_collinearity = False, do_build_polynomals=True, do_treat_skewness=False, smote=False

In [15]:
# Run the search with polynomial features switched on and with collinearity
# treatment, skewness treatment and smote switched off.
clf, fit_seconds, predict_seconds = apply_ml_model(
    dfs_dict, df_y, ml_model, parameters, scoring,
    encoding='one-hot',
    treat_collinearity=False,
    do_build_polynomals=True,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'),
    scaler=StandardScaler(),
    smote=False,
)

# Store timings and the best search result under the current run key.
run_metrics = ml_dict[key][model_type]
run_metrics['train_time'] = fit_seconds
run_metrics['prediction_time'] = predict_seconds
run_metrics['best_params'], run_metrics['best_score'] = get_model_params(clf)
pprint(ml_dict)

test type: False
number of columns before building polynomials: 96
number of columns after building polynomials: 20
{'standard': {'reg': {'best_params': {'model__C': 10.0,
                                      'model__penalty': 'l2',
                                      'model__random_state': 42},
                      'best_score': 0.8258745541643039,
                      'prediction_time': 0.0002999,
                      'train_time': 5.404997}}}


treat_collinearity = False, do_build_polynomals=False, do_treat_skewness=False,

In [16]:
# Run the search with collinearity treatment, polynomial features and
# skewness treatment all switched off.
clf, fit_seconds, predict_seconds = apply_ml_model(
    dfs_dict, df_y, ml_model, parameters, scoring,
    encoding='one-hot',
    treat_collinearity=False,
    do_build_polynomals=False,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'),
    scaler=StandardScaler(),
)

# Store timings and the best search result under the current run key.
run_metrics = ml_dict[key][model_type]
run_metrics['train_time'] = fit_seconds
run_metrics['prediction_time'] = predict_seconds
run_metrics['best_params'], run_metrics['best_score'] = get_model_params(clf)
pprint(ml_dict)

test type: False
{'standard': {'reg': {'best_params': {'model__C': 10.0,
                                      'model__penalty': 'l2',
                                      'model__random_state': 42},
                      'best_score': 0.8435483771148393,
                      'prediction_time': 0.0002997,
                      'train_time': 15.293}}}


treat_collinearity = True, do_build_polynomals=False, do_treat_skewness=False,

In [17]:
# Run the search with collinearity treatment switched on and with polynomial
# features and skewness treatment switched off.
clf, fit_seconds, predict_seconds = apply_ml_model(
    dfs_dict, df_y, ml_model, parameters, scoring,
    encoding='one-hot',
    treat_collinearity=True,
    do_build_polynomals=False,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'),
    scaler=StandardScaler(),
)

# Store timings and the best search result under the current run key.
run_metrics = ml_dict[key][model_type]
run_metrics['train_time'] = fit_seconds
run_metrics['prediction_time'] = predict_seconds
run_metrics['best_params'], run_metrics['best_score'] = get_model_params(clf)
pprint(ml_dict)

test type: False
{'standard': {'reg': {'best_params': {'model__C': 10.0,
                                      'model__penalty': 'l2',
                                      'model__random_state': 42},
                      'best_score': 0.8474519743239458,
                      'prediction_time': 0.0003001,
                      'train_time': 13.172997}}}


In [18]:
# Show the metrics currently held in ml_dict. Every experiment cell above
# writes to the same key, so this reflects the most recently executed run only.
print('RESULTS FOR LINEAR MODEL')
pprint(ml_dict)

RESULTS FOR LINEAR MODEL
{'standard': {'reg': {'best_params': {'model__C': 10.0,
                                      'model__penalty': 'l2',
                                      'model__random_state': 42},
                      'best_score': 0.8474519743239458,
                      'prediction_time': 0.0003001,
                      'train_time': 13.172997}}}


# save model parameters and metrics

In [19]:
# Hand the most recently fitted search object (clf) and the collected metrics
# to the project's save helpers, with their respective output folders.
save_model_parameters(models_reports, model_type, clf)
save_model_metrics(model_outputs, model_type, ml_dict)

# tests on pipeline

In [20]:
# imports necessary for those tests
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
from scipy.special import boxcox1p, logit
from scipy.stats import norm, skew