In [39]:
import json
import os
import sys
import warnings

import numpy as np
import pandas as pd
from datetime import datetime
from pprint import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')

In [40]:
# Make the project's `src` directory importable so the local modules
# (utils, model, params) can be imported by the following cells.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory — confirm the notebook is always started from its own folder.
src_dir = os.path.join('..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [41]:
import importlib
import utils, model
importlib.reload(utils)
importlib.reload(model)

from utils import capture_data, build_data_dict
from model import timer, measure_prediction_time, apply_ml_model, save_model_parameters, save_model_metrics
from model import get_folders

import params
importlib.reload(params)
from params import ProjectParameters

# Set model parameters and capture data

In [42]:
# Read the target type and the scoring metric from the project configuration.
target_type = ProjectParameters().target_type
scoring = ProjectParameters().scoring
print('SELECTED TARGET AND SCORING ARE:', target_type, ',', scoring)

# get_folders() returns four values, unpacked here in the order the helper
# provides them; this cell itself only uses `inputs`.
inputs, models_reports, model_outputs, reports = get_folders()

# Build a dict holding one entry per requested dataset name.
data_list = ['X_train', 'y_train']
dfs_dict = build_data_dict(inputs, data_list)

# Quick sanity check: show the shape of every loaded entry.
for dataset_name in data_list:
    loaded = dfs_dict[dataset_name]
    print(loaded.shape)

SELECTED TARGET AND SCORING ARE: binary , accuracy
loading data into dictionary
(32534, 13)
(32534, 1)


In [43]:
# Replace the single-column label frame with a plain Python list of its 'y' values.
# The isinstance guard makes the cell safe to re-run: without it, a second
# execution would index the already-converted list with 'y' and raise TypeError.
if not isinstance(dfs_dict['y_train'], list):
    dfs_dict['y_train'] = dfs_dict['y_train']['y'].to_list()

In [44]:
# Number of training labels held under 'y_train'.
len(dfs_dict['y_train'])

32534

# Machine Learning

In [45]:
# NOTE(review): `warnings` is already imported and the same blanket 'ignore'
# filter is already installed in the first cell, so this cell is redundant.
# A blanket 'ignore' also hides library deprecation warnings — consider
# narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [47]:
# Empty container that collects this notebook's results, and the label
# under which the random-forest entries are stored in it.
ml_dict = dict()
model_type = 'tree_randomforest'

In [48]:
def get_model_params(classifier, target=None):
    """Return the best hyperparameters and best score of a fitted search object.

    Parameters
    ----------
    classifier : object
        Any object exposing ``best_params_`` and ``best_score_``.
    target : str, optional
        Target type used to decide whether the score sign is flipped.
        When omitted, the notebook-level ``target_type`` is used, so existing
        one-argument calls behave as before.

    Returns
    -------
    tuple
        ``(best_params, best_score)``. For ``'regression'`` the score is
        negated (presumably because the configured scorer is a negated error
        metric — confirm against the project's scoring setting).

    Notes
    -----
    The score is negated on a local copy. Previously ``classifier.best_score_``
    was overwritten in place, so every additional call in regression mode
    flipped the sign again and left the search object in an altered state.
    NOTE(review): if a later consumer of ``classifier`` relied on the
    overwritten attribute, it now sees the original value — verify.
    """
    if target is None:
        # Fall back to the notebook-level setting for backward compatibility.
        target = target_type

    best_score = classifier.best_score_
    if target == 'regression':
        best_score = -best_score
    return classifier.best_params_, best_score

In [49]:
# Preview the first five rows of the training features.
dfs_dict['X_train'].head(5)

Unnamed: 0_level_0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,39.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,2174.0,0.0,40.0,0.0
1,50.0,1.0,0.0,13.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,13.0,0.0
2,38.0,2.0,1.0,9.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0
3,53.0,2.0,2.0,7.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,40.0,0.0
4,28.0,2.0,0.0,13.0,1.0,3.0,2.0,1.0,1.0,0.0,0.0,40.0,1.0


In [50]:
print(datetime.now())
# Hyperparameter grid handed to apply_ml_model. The `model__` prefix targets a
# pipeline step named "model" (presumably assembled inside apply_ml_model — confirm).
parameters = {
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # RandomForestClassifier it was an alias of "sqrt", so "sqrt" keeps the same
    # candidate for the binary target. For RandomForestRegressor "auto" meant
    # "all features", which the `None` entry already covers.
    'model__max_features': [1, 2, "sqrt", "log2", None],
    'model__n_estimators': [100, 200, 300],
    # 'model__n_estimators': [10],  # it allows faster tests on pipeline
    'model__random_state': [42],
}

# Choose the estimator and the label transformation from the configured target type.
if target_type == 'regression':
    ml_model = RandomForestRegressor()
    do_transform_label = 'log'
elif target_type == 'binary':
    ml_model = RandomForestClassifier()
    do_transform_label = None
else:
    # Fail here with a clear message instead of a NameError on `ml_model` below.
    raise ValueError(
        f"Unsupported target_type: {target_type!r} (expected 'regression' or 'binary')"
    )

df_x = dfs_dict['X_train']
df_y = dfs_dict['y_train']
key = 'standard'
print('running with key:', key)
ml_dict[key] = {model_type: {}}

# apply_ml_model returns the fitted search object plus train and prediction timings.
# NOTE(review): the first argument is the whole `dfs_dict`, not `df_x` — confirm
# that is what apply_ml_model expects.
clf, ml_dict[key][model_type]['train_time'], ml_dict[key][model_type]['prediction_time'] = apply_ml_model(
    dfs_dict, df_y, ml_model, parameters, scoring,
    encoding='ordinal', treat_collinearity=False, do_build_polynomals=True,
    do_transform_label=do_transform_label,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'), scaler=StandardScaler(), smote=False,
    testing=True)

ml_dict[key][model_type]['best_params'], ml_dict[key][model_type]['best_score'] = get_model_params(clf)

2021-05-21 17:39:31.711866
running with key: standard
test type: True
number of columns before building polynomials: 13
number of columns after building polynomials: 20
(32534, 20) (32534, 20)


In [51]:
# Pretty-print the nested results dict (key -> model type -> timings, best params, best score).
print('RESULTS FOR TREE MODEL')
pprint(ml_dict)

RESULTS FOR TREE MODEL
{'standard': {'tree_randomforest': {'best_params': {'model__max_features': None,
                                                    'model__n_estimators': 200,
                                                    'model__random_state': 42},
                                    'best_score': 0.8215713589153747,
                                    'prediction_time': 0.0018994,
                                    'train_time': 720.936488}}}


# Save model parameters and metrics

In [52]:
# Hand the search object and the collected results dict to the project's save helpers.
# NOTE(review): output locations and file formats are decided inside
# save_model_parameters / save_model_metrics, which are not visible here — confirm there.
save_model_parameters(models_reports, model_type, clf)
save_model_metrics(model_outputs, model_type, ml_dict)