In [22]:
import json
import os
import sys
import warnings

import numpy as np
import pandas as pd
from datetime import datetime
from pprint import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')

In [23]:
# Put the sibling `src` folder on the import path so modules stored there can
# be imported from this notebook. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
src_dir = os.path.join('..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [24]:
# Import the project's `model` module and reload it right away, so that edits
# made to its source are picked up without restarting the kernel.
import importlib

import model

importlib.reload(model)

# Helper functions used by the cells below (imported after the reload so the
# names refer to the freshly loaded definitions).
from model import (
    get_model_params,
    timer,
    measure_prediction_time,
    apply_ml_model,
    save_model_parameters,
    save_model_metrics,
)

# Set data paths and load the training data

In [25]:
# Folder layout of the project's data directory, relative to the notebook.
data_root = os.path.join('..', 'data')
inputs = os.path.join(data_root, '03_processed')
models_reports = os.path.join(data_root, '04_models')
model_outputs = os.path.join(data_root, '05_model_output')
reports = os.path.join(data_root, '06_reporting')

# Read the three processed training tables; each one is indexed by its 'id' column.
X_train, X_train_onehot, y_train = (
    pd.read_csv(os.path.join(inputs, csv_name), index_col='id')
    for csv_name in ('X_train.csv', 'X_train_onehot.csv', 'y_train.csv')
)

data_list = [X_train, X_train_onehot, y_train]

# Sanity check: print (rows, columns) for every loaded frame.
for df in data_list:
    print(df.shape)

(354, 14)
(354, 14)
(354, 1)


# Machine Learning

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Container that collects the results of this run, and the key under which
# this model's entry is stored in it.
ml_dict = dict()
model_type = 'tree_rf'

In [28]:
# Feature names, in the column order of the training frame, as a plain Python list.
columns = X_train.columns.tolist()

In [29]:
# Metric name handed to the search helper and to get_model_params.
scoring = 'neg_mean_squared_error'

# Choose between a single small forest (fast pipeline smoke test) and the full
# sweep over forest sizes. The original dict literal listed
# 'model__n_estimators' twice; Python keeps only the last duplicate key, so
# the [100, 200, 300] entry was silently discarded. The choice is now explicit
# and the default reproduces the previous effective behaviour ([10]).
use_quick_grid = True
n_estimators_grid = [10] if use_quick_grid else [100, 200, 300]

# Specify the hyperparameter space
# (the 'model__' prefix presumably addresses a pipeline step named 'model'
# built inside apply_ml_model — TODO confirm).
# NOTE(review): per the scikit-learn docs, "auto" behaves like None for
# RandomForestRegressor and is no longer accepted from scikit-learn 1.3 on;
# drop it from the list when upgrading.
parameters = {'model__max_features': [1, 2, "auto", "log2", None],
              'model__n_estimators': n_estimators_grid,
              'model__random_state': [42]}

ml_model = RandomForestRegressor()
do_transform_label = None

# Run the project's training helper; it returns the fitted object together
# with the measured training and prediction times, which are stored under
# this model's entry in ml_dict.
ml_dict[model_type] = {}
clf, ml_dict[model_type]['train_time'], ml_dict[model_type]['prediction_time'] = apply_ml_model(
    X_train, y_train, columns, ml_model, parameters, scoring,
    do_build_polynomals=False, do_transform_label=do_transform_label,
    do_treat_skewness=False,
    imputation=Imputer(strategy='median'), scaler=None, smote=False,
    testing=True)

# Record the selected hyperparameters, their score and the feature list used.
ml_dict[model_type]['best_params'], ml_dict[model_type]['best_score'] = get_model_params(clf, scoring)
ml_dict[model_type]['columns'] = columns

test type: True
(354, 14) (354, 14)


In [30]:
# Banner line followed by a pretty-printed dump of everything collected in ml_dict.
print('RESULTS FOR TREE MODEL')
pprint(ml_dict, indent=1, width=80)

RESULTS FOR TREE MODEL
{'tree_rf': {'best_params': {'model__max_features': 'auto',
                             'model__n_estimators': 10,
                             'model__random_state': 42},
             'best_score': 10.936919215291752,
             'columns': ['crim',
                         'zn',
                         'indus',
                         'chas',
                         'nox',
                         'rm',
                         'age',
                         'dis',
                         'rad',
                         'tax',
                         'ptratio',
                         'b',
                         'lstat',
                         'if_anomaly'],
             'prediction_time': 0.0003,
             'train_time': 0.819009}}


# save model parameters and metrics

In [31]:
# Persist the outcome of this run through the project's helpers from `model`:
#   - save_model_parameters receives the models folder path (models_reports),
#     the model key and `clf` (the object returned by apply_ml_model);
#   - save_model_metrics receives the model-output folder path (model_outputs),
#     the model key and the collected results dict `ml_dict`.
# NOTE(review): file names and formats are decided inside the helpers —
# confirm in the `model` module.
save_model_parameters(models_reports, model_type, clf)
save_model_metrics(model_outputs, model_type, ml_dict)