In [339]:
import datetime
import glob
import os
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# from sklearn.linear_model import Lasso, LassoCV, LogisticRegressionCV, LogisticRegression
# from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, explained_variance_score, normalized_mutual_info_score, mutual_info_score
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.dummy import DummyRegressor
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
# from sklearn.ensemble import IsolationForest

from keras.losses import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras import backend as K
from keras import regularizers
from keras import optimizers
# from keras.layers import Lambda

from hyperopt import Trials, fmin, tpe, hp, STATUS_OK

from mlxtend.plotting import plot_learning_curves
from mlxtend.data import iris_data
from mlxtend.preprocessing import shuffle_arrays_unison

# Suppress every warning for the rest of the session (this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')
# Default all matplotlib figures to 8 x 8 inches
from pylab import rcParams
rcParams['figure.figsize'] = 8, 8

Custom model implementations and functions are stored in `src/dairyml.py`

In [2]:
from dairyml import PerfectClassifierMeanRegressor, plot_r2, BoundedLasso, BoundedLassoPlusLogReg, plot_coefficients

## Import the Data
Load the data from the pickle files created in `preproccess.ipynb`

In [3]:
# Unpickle the two objects saved by the preprocessing step; they are used below
# as the feature matrix (X) and the regression target (Y).
with open("../pkl/data/data_outliers_removed", "rb") as data_file:
    X, Y = pkl.load(data_file)

## Modelling with Feed-Forward Neural Network (FFNN)

We will use the splitter below for cross-validation: 5 folds, with shuffling

In [292]:
# Shared CV splitter: 5 shuffled folds with a fixed seed so the fold assignment is repeatable
splitter = KFold(
    n_splits=5,
    shuffle=True,
    random_state=7,
)

#### define r^2 metric for keras model

In [447]:
def r2_keras(y_true, y_pred):
    """Coefficient of determination (R^2) expressed with Keras backend ops.

    Computes 1 - SS_res / (SS_tot + epsilon) over the values it is given.

    NOTE(review): when registered as a Keras metric this is presumably
    evaluated per batch, so the logged value would be a batch average rather
    than the dataset-level R^2 -- confirm before comparing with sklearn's r2_score.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    centred_truth = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(centred_truth))
    # epsilon keeps the division finite when y_true is constant
    return 1 - residual_ss / (total_ss + K.epsilon())

### Function to build model

In [297]:
def create_model(hidden_layers, num_nodes, alpha=0.01, lr=.001, input_dim=None):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    hidden_layers : int
        Number of hidden ReLU layers. The first hidden layer is always added,
        so any value below 1 still yields one hidden layer.
    num_nodes : int
        Number of units in every hidden layer.
    alpha : float, default 0.01
        L2 penalty applied to the kernel weights of the hidden layers.
    lr : float, default 0.001
        Learning rate passed to the Adam optimizer.
    input_dim : int, optional
        Number of input features. When omitted, falls back to ``X.shape[1]``
        of the notebook-level ``X`` (the original behaviour), so existing
        callers are unaffected.

    Returns
    -------
    A compiled ``Sequential`` model with a single linear output unit, trained
    on mean squared error and reporting mean absolute error and ``r2_keras``.
    """
    if input_dim is None:
        # Backward-compatible default: size the input from the global feature matrix
        input_dim = X.shape[1]

    reg = regularizers.l2(alpha)

    model = Sequential()
    model.add(Dense(num_nodes, input_dim=input_dim, activation='relu', kernel_regularizer=reg))

    # Remaining hidden layers (none when hidden_layers <= 1)
    for _ in range(hidden_layers - 1):
        model.add(Dense(num_nodes, activation='relu', kernel_regularizer=reg))

    # Single linear unit for the regression output (not regularised)
    model.add(Dense(1, activation='linear'))

    adam = optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

    model.compile(loss='mean_squared_error', optimizer=adam, metrics=['mean_absolute_error', r2_keras])
    return model

### Scoring metrics for CV (r2 is the optimisation target; the rest are reported)

In [451]:
# Metrics evaluated on every CV fold; the keys become the `test_<key>` entries
# of the cross-validation results and the columns of the results table.
# NOTE(review): `spearman` and `pearson` are not defined or imported in the cells
# shown here -- presumably helpers from dairyml; confirm they are in scope.
scoring = {
    label: make_scorer(metric_fn)
    for label, metric_fn in [
        ('r2', r2_score),
        ('SRC', spearman),
        ('PCC', pearson),
        ('MI', mutual_info_score),
        ('MAE', mean_absolute_error),
    ]
}

In [464]:
# Reuse the results table if it already exists on disk; otherwise start an
# empty table with one column per metric.
try:
    overall_results = pd.read_csv('../reports/model_results.csv', index_col=0)
except FileNotFoundError:
    overall_results = pd.DataFrame(columns=list(scoring))

### Plot learning curves for train test split to estimate appropriate number of epochs

In [299]:
# model = create_model(1,60)

# # X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=7, shuffle=True)
# # plot_learning_curves(X_train, y_train, X_test, y_test, model, scoring='r2')

# X_shuf, Y_shuf = shuffle(X,Y,random_state=7)


# history = model.fit(X_shuf, Y_shuf, validation_split=0.2, epochs=200, batch_size=10, verbose=0)
# # list all data in history
# print(history.history.keys())


# plt.plot(history.history['r2_keras'])
# plt.plot(history.history['val_r2_keras'])
# plt.title('R2')
# plt.ylabel('R2')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.ylim(0,1)
# plt.show()

# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

### Define objective function (1-r2) for hyperopt

In [452]:
EPOCHS = 100

def objective(params):
    """Hyperopt objective: cross-validate one FFNN configuration.

    Parameters
    ----------
    params : dict
        Must contain 'hidden_layers', 'num_nodes', 'alpha' and 'lr'.
        'hidden_layers' and 'num_nodes' are cast to int because the quantised
        hyperopt distributions return floats.

    Returns
    -------
    dict
        'loss' (1 - mean test r2 across folds), the untouched 'params', the
        full 'cv_results' from cross_validate, and 'status' (STATUS_OK).
    """
    config = {
        'hidden_layers': int(params['hidden_layers']),
        'num_nodes': int(params['num_nodes']),
        'alpha': params['alpha'],
        'lr': params['lr'],
    }

    # Print configuration
    for name, value in config.items():
        print('{}: {}'.format(name, value))

    # Drop the models left over from earlier evaluations: every call builds one
    # network per CV fold, and without this they pile up in the backend session,
    # slowing the search down and growing memory use.
    K.clear_session()

    # build keras model with given configuration
    model = KerasRegressor(build_fn=create_model,
                           epochs=EPOCHS,
                           batch_size=5,
                           verbose=0,
                           **config)

    # get cv results
    results = cross_validate(model, X, Y, cv=splitter, scoring=scoring)

    # average r2 across folds, converted to a loss for hyperopt to minimise
    r2 = np.mean(results['test_r2'])
    loss = 1 - r2

    # print r2 result
    print('R^2: {}'.format(r2))
    print('\n')

    return {'loss': loss, 'params': params, 'cv_results': results, 'status': STATUS_OK}

### Define the parameter space

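Arguments after the label are: min, max and (for the quantised `hp.quniform` / `hp.qloguniform` distributions) step size. Bounds of the log-scaled distributions are given in log space.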

In [310]:
# Search space handed to hyperopt. The log-scaled distributions take their
# bounds in log space, hence the np.log calls.
space = {
    # 1 or 2 hidden layers
    'hidden_layers': hp.quniform('hidden_layers', 1, 2, 1),
    # layer width: whole numbers, sampled log-uniformly between 3 and 100
    'num_nodes': hp.qloguniform('num_nodes', np.log(3), np.log(100), 1),
    # Adam learning rate between 1e-4 and 1e-2
    'lr': hp.loguniform('lr', np.log(1e-4), np.log(1e-2)),
    # L2 penalty between 1e-4 and 1e-1
    'alpha': hp.loguniform('alpha', np.log(1e-4), np.log(1e-1)),
}

### Trials object to store results

In [311]:
# Collects the result dict returned by each objective() evaluation during the search
trials = Trials()

### Run hyperopt

In [312]:
MAX_EVALS = 20

# Minimise the objective (1 - mean CV r2) over the search space with TPE,
# recording every evaluation in `trials`
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
)

hidden_layers: 1
num_nodes: 3
alpha: 0.011919348950814248
lr: 0.008696534757119328
R^2: 0.5416689502982143


hidden_layers: 2
num_nodes: 3
alpha: 0.00020897271956071595
lr: 0.0008356388496771893
R^2: 0.6244959023328084


hidden_layers: 1
num_nodes: 40
alpha: 0.014550696296277088
lr: 0.0013488799122571282
R^2: 0.6273493122402498


hidden_layers: 2
num_nodes: 15
alpha: 0.00016259008566805636
lr: 0.00303136585715929
R^2: 0.670933823242754


hidden_layers: 1
num_nodes: 14
alpha: 0.08507975722250913
lr: 0.0003230701818184669
R^2: 0.5537815767459113


hidden_layers: 2
num_nodes: 11
alpha: 0.0038428763695687565
lr: 0.0002261146652768974
R^2: 0.6666974220239805


hidden_layers: 2
num_nodes: 69
alpha: 0.001217427134968658
lr: 0.0006437673256730367
R^2: 0.6891895200319381


hidden_layers: 1
num_nodes: 17
alpha: 0.027634252171355827
lr: 0.009623762199187966
R^2: 0.6086432254029224


hidden_layers: 2
num_nodes: 45
alpha: 0.04684415929397044
lr: 0.002343926589674816
R^2: 0.658396081868497


hidden_

In [441]:
def save_params(params):
    """Pickle `params` to a timestamped file under ../pkl/params/FFNN/.

    Parameters
    ----------
    params : object
        Any picklable object; the notebook passes the dict returned by fmin.

    Notes
    -----
    The file name has minute resolution, so two saves within the same minute
    write to the same file and the later one wins.
    """
    save_string = 'FFNN_best_params-' + datetime.datetime.now().strftime("%Y-%m-%d %H-%M %p")
    params_dir = "../pkl/params/FFNN/"

    # Create the directory tree if needed; an existing directory is fine
    os.makedirs(params_dir, exist_ok=True)

    with open(params_dir + save_string, "wb") as f:
        pkl.dump(params, f)

def load_params():
    """Load the most recently written parameter pickle from ../pkl/params/FFNN/.

    Returns
    -------
    object
        The unpickled contents of the newest file in the directory.

    Raises
    ------
    FileNotFoundError
        If the directory is missing or contains no files.
    """
    params_dir = "../pkl/params/FFNN/*"
    list_of_files = glob.glob(params_dir)
    if not list_of_files:
        raise FileNotFoundError('no saved parameter files match {}'.format(params_dir))

    # Rank by modification time: getctime is the metadata-change time on Unix and
    # the creation time on Windows, so it does not reliably identify the last save.
    latest_file = max(list_of_files, key=os.path.getmtime)
    print('loading {}'.format(latest_file))

    # SECURITY: unpickling can execute arbitrary code -- only load files this project wrote.
    with open(latest_file, "rb") as f:
        return pkl.load(f)

In [442]:
def save_trials(trials):
    """Pickle `trials` to a timestamped file under ../pkl/trials/FFNN/ and print the path.

    Parameters
    ----------
    trials : object
        Any picklable object; the notebook passes the hyperopt Trials instance.

    Notes
    -----
    The file name has minute resolution, so two saves within the same minute
    write to the same file and the later one wins.
    """
    save_string = 'FFNN_trials-' + datetime.datetime.now().strftime("%Y-%m-%d %H-%M %p")
    trials_dir = '../pkl/trials/FFNN/'

    # Create the directory tree if needed; an existing directory is fine
    os.makedirs(trials_dir, exist_ok=True)

    with open(trials_dir + save_string, "wb") as f:
        pkl.dump(trials, f)

    print('saved to {}'.format(trials_dir + save_string))

def load_trials():
    """Load the most recently written trials pickle from ../pkl/trials/FFNN/.

    Returns
    -------
    object
        The unpickled contents of the newest file in the directory.

    Raises
    ------
    FileNotFoundError
        If the directory is missing or contains no files.
    """
    trials_dir = "../pkl/trials/FFNN/*"
    list_of_files = glob.glob(trials_dir)
    if not list_of_files:
        raise FileNotFoundError('no saved trials files match {}'.format(trials_dir))

    # Rank by modification time: getctime is the metadata-change time on Unix and
    # the creation time on Windows, so it does not reliably identify the last save.
    latest_file = max(list_of_files, key=os.path.getmtime)
    print('loading {}'.format(latest_file))

    # SECURITY: unpickling can execute arbitrary code -- only load files this project wrote.
    with open(latest_file, "rb") as f:
        return pkl.load(f)

In [444]:
# Persist the best configuration returned by fmin, then reload the newest
# saved file so it is displayed as the cell output
save_params(best)
load_params()

loading ../pkl/params/FFNN\FFNN_best_params-2019-01-21 12-43 PM


{'alpha': 0.005660851233300683,
 'hidden_layers': 1.0,
 'lr': 0.001994170783323986,
 'num_nodes': 59.0}

In [445]:
# Persist the full search history, then reload the newest saved file so it is
# displayed as the cell output
save_trials(trials)
load_trials()

saved to ../pkl/trials/FFNN/FFNN_trials-2019-01-21 12-44 PM
loading ../pkl/trials/FFNN\FFNN_trials-2019-01-21 12-44 PM


<hyperopt.base.Trials at 0xb992c240>

Trials so far were conducted with only r2 in the scoring dict, so I added the other metrics and then re-ran what was supposedly the best configuration. It achieved an r^2 of .71 above, but only .65 below. This is not very consistent; we may need to increase the number of folds in CV or the number of epochs. I am also considering setting the number of epochs much higher and using early stopping to find the appropriate number.

In [456]:
# Re-evaluate the best configuration found by the search with the full scoring dict.
# A later cell reads `final_model_results`, so this call must run on a fresh kernel.
final_model_results = objective(trials.best_trial['result']['params'])

hidden_layers: 1
num_nodes: 59
alpha: 0.005660851233300683
lr: 0.001994170783323986
R^2: 0.655077183043743




In [466]:
# Record the mean of each CV metric, rounded to two decimals, in the 'FFNN' row
ffnn_cv_results = final_model_results['cv_results']
for score_name in scoring:
    fold_scores = ffnn_cv_results['test_' + score_name]
    overall_results.loc['FFNN', score_name] = np.round(np.mean(fold_scores), 2)
overall_results

Unnamed: 0,r2,SRC,PCC,MI,MAE
Dummy Mean,-0.02,0.0,-0.0,-0.0,1.94
Dummy Median All,-0.32,0.0,-0.0,-0.0,1.68
Dummy Median Nonzero,-0.08,0.0,-0.0,-0.0,1.77
"Perfect Clasif., Mean Regr.",0.13,0.73,0.41,0.53,1.53
Lasso,0.45,0.61,0.7,3.07,1.23
Bounded Lasso,0.55,0.64,0.75,2.87,1.08
Bounded Lasso + LogReg,0.64,0.8,0.82,2.66,0.86
FFNN,0.66,0.7,0.83,3.58,0.96


In [467]:
# Write the updated comparison table back to the CSV it was loaded from
overall_results.to_csv('../reports/model_results.csv')