In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

# Use Liberation Sans as the font family of every figure drawn from here on.
# rcParams is matplotlib's global configuration, so this applies notebook-wide
# rather than to a single figure.
plt.rcParams["font.family"] = "Liberation Sans"

# Mean Squared Error for Permutation Feature Importance
def MeSqEr(y, y_hat):
    """Return the mean squared error between observed and predicted values.

    Parameters
    ----------
    y : array-like
        Observed values (NumPy array, pandas Series, list, ...).
    y_hat : array-like
        Predicted values, holding as many elements as ``y``.

    Returns
    -------
    float
        Sum of the squared differences divided by the length of ``y``
        along its first axis.

    Raises
    ------
    ValueError
        If ``y`` and ``y_hat`` do not hold the same number of elements.
    """
    # Work on plain float arrays:
    #  * values are paired by position, so a pandas Series whose index was
    #    shuffled (e.g. by a train/test split) is handled correctly;
    #  * squared errors are never truncated when ``y`` has an integer dtype.
    observed = np.asarray(y, dtype=float)
    predicted = np.asarray(y_hat, dtype=float)

    if predicted.shape != observed.shape:
        if predicted.size != observed.size:
            raise ValueError(
                "y and y_hat must hold the same number of elements "
                "(got shapes %s and %s)" % (observed.shape, predicted.shape)
            )
        # Same data, different layout (e.g. (n,) vs (n, 1)): align explicitly
        # so that broadcasting cannot silently build an n-by-n matrix.
        predicted = predicted.reshape(observed.shape)

    n = observed.shape[0]
    return np.sum((observed - predicted) ** 2) / n

In [28]:
# Read the dataset; every column from the fourth one onwards is used as a feature
db = pd.read_csv('db.csv')
labels = list(db.columns[3:])

# Hold out 25% of the rows as Test-Set (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    db[labels],
    db['y'],
    test_size=.25,
    random_state=42,
)

# Per-feature mean and SD are estimated on the Training-Set only ...
MU = X_train.mean()
SIGMA = X_train.std()

# ... and the same two vectors standardise both sets, so nothing computed
# from the Test-Set enters the scaling
X_train_n = (X_train - MU) / SIGMA
X_test_n = (X_test - MU) / SIGMA

print('Initial Dataset: %.0f jumps, %.0f features' % (len(db), len(labels)))
print('Train-Set: %.0f jumps' % (len(X_train_n)))
print('Test-Set: %.0f jumps' % (len(X_test_n)))

Initial Dataset: 172 jumps, 26 features
Train-Set: 129 jumps
Test-Set: 43 jumps


In [29]:
# Lasso
from sklearn.linear_model import Lasso
from beautifultable import BeautifulTable
import scipy.stats as stats

# Fit the Lasso on the standardised Training-Set
lasso_model = Lasso(alpha=.1).fit(X_train_n, y_train)
coefs_lasso = list(np.round(lasso_model.coef_, 2))

# One row per feature, holding its coefficient rounded to 3 decimals
lasso_betas = pd.DataFrame({"Beta": np.round(lasso_model.coef_, 3)}, index=labels)

# Keep the features whose rounded coefficient is not zero and remember their names
lasso_betas = lasso_betas.loc[lasso_betas["Beta"] != 0]
labels_lasso = lasso_betas.index.tolist()

lasso_betas

Unnamed: 0,Beta
h,2.937
b,0.568
e,2.558
F,1.276
G,1.147
H,0.332
i,-0.665
J,1.053
l,-0.175
q,0.603


In [30]:
# Restrict both standardised sets to the columns kept by the Lasso
X_train_new = X_train_n.loc[:, labels_lasso]
X_test_new = X_test_n.loc[:, labels_lasso]

In [10]:
# Grid Search with 5-fold cross-validation
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
def mlp_model_new(X, Y, random_state=None):
    """Tune an MLP regressor by grid search, then cross-validate the winner.

    A 5-fold grid search (scored by negative mean absolute error) explores
    the activation function, the solver and the width of a single hidden
    layer (1 to 16 units). A fresh model built with the best combination is
    then evaluated with a second 5-fold cross-validation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    Y : array-like of shape (n_samples,)
        Training target.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to every ``MLPRegressor``. The default ``None`` keeps
        the previous, non-reproducible behaviour; pass an integer to make
        repeated runs comparable.

    Returns
    -------
    dict
        The dictionary produced by ``cross_validate`` for the selected
        configuration: timings, the estimator fitted on each fold, and the
        train/test values of the ``abs_error``, ``squared_error`` and ``r2``
        metrics.
    """
    # Settings shared by every grid candidate and by the final model; keeping
    # them in one place prevents the two from drifting apart.
    fixed_params = {
        "max_iter": 20000,
        "n_iter_no_change": 10,
        "random_state": random_state,
    }

    param_grid = {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        # A single hidden layer with 1, 2, ..., 16 units
        'hidden_layer_sizes': [(units,) for units in range(1, 17)],
    }

    gsc = GridSearchCV(
        MLPRegressor(**fixed_params),
        param_grid,
        cv=5,
        scoring='neg_mean_absolute_error',
        verbose=0,
        n_jobs=-1,
        # Only the best parameters are needed below; skipping the refit on the
        # full data avoids training a model that would be discarded.
        refit=False,
    )
    best_params = gsc.fit(X, Y).best_params_

    # Unfitted model carrying the winning hyper-parameters
    best_mlp = MLPRegressor(**best_params, **fixed_params)

    scoring = {
        'abs_error': 'neg_mean_absolute_error',
        'squared_error': 'neg_mean_squared_error',
        'r2': 'r2',
    }

    scores = cross_validate(
        best_mlp, X, Y,
        cv=5,
        scoring=scoring,
        return_train_score=True,
        return_estimator=True,
    )
    return scores

In [None]:
# Run the grid search, then cross-validate the selected configuration
scores_new = mlp_model_new(X=X_train_new, Y=y_train)
scores_new  # Display the cross-validation results of the selected model

In [34]:
# Load the pickled model stored in MLP_Final.sav
import pickle
filename = 'MLP_Final.sav'
# The context manager closes the file handle as soon as unpickling is done
# (the bare open() call left it open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code -- only load a file
# that comes from a trusted source.
with open(filename, 'rb') as model_file:
    NN = pickle.load(model_file)
print(NN)

# Use it for predicting from unseen data (Test-Set, Lasso-selected features)
y_hat = NN.predict(X_test_new)

MLPRegressor(activation='logistic', hidden_layer_sizes=(11,), max_iter=20000,
             solver='sgd')


In [41]:
# MLP -- agreement between observed and predicted values on the Test-Set
mlp_diff = y_test - y_hat        # signed prediction errors
mlp_abs_diff = np.abs(mlp_diff)  # their magnitudes

# Mean error (bias) and the band of +/- 1.96 SD around it.
# b, lb and ub are constant arrays of length 100; only their first element is
# printed in this cell (presumably they are reused for plotting -- verify).
sd = np.std(mlp_diff)
b = np.ones(100) * np.mean(mlp_diff)
lb = b - 1.96 * sd
ub = b + 1.96 * sd

print("# -- MLP -- #\nBias: %.0f -- LB = %.0f -- UB = %.0f" % (b[0], lb[0], ub[0]))
print("MAE +/- SD = %.0f +/- %.0f" % (np.mean(mlp_abs_diff), np.std(mlp_abs_diff)))
# "Accuracy" is the root mean squared error, "Precision" the SD of the errors
print("Accuracy = %.0f" % np.sqrt((mlp_diff ** 2).mean()).round())
print("Precision = %.0f" % sd.round())
# Rank correlation between the mean of each observed/predicted pair and the
# size of the corresponding error
tau_new, p_val_new = stats.kendalltau(.5 * (y_test + y_hat), mlp_abs_diff)
print("Kendall's tau = %.2f (p = %.3f)" % (tau_new, p_val_new))

# -- MLP -- #
Bias: 0 -- LB = -8 -- UB = 8
MAE +/- SD = 3 +/- 3
Accuracy = 4
Precision = 4
Kendall's tau = -0.02 (p = 0.842)
