In [177]:
# data setup
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# import data from file
pd_data = pd.read_excel(r'adjusted_macro_features.xlsx', sheet_name='features')
pd_data = pd_data.sample(frac=1)

# convert data
data = pd_data.to_numpy()
train = data[:3800]
test = data[3800:]

train_x = train[:,1:]
train_y = train[:,0]
test_x = test[:,1:]
test_y = test[:,0]


In [178]:
# evaluation metrics
from sklearn.metrics import mean_absolute_percentage_error, explained_variance_score, max_error, r2_score

print('Evaluation Metrics:')
print(' MAPE: Mean Absolute Percentage Error (1 = 100% error; lower is better)')
print(' EV: Explained Variance ([0, 1], higher is better)')
print(' MAX: Max Error (raw number, lower is better)')
print()

Evaluation Metrics:
 MAPE: Mean Absolute Percentage Error (1 = 100% error; lower is better)
 EV: Explained Variance ([0, 1], higher is better)
 MAX: Max Error (raw number, lower is better)



In [179]:
# Linear Regression
from sklearn import linear_model

# fit model
lr_model = linear_model.LinearRegression()
lr_fitted = lr_model.fit(train_x, train_y)

# test
lr_pred_y = lr_fitted.predict(test_x)

# evaluation of Linear Regression
print('LR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, lr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, lr_pred_y)))
print(' MAX: ' + str(max_error(test_y, lr_pred_y)))
print()

LR model:
 MAPE: 1.4594868015272269
 EV: 0.10600130110611061
 MAX: 31.30573005472954



In [180]:
# Ridge Regression

# hyperparameters
alpha = 1

# fit model
rr_model = linear_model.Ridge(alpha)
rr_fitted = rr_model.fit(train_x, train_y)

# test
rr_pred_y = rr_fitted.predict(test_x)

# evaluation of Ridge Regression
print('RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, rr_pred_y)))
print()

RR model:
 MAPE: 1.4594274820730626
 EV: 0.10599329507931932
 MAX: 31.30588244015008



In [181]:
# Random Forests
from sklearn import ensemble

# hyperparameters
max_depth = None
max_features = 0.5

# test different numbers of trees (50 seems to be best)
for i in [10, 20, 30, 40, 50, 100, 200, 500]:
  n_estimators = i
  
  # fit model
  rf_model = ensemble.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
  rf_fitted = rf_model.fit(train_x, train_y)

  # test
  rf_pred_y = rf_fitted.predict(test_x)

  # evaluation of Random Forest
  print('RF model (' + str(i) + ' trees):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, rf_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, rf_pred_y)))
  print(' MAX: ' + str(max_error(test_y, rf_pred_y)))
  print()

RF model (10 trees):
 MAPE: 1.093233750183711
 EV: 0.831110486369741
 MAX: 11.385057826919738

RF model (20 trees):
 MAPE: 1.0288800441310277
 EV: 0.8215288713399911
 MAX: 13.205598790030853

RF model (30 trees):
 MAPE: 0.999037181262781
 EV: 0.8357931144125668
 MAX: 11.41502223668337

RF model (40 trees):
 MAPE: 0.9713697836217793
 EV: 0.8368641672431506
 MAX: 10.874950797394758

RF model (50 trees):
 MAPE: 0.9456079580371276
 EV: 0.8500698466520283
 MAX: 10.164067101546646

RF model (100 trees):
 MAPE: 0.9429226303375307
 EV: 0.8485873473897688
 MAX: 10.072649069902909

RF model (200 trees):
 MAPE: 0.9649978358039425
 EV: 0.85122923777356
 MAX: 10.947975195135577

RF model (500 trees):
 MAPE: 0.9632976509385384
 EV: 0.8502392262844651
 MAX: 11.27604171037704



In [185]:
# Random Forests
from sklearn import ensemble

# hyperparameters
max_depth = None
n_estimators = 50

# test different numbers for percentage of features used (0.3 seems to be best)
for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
  max_features = i
  
  # fit model
  rf_model = ensemble.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
  rf_fitted = rf_model.fit(train_x, train_y)

  # test
  rf_pred_y = rf_fitted.predict(test_x)

  # evaluation of Random Forest
  print('RF model (max_features = ' + str(i) + '):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, rf_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, rf_pred_y)))
  print(' MAX: ' + str(max_error(test_y, rf_pred_y)))
  print()

RF model (max_features = 0.1):
 MAPE: 0.9622850325002541
 EV: 0.8020792063793278
 MAX: 12.695736026860448

RF model (max_features = 0.2):
 MAPE: 1.014785956217333
 EV: 0.8424864422665324
 MAX: 10.349459851749613

RF model (max_features = 0.3):
 MAPE: 0.9501127748303018
 EV: 0.8432579072243712
 MAX: 10.619128965099112

RF model (max_features = 0.4):
 MAPE: 0.9291555790532946
 EV: 0.8438624724268549
 MAX: 11.711479107251515

RF model (max_features = 0.5):
 MAPE: 0.9582262458882704
 EV: 0.8395647818354786
 MAX: 11.962955187966347

RF model (max_features = 0.6):
 MAPE: 1.0162632869758057
 EV: 0.8457546341517502
 MAX: 10.776874014579565

RF model (max_features = 0.7):
 MAPE: 1.0073038180120266
 EV: 0.8327223147205658
 MAX: 11.856425376936405

RF model (max_features = 0.8):
 MAPE: 1.0176563701735122
 EV: 0.8442774437573017
 MAX: 12.756235026342306

RF model (max_features = 0.9):
 MAPE: 0.9519455977899722
 EV: 0.8510790535252338
 MAX: 10.60227138933299

RF model (max_features = 1):
 MAPE: 0.9

In [188]:
# KNN
from sklearn import neighbors

# hyperparameters
weights = "uniform" # uniform weights, could instead be "distance"

# test different values for k (3 seems to be best)
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50]:
  num_neighbors = k

  # fit model
  knn_model = neighbors.KNeighborsRegressor(num_neighbors, weights=weights)
  knn_fitted = knn_model.fit(train_x, train_y)

  # test
  knn_pred_y = knn_fitted.predict(test_x)

  # evaluation of KNN
  print('KNN model (k = ' + str(k) + '):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, knn_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, knn_pred_y)))
  print(' MAX: ' + str(max_error(test_y, knn_pred_y)))
  print()

KNN model (k = 1):
 MAPE: 1.3757771112026875
 EV: 0.6542440299288677
 MAX: 31.3715031139679

KNN model (k = 2):
 MAPE: 1.411441119804068
 EV: 0.6483677770946704
 MAX: 19.612433425014395

KNN model (k = 3):
 MAPE: 1.3248785433819803
 EV: 0.614929523111452
 MAX: 21.665143077834514

KNN model (k = 4):
 MAPE: 1.427419079797442
 EV: 0.5728821828164918
 MAX: 20.82228785626504

KNN model (k = 5):
 MAPE: 1.3901334808675976
 EV: 0.5393783844833656
 MAX: 22.515420040075394

KNN model (k = 6):
 MAPE: 1.4149532530097506
 EV: 0.5034252094766773
 MAX: 23.559418859783815

KNN model (k = 7):
 MAPE: 1.463922153391278
 EV: 0.47575453350797836
 MAX: 23.754924704531383

KNN model (k = 8):
 MAPE: 1.4636834688624145
 EV: 0.4462155144733544
 MAX: 23.98015639748787

KNN model (k = 9):
 MAPE: 1.4926055554800466
 EV: 0.418841176411072
 MAX: 24.30633080843147

KNN model (k = 10):
 MAPE: 1.5226534598915245
 EV: 0.37940226380623987
 MAX: 23.142932218591298

KNN model (k = 20):
 MAPE: 1.5523203419268272
 EV: 0.2887

In [189]:
# SVM
from sklearn import svm

# set up hyperparameters?

# fit model
svm_model = svm.SVR() # could instead be LinearSVR (faster) or NuSVR (different formulation)
svm_fitted = svm_model.fit(train_x, train_y)

# test
svm_pred_y = svm_fitted.predict(test_x)

# evaluation of SVM
print('SVM model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, svm_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, svm_pred_y)))
print(' MAX: ' + str(max_error(test_y, svm_pred_y)))
print()

SVM model:
 MAPE: 1.6690382981428444
 EV: 0.15751589871136173
 MAX: 28.01213244677832



In [190]:
# Neural Net (Multi Layer Perceptron)
from sklearn import neural_network

# hyperparameters (there are more than these)
activation = 'relu'

# test different values for alpha (0.01 seems to be best)
for a in [0.00005, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]:
  alpha = a

  # fit model
  mlp_model = neural_network.MLPRegressor(activation=activation, alpha=alpha, max_iter=500)
  mlp_fitted = mlp_model.fit(train_x, train_y)

  # test
  mlp_pred_y = mlp_fitted.predict(test_x)

  # evaluation of MLP
  print('MLP model (alpha = ' + str(a) + '):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, mlp_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, mlp_pred_y)))
  print(' MAX: ' + str(max_error(test_y, mlp_pred_y)))
  print()

MLP model (alpha = 5e-05):
 MAPE: 1.6929751630228693
 EV: 0.4463196699460181
 MAX: 24.030114125690353

MLP model (alpha = 0.0001):
 MAPE: 2.226083769546122
 EV: 0.4168962942306772
 MAX: 25.955027339398455

MLP model (alpha = 0.0002):
 MAPE: 1.6211177417540172
 EV: 0.49076453871580283
 MAX: 21.046569312902513

MLP model (alpha = 0.0003):
 MAPE: 1.6760322995580605
 EV: 0.48382321451073584
 MAX: 22.442742042717043

MLP model (alpha = 0.0004):
 MAPE: 1.7304329783307892
 EV: 0.484889008176706
 MAX: 19.65181154358579

MLP model (alpha = 0.0005):
 MAPE: 1.7185314582060915
 EV: 0.46361091464191984
 MAX: 22.353205739175436

MLP model (alpha = 0.001):
 MAPE: 1.6190239189670677
 EV: 0.4644319516456934
 MAX: 22.036532363978395

MLP model (alpha = 0.005):
 MAPE: 1.7099712845222113
 EV: 0.4046048036767166
 MAX: 23.004188860999868

MLP model (alpha = 0.01):
 MAPE: 1.522286363200566
 EV: 0.5285079742052203
 MAX: 22.24796793167023

MLP model (alpha = 0.05):
 MAPE: 1.8530114819664911
 EV: 0.456710895611

In [191]:
# hand-select the features that look linearly correlated with the target, and try linear regression on those
train_x_trimmed = train_x[:,5:8]
test_x_trimmed = test_x[:,5:8]

# fit model
trimmed_rr_model = linear_model.Ridge()
trimmed_rr_fitted = trimmed_rr_model.fit(train_x_trimmed, train_y)

# test
trimmed_rr_pred_y = trimmed_rr_fitted.predict(test_x_trimmed)

# evaluation of Ridge Regression
print('Trimmed RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, trimmed_rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, trimmed_rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, trimmed_rr_pred_y)))
print(' R2 Score: ' + str(r2_score(test_y, trimmed_rr_pred_y)))


Trimmed RR model:
 MAPE: 1.3098610341697563
 EV: 0.004583228966891673
 MAX: 32.40608071540558
 R2 Score: 0.0019264055638528177


In [195]:
# hand-select the features that look linearly correlated with the target, and try linear regression on those
train_x_single = train_x[:,5]
test_x_single = test_x[:,5]

# fit model
single_rr_model = linear_model.Ridge()
single_rr_fitted = single_rr_model.fit(train_x_single.reshape(-1, 1), train_y)

# test
single_rr_pred_y = single_rr_fitted.predict(test_x_single.reshape(-1, 1))

# evaluation of Ridge Regression
print('Single RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, single_rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, single_rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, single_rr_pred_y)))
print(' R2 Score: ' + str(r2_score(test_y, single_rr_pred_y)))


Single RR model:
 MAPE: 1.3029707382118898
 EV: 0.0007000020963148224
 MAX: 32.29062260454004
 R2 Score: -0.002060171589892912


In [193]:
# standardized version
from sklearn import preprocessing

# hand-select the features that look linearly correlated with the target, and try linear regression on those
train_x_trimmed = train_x[:,5:8]
test_x_trimmed = test_x[:,5:8]

# standardize scale
scaler = preprocessing.StandardScaler().fit(train_x_trimmed)
train_x_trimmed_scaled = scaler.transform(train_x_trimmed)
test_x_trimmed_scaled = scaler.transform(test_x_trimmed)

# fit model
trimmed_rr_model = linear_model.Ridge()
trimmed_rr_fitted = trimmed_rr_model.fit(train_x_trimmed_scaled, train_y)

# test
trimmed_rr_pred_y = trimmed_rr_fitted.predict(test_x_trimmed_scaled)

# evaluation of Ridge Regression
print('Trimmed RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, trimmed_rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, trimmed_rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, trimmed_rr_pred_y)))
print(' R2 Score: ' + str(r2_score(test_y, trimmed_rr_pred_y)))

# TLDR: standardizing doesn't do much

Trimmed RR model:
 MAPE: 1.3098402707624255
 EV: 0.004578423243388197
 MAX: 32.40575951705523
 R2 Score: 0.0019213733089672447
