In [28]:
# data setup
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# import data from file
pd_data = pd.read_excel(r'augmented_macro_features.xlsx', sheet_name='features')
pd_data = pd_data.sample(frac=1)

# convert data
data = pd_data.to_numpy()
train = data[:4000]
test = data[4000:]

train_x = train[:,1:]
train_y = train[:,0]
test_x = test[:,1:]
test_y = test[:,0]


In [131]:
# evaluation metrics
from sklearn.metrics import mean_absolute_percentage_error, explained_variance_score, max_error, r2_score

print('Evaluation Metrics:')
print(' MAPE: Mean Absolute Percentage Error (1 = 100% error; lower is better)')
print(' EV: Explained Variance ([0, 1], higher is better)')
print(' MAX: Max Error (raw number, lower is better)')
print()

Evaluation Metrics:
 MAPE: Mean Absolute Percentage Error (1 = 100% error; lower is better)
 EV: Explained Variance ([0, 1], higher is better)
 MAX: Max Error (raw number, lower is better)



In [99]:
# Linear Regression
from sklearn import linear_model

# fit model
lr_model = linear_model.LinearRegression()
lr_fitted = lr_model.fit(train_x, train_y)

# test
lr_pred_y = lr_fitted.predict(test_x)

# evaluation of Linear Regression
print('LR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, lr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, lr_pred_y)))
print(' MAX: ' + str(max_error(test_y, lr_pred_y)))
print()

LR model:
 MAPE: 1.896266298819565
 EV: 0.6409134974348276
 MAX: 18.32136211018026



In [110]:
# Ridge Regression

# hyperparameters
alpha = 1

# fit model
rr_model = linear_model.Ridge(alpha)
rr_fitted = rr_model.fit(train_x, train_y)

# test
rr_pred_y = rr_fitted.predict(test_x)

# evaluation of Ridge Regression
print('RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, rr_pred_y)))
print()

RR model:
 MAPE: 1.8961814419746374
 EV: 0.6409195049114356
 MAX: 18.321767423434483



In [114]:
# Random Forests
from sklearn import ensemble

# hyperparameters
max_depth = None
max_features = 0.5

# test different numbers of trees (40 seems to be best)
for i in [10, 20, 30, 40, 50, 100, 200, 500]:
  n_estimators = i
  
  # fit model
  rf_model = ensemble.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
  rf_fitted = rf_model.fit(train_x, train_y)

  # test
  rf_pred_y = rf_fitted.predict(test_x)

  # evaluation of Random Forest
  print('RF model (' + str(i) + ' trees):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, rf_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, rf_pred_y)))
  print(' MAX: ' + str(max_error(test_y, rf_pred_y)))
  print()

RF model (10 trees):
 MAPE: 1.6327125422592712
 EV: 0.8048694442347213
 MAX: 11.724140774364358

RF model (20 trees):
 MAPE: 1.4239736035878106
 EV: 0.8183696961344338
 MAX: 12.682857824162665

RF model (30 trees):
 MAPE: 1.5012884354716327
 EV: 0.8192651563011109
 MAX: 13.248855413947586

RF model (40 trees):
 MAPE: 1.3262540978355317
 EV: 0.8239574488091255
 MAX: 12.43344689963114

RF model (50 trees):
 MAPE: 1.4105350070324334
 EV: 0.8235858617896669
 MAX: 13.59422787010687

RF model (100 trees):
 MAPE: 1.3823816406711604
 EV: 0.8237369330638293
 MAX: 12.667780336352827

RF model (200 trees):
 MAPE: 1.3715451227231785
 EV: 0.8269972947846692
 MAX: 12.918225011420716

RF model (500 trees):
 MAPE: 1.3999596462843313
 EV: 0.8330401936179711
 MAX: 12.070347174044343



In [116]:
# Random Forests
from sklearn import ensemble

# hyperparameters
max_depth = None
n_estimators = 40

# test different numbers for percentage of features used (1 seems to be best)
for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
  max_features = i
  
  # fit model
  rf_model = ensemble.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
  rf_fitted = rf_model.fit(train_x, train_y)

  # test
  rf_pred_y = rf_fitted.predict(test_x)

  # evaluation of Random Forest
  print('RF model (max_features = ' + str(i) + '):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, rf_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, rf_pred_y)))
  print(' MAX: ' + str(max_error(test_y, rf_pred_y)))
  print()

RF model (max_features = 0.1):
 MAPE: 1.2251031538204211
 EV: 0.8369634502061662
 MAX: 12.001151828014098

RF model (max_features = 0.2):
 MAPE: 1.4712881535809788
 EV: 0.8407437331687235
 MAX: 12.52597030331758

RF model (max_features = 0.3):
 MAPE: 1.4210362325244674
 EV: 0.834150172452086
 MAX: 11.681101749804464

RF model (max_features = 0.4):
 MAPE: 1.4223922053887585
 EV: 0.8261381722319581
 MAX: 12.12482985752198

RF model (max_features = 0.5):
 MAPE: 1.3943649577106298
 EV: 0.8233580616031753
 MAX: 12.765162430534138

RF model (max_features = 0.6):
 MAPE: 1.3853655027141072
 EV: 0.8216325457187601
 MAX: 13.835071074101455

RF model (max_features = 0.7):
 MAPE: 1.6143170838089753
 EV: 0.819919102840764
 MAX: 14.135876303295133

RF model (max_features = 0.8):
 MAPE: 1.4526121183007876
 EV: 0.8208258664762146
 MAX: 12.322249180430134

RF model (max_features = 0.9):
 MAPE: 1.445507330153936
 EV: 0.814145233042066
 MAX: 14.235900976437682

RF model (max_features = 1):
 MAPE: 1.17782

In [120]:
# KNN
from sklearn import neighbors

# hyperparameters
weights = "uniform" # uniform weights, could instead be "distance"

# test different values for k (6 seems to be best)
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50]:
  num_neighbors = k

  # fit model
  knn_model = neighbors.KNeighborsRegressor(num_neighbors, weights=weights)
  knn_fitted = knn_model.fit(train_x, train_y)

  # test
  knn_pred_y = knn_fitted.predict(test_x)

  # evaluation of KNN
  print('KNN model (k = ' + str(k) + '):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, knn_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, knn_pred_y)))
  print(' MAX: ' + str(max_error(test_y, knn_pred_y)))
  print()

KNN model (k = 1):
 MAPE: 2.268941076929506
 EV: 0.7213100050736468
 MAX: 14.095244500525926

KNN model (k = 2):
 MAPE: 2.1820564910895146
 EV: 0.7777787324626154
 MAX: 12.028219932032725

KNN model (k = 3):
 MAPE: 2.014521550527724
 EV: 0.7761795411187308
 MAX: 14.88917944281847

KNN model (k = 4):
 MAPE: 1.7909102469370306
 EV: 0.7829480274114182
 MAX: 12.539248514274034

KNN model (k = 5):
 MAPE: 1.7259681697541056
 EV: 0.7780030064198209
 MAX: 13.537621880658598

KNN model (k = 6):
 MAPE: 1.6279664830562153
 EV: 0.770766951099797
 MAX: 14.173002828689324

KNN model (k = 7):
 MAPE: 1.6697504751978218
 EV: 0.7632357628601817
 MAX: 14.218274380676984

KNN model (k = 8):
 MAPE: 1.6727383755956438
 EV: 0.7556380382113017
 MAX: 16.971573784235055

KNN model (k = 9):
 MAPE: 1.7422108759188544
 EV: 0.7429487715574974
 MAX: 19.03764943636862

KNN model (k = 10):
 MAPE: 1.7551306065131005
 EV: 0.7349421744507866
 MAX: 20.89573563069568

KNN model (k = 20):
 MAPE: 1.7215559563434775
 EV: 0.67

In [124]:
# SVM
from sklearn import svm

# set up hyperparameters?

# fit model
svm_model = svm.SVR() # could instead be LinearSVR (faster) or NuSVR (different formulation)
svm_fitted = svm_model.fit(train_x, train_y)

# test
svm_pred_y = svm_fitted.predict(test_x)

# evaluation of SVM
print('SVM model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, svm_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, svm_pred_y)))
print(' MAX: ' + str(max_error(test_y, svm_pred_y)))
print()

SVM model:
 MAPE: 1.7686557997000003
 EV: 0.5897110522392015
 MAX: 32.61746744901192



In [122]:
# Neural Net (Multi Layer Perceptron)
from sklearn import neural_network

# hyperparameters (there are more than these)
activation = 'relu'

# test different values for alpha (0.0004 seems to be best)
for a in [0.00005, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]:
  alpha = a

  # fit model
  mlp_model = neural_network.MLPRegressor(activation=activation, alpha=alpha, max_iter=500)
  mlp_fitted = mlp_model.fit(train_x, train_y)

  # test
  mlp_pred_y = mlp_fitted.predict(test_x)

  # evaluation of MLP
  print('MLP model (alpha = ' + str(a) + '):')
  print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, mlp_pred_y)))
  print(' EV: ' + str(explained_variance_score(test_y, mlp_pred_y)))
  print(' MAX: ' + str(max_error(test_y, mlp_pred_y)))
  print()

MLP model (alpha = 5e-05):
 MAPE: 1.8753924981103693
 EV: 0.7467594839122329
 MAX: 13.305382654108733

MLP model (alpha = 0.0001):
 MAPE: 2.107500061793368
 EV: 0.7204098013926086
 MAX: 13.398354353570205

MLP model (alpha = 0.0002):
 MAPE: 2.0041742881535685
 EV: 0.7616339312493687
 MAX: 13.138076015364769

MLP model (alpha = 0.0003):
 MAPE: 2.112643442974958
 EV: 0.7318283809552071
 MAX: 15.236207853604352

MLP model (alpha = 0.0004):
 MAPE: 1.8540247078646868
 EV: 0.7567024929577766
 MAX: 13.594260865649705

MLP model (alpha = 0.0005):
 MAPE: 1.917423276695411
 EV: 0.7494911845455903
 MAX: 13.628065623612374

MLP model (alpha = 0.001):
 MAPE: 1.9753379658325434
 EV: 0.7499589894509922
 MAX: 13.960926697904036

MLP model (alpha = 0.005):
 MAPE: 1.892523504028804
 EV: 0.7479624327906
 MAX: 13.702736874636518

MLP model (alpha = 0.01):
 MAPE: 1.8461909616915924
 EV: 0.7554357478670101
 MAX: 13.981240579706803

MLP model (alpha = 0.05):
 MAPE: 2.104739182657119
 EV: 0.7423351042779215
 

In [170]:
# hand-select the features that look linearly correlated with the target, and try linear regression on those
train_x_trimmed = train_x[:,5:8]
test_x_trimmed = test_x[:,5:8]

# fit model
trimmed_rr_model = linear_model.Ridge()
trimmed_rr_fitted = trimmed_rr_model.fit(train_x_trimmed, train_y)

# test
trimmed_rr_pred_y = trimmed_rr_fitted.predict(test_x_trimmed)

# evaluation of Ridge Regression
print('Trimmed RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, trimmed_rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, trimmed_rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, trimmed_rr_pred_y)))
print(' R2 Score: ' + str(r2_score(test_y, trimmed_rr_pred_y)))


Trimmed RR model:
 MAPE: 1.9004162697662903
 EV: 0.6209391416900756
 MAX: 18.428323586863296
 R2 Score: 0.620270205597264


In [171]:
# hand-select the features that look linearly correlated with the target, and try linear regression on those
train_x_single = train_x[:,7]
test_x_single = test_x[:,7]

# fit model
single_rr_model = linear_model.Ridge()
single_rr_fitted = single_rr_model.fit(train_x_single.reshape(-1, 1), train_y)

# test
single_rr_pred_y = single_rr_fitted.predict(test_x_single.reshape(-1, 1))

# evaluation of Ridge Regression
print('Single RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, single_rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, single_rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, single_rr_pred_y)))
print(' R2 Score: ' + str(r2_score(test_y, single_rr_pred_y)))


Single RR model:
 MAPE: 2.1816213244831744
 EV: 0.4984564031578399
 MAX: 19.50058974707359
 R2 Score: 0.49832898227672917


In [172]:
# standardized version
from sklearn import preprocessing

# hand-select the features that look linearly correlated with the target, and try linear regression on those
train_x_trimmed = train_x[:,5:8]
test_x_trimmed = test_x[:,5:8]

# standardize scale
scaler = preprocessing.StandardScaler().fit(train_x_trimmed)
train_x_trimmed_scaled = scaler.transform(train_x_trimmed)
test_x_trimmed_scaled = scaler.transform(test_x_trimmed)

# fit model
trimmed_rr_model = linear_model.Ridge()
trimmed_rr_fitted = trimmed_rr_model.fit(train_x_trimmed_scaled, train_y)

# test
trimmed_rr_pred_y = trimmed_rr_fitted.predict(test_x_trimmed_scaled)

# evaluation of Ridge Regression
print('Trimmed RR model:')
print(' MAPE: ' + str(mean_absolute_percentage_error(test_y, trimmed_rr_pred_y)))
print(' EV: ' + str(explained_variance_score(test_y, trimmed_rr_pred_y)))
print(' MAX: ' + str(max_error(test_y, trimmed_rr_pred_y)))
print(' R2 Score: ' + str(r2_score(test_y, trimmed_rr_pred_y)))

# TLDR: standardizing doesn't do much

Trimmed RR model:
 MAPE: 1.9002633313987518
 EV: 0.6209346412382502
 MAX: 18.42786144499231
 R2 Score: 0.6202661484151222
