In [1]:
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import figure
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import train_test_split

References:
1. https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/

In [2]:
# Settings:
pd.set_option('display.width', 190)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('max_colwidth', 200)
pd.options.display.float_format = '{:.4f}'.format
plt.style.use('default')
np.set_printoptions(threshold = 30, edgeitems = 30, precision = 2, suppress = False)


In [3]:
def split_sequences(Xs, ys, n_steps):
    X, y = list(), list()
    for i in range(len(ys)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(ys):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = Xs[i: end_ix], ys[end_ix - 1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y).squeeze()

In [4]:
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from skorch import NeuralNetBinaryClassifier


class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()

        self.lstm = nn.LSTM(input_size=20, hidden_size=10, num_layers=1, batch_first=True)
        self.dense = nn.Linear(10, 1)

    def forward(self, X, **kwargs):
        output, hidden = self.lstm(X)
        X = self.dense(output[:, -1, :])
        return X

def get_model():
    model = NeuralNetBinaryClassifier(
        MyModule,
        optimizer=Adam,
        max_epochs=50,
        lr=3e-4,
        batch_size=16,
        iterator_train__shuffle=True,
    )
    return model

In [5]:
df_path = "../merged_data/features_USRECD.csv"
features = ["BCI", "BCIp", "BCIg", 'IE_SP_Comp', 'IE_SP_Dividend', 'IE_SP_Earnings', 'IE_Consumer_CPI', 'IE_Long_Interest', 'IE_Real_Price', 'IE_Real_Dividend', 'IE_Return_Price', 'IE_Real_Earnings',
                'IE_Scaled_Earnings', 'IE_Monthly_Returns', 'IE_Real_Returns', "YC_10_Year", "YC_3_Month", "YC_3_Month_Bond", "YC_Spread", "YC_Rec_Prob"]


In [6]:
# Read the data and do a little bit of wrangling:
df = pd.read_csv(df_path)
df.Date = pd.to_datetime(df.Date)
df = df.set_index("Date", drop=True)
df = df.drop(columns="Unnamed: 0")
df.head()

Unnamed: 0_level_0,BCI,BCIp,BCIg,USRECD,IE_SP_Comp,IE_SP_Dividend,IE_SP_Earnings,IE_Consumer_CPI,IE_Long_Interest,IE_Real_Price,IE_Real_Dividend,IE_Return_Price,IE_Real_Earnings,IE_Scaled_Earnings,IE_Monthly_Returns,IE_Real_Returns,YC_10_Year,YC_3_Month,YC_3_Month_Bond,YC_Spread,YC_Rec_Prob
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1967-02-09,4.6052,6.587,3.4751,0,4.4362,1.0578,1.7084,3.4935,1.5217,6.5522,3.1739,11.4502,3.8238,8.7218,0.0,2.5153,1.5217,1.5518,1.5776,1.2065,-1.1432
1967-02-16,4.6052,6.5863,3.4751,0,4.4362,1.0578,1.7084,3.4935,1.5217,6.5522,3.1739,11.4502,3.8238,8.7218,0.0,2.5153,1.5217,1.5518,1.5776,1.2065,-1.1432
1967-02-23,4.6012,6.5774,3.4751,0,4.4362,1.0578,1.7084,3.4935,1.5217,6.5522,3.1739,11.4502,3.8238,8.7218,0.0,2.5153,1.5217,1.5518,1.5776,1.2065,-1.1432
1967-03-02,4.6032,6.582,3.4751,0,4.47,1.0613,1.7011,3.4935,1.5326,6.586,3.1772,11.4869,3.8177,8.7185,0.01,2.5153,1.5326,1.5173,1.5427,1.2692,-1.2586
1967-03-09,4.6042,6.5852,3.4751,0,4.47,1.0613,1.7011,3.4935,1.5326,6.586,3.1772,11.4869,3.8177,8.7185,0.01,2.5153,1.5326,1.5173,1.5427,1.2692,-1.2586


In [7]:
# Split into training and test sets and hold out the test set until the end, so that it remains "unseen".
lag_of_y = 21 # This is the lag we introduce to the target variable so that we assess the indicator's 
              # ability to predict the target variable this many steps into the future.
              # With BCI, a lag of 21 data points corresponds to about half a year.
        
X_train, y_train = df.iloc[:-lag_of_y, df.columns != "USRECD"], df.iloc[lag_of_y:, df.columns == "USRECD"]

In [8]:
# Do a time series cross-validation on the test set by splitting it to k folds and doing a "rolling"
# validation against a validation fold, then averaging out the metrics.
splits = 4 # This is the number of splits/folds in the rolling validation.
tscv = TimeSeriesSplit(n_splits=splits)

for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29 ... 529 530 531 532 533
 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
 552 553 554 555 556 557 558] TEST: [ 559  560  561  562  563  564  565  566  567  568  569  570  571  572
  573  574  575  576  577  578  579  580  581  582  583  584  585  586
  587  588 ... 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
 1110 1111 1112 1113]
TRAIN: [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29 ... 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
 1110 1111 1112 1113] TEST: [1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
 1128 1129 1130 1131 1132 1133 113

## Validation

In [9]:
AUC_ROCs = dict()
ACCs = dict()
model_name = "LSTM"
print(model_name)
AUC_ROCs[model_name] = 0
ACCs[model_name] = 0
for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
    X_train_fold, X_validation_fold = X_train.iloc[train_index[:-lag_of_y], X_train.columns != "USRECD"], \
        X_train.iloc[test_index[:-lag_of_y], X_train.columns != "USRECD"]
    y_train_fold, y_validation_fold = y_train.iloc[train_index[lag_of_y:], y_train.columns == "USRECD"], \
        y_train.iloc[test_index[lag_of_y:], y_train.columns == "USRECD"]

    scalers = dict()
    for feature in features:
        scalers[feature] = StandardScaler()
        scalers[feature].fit(X_train_fold[[feature]])
        X_train_fold[feature] = scalers[feature].transform(X_train_fold[[feature]])
        X_validation_fold[feature] = scalers[feature].transform(X_validation_fold[[feature]])

    X_train_fold, y_train_fold = split_sequences(X_train_fold.to_numpy(), y_train_fold.to_numpy(), n_steps=10)
    X_train_fold = X_train_fold.astype(np.float32)
    y_train_fold = y_train_fold.astype(np.float32)
    X_validation_fold, y_validation_fold = split_sequences(X_validation_fold.to_numpy(), y_validation_fold.to_numpy(), n_steps=10)
    X_validation_fold = X_validation_fold.astype(np.float32)
    y_validation_fold = y_validation_fold.astype(np.float32)
    model = get_model()
    model.fit(X_train_fold, y_train_fold)
    positive_probs = [p[1] for p in model.predict_proba(X_validation_fold)]
    AUC_ROC = metrics.roc_auc_score(y_validation_fold, positive_probs)
    AUC_ROCs[model_name] += AUC_ROC
    predictions = model.predict(X_validation_fold)
    ACC = accuracy_score(y_validation_fold, predictions)
    ACCs[model_name] += ACC
    print(AUC_ROC, ACC)

AUC_ROCs[model_name] /= splits
ACCs[model_name] /= splits

LSTM
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.7090[0m       [32m0.2075[0m        [35m0.7411[0m  0.0318
      2        [36m0.6799[0m       [32m0.2453[0m        [35m0.7228[0m  0.0296
      3        [36m0.6510[0m       [32m0.4340[0m        [35m0.7033[0m  0.0293
      4        [36m0.6212[0m       [32m0.5660[0m        [35m0.6899[0m  0.0286
      5        [36m0.5918[0m       [32m0.6038[0m        [35m0.6708[0m  0.0289
      6        [36m0.5622[0m       [32m0.6604[0m        [35m0.6552[0m  0.0296
      7        [36m0.5336[0m       [32m0.6981[0m        [35m0.6373[0m  0.0294
      8        [36m0.5051[0m       [32m0.7358[0m        [35m0.6205[0m  0.0290
      9        [36m0.4777[0m       [32m0.7736[0m        [35m0.6032[0m  0.0296
     10        [36m0.4514[0m       0.7642        [35m0.5867[0m  0.0290
     11        [36m0.4264[0m       [32m0.8019[

In [10]:
print(model_name)
print(f"AUC ROC: {AUC_ROCs[model_name]}")
print(f"accuracy: {ACCs[model_name]}")

LSTM
AUC ROC: 0.6595442604426236
accuracy: 0.7976190476190476
