In [67]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import datetime

import random
from collections import deque
from sklearn import preprocessing


def date(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    Parameters
    ----------
    x : str
        Date text in the form ``'%Y-%m-%d'`` (e.g. ``'2018-06-14'``).

    Returns
    -------
    datetime.datetime
        The parsed date, with the time component at midnight.

    Raises
    ------
    ValueError
        If ``x`` does not match the expected format.
    """
    day_format = '%Y-%m-%d'
    parsed = datetime.datetime.strptime(x, day_format)
    return parsed


def train_test_split(df, train_split=0.8):
    """Split a DataFrame chronologically into train and test arrays.

    The rows are ordered by ``prediction_day`` (oldest first), a ``time_int``
    column holding the timestamp as integer nanoseconds is appended as the
    last column, and the first ``train_split`` fraction of rows becomes the
    training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``prediction_day`` column of ``YYYY-MM-DD`` strings
        (already-parsed datetimes are accepted too). The frame is NOT
        modified; all work happens on a copy.
    train_split : float, default 0.8
        Fraction of rows (after sorting) assigned to the training set.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``train.values`` and ``test.values``; the columns are those of ``df``
        followed by ``time_int``.
    """
    # Work on a copy: the caller's frame stays untouched and re-running the
    # cell does not fail on an already-converted date column.
    ordered = df.copy()
    ordered['prediction_day'] = pd.to_datetime(ordered['prediction_day'], format='%Y-%m-%d')

    # sort_values returns a new frame; its result has to be kept, otherwise the
    # split below is made on whatever order the rows arrived in.
    # mergesort is stable, so rows sharing a date keep their relative order.
    ordered = ordered.sort_values(by='prediction_day', ascending=True, kind='mergesort')

    # Timestamp.value is the integer nanosecond representation of the date.
    ordered['time_int'] = ordered['prediction_day'].apply(lambda x: x.value)

    # Split the data first and then do the rest
    train_split_point = int(train_split * ordered.shape[0])
    train = ordered.iloc[:train_split_point]
    test = ordered.iloc[train_split_point:]

    return train.values, test.values

## Change this for RNN
def train_validation_split(df, val_split=0.8):
    """Randomly split a 2-D array into shuffled train and validation arrays.

    Parameters
    ----------
    df : array-like, 2-D
        Rows are observations. The input is not modified.
    val_split : float, default 0.8
        Fraction of rows that goes to the TRAIN part (the remainder becomes
        the validation part), despite the parameter's name.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``train`` and ``val``, each with its rows in random order.
    """
    data = np.asarray(df)
    obs = data.shape[0]
    train_split_point = int(val_split * obs)

    # random.shuffle swaps elements with `x[i], x[j] = x[j], x[i]`; on a 2-D
    # ndarray those elements are row *views*, so rows get duplicated and lost.
    # Shuffle a list of row indices instead and gather the rows with it, which
    # also produces a copy and leaves the caller's array intact.
    order = list(range(obs))
    random.shuffle(order)
    shuffled = data[order]

    # Split the data first and then do the rest
    train = shuffled[:train_split_point, :]
    val = shuffled[train_split_point:, :]

    # In-place row shuffles of the two parts (views of the local copy only).
    np.random.shuffle(train)
    np.random.shuffle(val)

    return train, val


# Make a function which is given the train, validation and test sets and does all the processing
# that depends on the training data, so that it is done correctly regardless of the CV type.

def sequencer(df, seq_len=5):
    """Turn row-wise data into overlapping sliding windows for an RNN.

    For each row, the values in columns ``1:-2`` are pushed into a window that
    keeps the most recent ``seq_len`` rows. Whenever the window is full, it is
    emitted together with the current row's second-to-last value as target.

    Parameters
    ----------
    df : iterable of indexable rows (e.g. a 2-D numpy array)
    seq_len : int, default 5
        Number of consecutive rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` holding the stacked windows and ``y`` holding one target per
        window. Both are empty arrays when fewer than ``seq_len`` rows exist.
    """
    window = deque(maxlen=seq_len)  # oldest row drops out automatically
    windows = []
    targets = []

    for row in df:
        window.append(list(row[1:-2]))
        if len(window) != seq_len:
            # Not enough history yet to form a full sequence.
            continue
        windows.append(np.array(window))
        targets.append(row[-2])

    return np.array(windows), np.array(targets)


def processing_cv(train, validation, test, seq=False, seq_length=5, fg=(False, 0)):
    """Scale (and optionally sequence) train/validation/test arrays without leakage.

    The scaler is fitted on the training rows only and then applied to all
    three splits, so no statistics from validation or test data leak into
    training.

    Column layout relied on by the slicing below: column 0 is skipped, columns
    ``1:-2`` are the features, column ``-2`` is the target and the last column
    is the key each split is sorted by.

    Parameters
    ----------
    train, validation, test : numpy.ndarray, 2-D
        The inputs are not modified.
    seq : bool, default False
        When True, return sliding-window sequences built by ``sequencer``.
    seq_length : int, default 5
        Window length used when ``seq`` is True.
    fg : tuple ``(flag, col_num)``, default ``(False, 0)``
        When ``flag`` is truthy, only columns ``1:col_num`` are standardised;
        otherwise all feature columns ``1:-2`` are. The returned features are
        columns ``1:-2`` in both cases.

    Returns
    -------
    tuple of six float numpy.ndarray
        ``x_train, y_train, x_val, y_val, x_test, y_test``.
    """
    # Must sort again just in case: order every split by its last column.
    # Fancy indexing returns new arrays, so the assignments below never touch
    # the caller's data.
    train_scaled = train[np.argsort(train[:, -1])]
    validation_scaled = validation[np.argsort(validation[:, -1])]
    test_scaled = test[np.argsort(test[:, -1])]

    # Columns to standardise; fg[1] is only read when fg[0] is set.
    scaled_cols = slice(1, fg[1]) if fg[0] else slice(1, -2)

    # Fit on training rows only, then apply the same transform everywhere.
    scaler = StandardScaler().fit(train_scaled[:, scaled_cols])
    for split in (train_scaled, validation_scaled, test_scaled):
        split[:, scaled_cols] = scaler.transform(split[:, scaled_cols])

    if seq:
        # sequence for rnn
        x_train, y_train = sequencer(train_scaled, seq_len=seq_length)
        x_val, y_val = sequencer(validation_scaled, seq_len=seq_length)
        x_test, y_test = sequencer(test_scaled, seq_len=seq_length)

        # The builtin float replaces the np.float alias, which was removed
        # in NumPy 1.24.
        return (
            x_train.astype(float), y_train.astype(float),
            x_val.astype(float), y_val.astype(float),
            x_test.astype(float), y_test.astype(float),
        )

    return (
        train_scaled[:, 1:-2].astype(float), train_scaled[:, -2].astype(float),
        validation_scaled[:, 1:-2].astype(float), validation_scaled[:, -2].astype(float),
        test_scaled[:, 1:-2].astype(float), test_scaled[:, -2].astype(float),
    )


def processing_test(train, test, seq=False, seq_length=5):
    """Scale (and optionally sequence) train/test arrays without leakage.

    The scaler is fitted on the training rows only and then applied to both
    splits, so no statistics from the test data leak into training.

    Column layout relied on by the slicing below: column 0 is skipped, columns
    ``1:-2`` are the features, column ``-2`` is the target and the last column
    is the key each split is sorted by.

    Parameters
    ----------
    train, test : numpy.ndarray, 2-D
        The inputs are not modified.
    seq : bool, default False
        When True, return sliding-window sequences built by ``sequencer``.
    seq_length : int, default 5
        Window length used when ``seq`` is True.

    Returns
    -------
    tuple of four float numpy.ndarray
        ``x_train, y_train, x_test, y_test``.
    """
    # Must sort again just in case: order both splits by their last column.
    # Fancy indexing returns new arrays, so the caller's data is left alone.
    train_scaled = train[np.argsort(train[:, -1])]
    test_scaled = test[np.argsort(test[:, -1])]

    # Fit on training rows only, then apply the same transform to both splits.
    scaler = StandardScaler().fit(train_scaled[:, 1:-2])
    train_scaled[:, 1:-2] = scaler.transform(train_scaled[:, 1:-2])
    test_scaled[:, 1:-2] = scaler.transform(test_scaled[:, 1:-2])

    if seq:
        # sequence for rnn
        x_train, y_train = sequencer(train_scaled, seq_len=seq_length)
        x_test, y_test = sequencer(test_scaled, seq_len=seq_length)

        # Cast to float here as well, matching processing_cv, so both code
        # paths hand numeric (not object-dtype) arrays to the model.
        return (
            x_train.astype(float), y_train.astype(float),
            x_test.astype(float), y_test.astype(float),
        )

    # The builtin float replaces the np.float alias, which was removed in
    # NumPy 1.24.
    return (
        train_scaled[:, 1:-2].astype(float), train_scaled[:, -2].astype(float),
        test_scaled[:, 1:-2].astype(float), test_scaled[:, -2].astype(float),
    )

In [68]:
import pandas as pd
import numpy as np

# Load the cleaned dataset from a path relative to the notebook's directory.
data = pd.read_csv('../../Data/Cleaned/Prior_FG_Present.csv')
# Keep `data` as the untouched original and work on a separate copy.
df = data.copy()

In [69]:
# Keep only the first 200 rows. The explicit copy makes `df` an independent
# frame, so assigning columns to it later does not raise SettingWithCopyWarning.
df = df.iloc[:200, :].copy()

In [70]:
# Preview the first rows of the truncated frame.
df.head()

Unnamed: 0,prediction_day,CNBC,IBDinvestors,MarketWatch,SJosephBurns,SPDJIndices,VPatelFX,benzinga,bespokeinvest,breakoutstocks,...,x352,x353,x354,x355,x356,x357,x358,x359,x360,y
0,2018-06-14,1.762757,0.107444,-2.92354,-0.388096,1.332755,0.54277,-0.136163,0.812831,-0.087926,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
1,2018-06-15,2.39204,0.107444,-0.816843,-0.504146,0.443929,0.54277,-0.136163,2.989066,-0.087926,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
2,2018-06-18,-2.080261,-0.237079,0.387654,1.019944,0.923091,0.54277,-1.511802,2.989066,-0.087926,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
3,2018-06-19,-1.75151,0.92511,-5.348703,-0.049525,0.923091,0.54277,-1.511802,-0.665083,-0.087926,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
4,2018-06-20,2.663378,0.92511,1.62198,-0.049525,-0.059474,0.54277,-1.511802,1.379355,-0.087926,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0


In [71]:
# Split with the default train_split=0.8; both results are numpy arrays (`.values`).
train, test = train_test_split(df)

In [72]:
# (rows, columns) of the training split.
train.shape

(160, 382)

In [77]:
# Trial run of processing_cv: `test` is passed as both the validation and the
# test split, with sequences of length 3 and default (all-feature) scaling.
x1, x2, x3, x4, x5, x6 = processing_cv(train, test, test, seq=True, seq_length=3, fg = (False, 0))

In [78]:
# Shape of the sequenced training features returned first by processing_cv.
x1.shape

(158, 3, 379)

In [82]:
## Make sure they're sorted (ascending by the last column, `time_int`)
train = train[np.argsort(train[:, -1])]

from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)

# Keep the processed arrays of every fold instead of silently overwriting them.
# After the loop, x_train / y_train / x_val / y_val still refer to the LAST
# fold, which is what the model cells below train on.
cv_folds = []
for train_index, test_index in tscv.split(train):
    # Raw fold rows get their own names so they are not confused with the
    # processed tensors returned by processing_cv.
    fold_train, fold_val = train[train_index], train[test_index]

    # The processed test outputs are not needed here. `_1` / `_2` are avoided
    # because IPython uses those names for its output cache.
    x_train, y_train, x_val, y_val, _x_test, _y_test = processing_cv(
        fold_train, fold_val, test, seq=True
    )
    cv_folds.append((x_train, y_train, x_val, y_val))

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None)


In [83]:
# Shape of the sequenced training features left over from the last CV fold.
x_train.shape

(116, 5, 379)

In [85]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

In [96]:
# Three stacked 128-unit LSTM layers, each followed by dropout and batch
# normalisation, then a small dense head ending in a single sigmoid unit.
model = Sequential([
    # The first two LSTM layers return the full sequence so the next LSTM
    # receives one vector per timestep.
    LSTM(128, input_shape=(x_train.shape[1:]), return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),  # normalizes activation outputs, same reason you want to normalize your input data.

    LSTM(128, return_sequences=True),
    Dropout(0.1),
    BatchNormalization(),

    # The last LSTM returns only its final output.
    LSTM(128),
    Dropout(0.2),
    BatchNormalization(),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid'),
])

In [97]:
# `lr=` is a deprecated alias and `decay=` is rejected by the current Keras
# optimizers. The same time-based decay, lr_t = 0.001 / (1 + 1e-6 * step), is
# expressed as a learning-rate schedule instead.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=1,
    decay_rate=1e-6,
)
opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Compile model
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

In [98]:
# Train for 50 epochs in mini-batches of 32, evaluating on (x_val, y_val) after
# each epoch. x_train / y_train / x_val / y_val are whatever the CV loop above
# assigned last, i.e. the final TimeSeriesSplit fold.
history = model.fit(
    x_train, y_train,
    batch_size=32,
    epochs=50,
    validation_data=(x_val, y_val),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
