In [None]:
from google.colab import drive
# Import the notebook progress bar from the public `tqdm.notebook` module; the private
# `tqdm._tqdm_notebook` path is deprecated and emits a warning on import.
from tqdm.notebook import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from os import listdir
import pandas as pd
import numpy as np
import pickle
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(7)

# Mount Google Drive; the data files are read from paths under /content/drive/.
drive.mount('/content/drive/')

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  


Mounted at /content/drive/


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM,Dropout,Dense,Input 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger 
from tensorflow.keras.models import Model, Sequential 
from tensorflow.keras import optimizers

In [None]:
# Alternative input directory, kept for reference (not used by this cell):
#indir = "/content/drive/My Drive/Data/Case 1"
# List the files in the "Case 2" data folder on the mounted Drive.
!ls "/content/drive/My Drive/Data/Case 2"

'Combined_News_S&P.csv'		      reddit_with_OOB_sentiment.csv
'Combined_News_S&P.gsheet'	      reddit_with_wordDict_sentiment.csv
 RedditNews.csv			      reddit_with_wordDict_sentiment.gsheet
 RedditNews.gsheet		     'S&P500 data with label.csv'
 reddit_with_modified_sentiment.csv


In [None]:
def build_timeseries(mat, y_col_index, time_steps=None):
    """
    Converts a 2-D dataset into the windowed, supervised format fed to the LSTM.
    Each sample uses `time_steps` consecutive rows as input and the value of column
    `y_col_index` in the row immediately after that window as the output.
    :param mat: 2-D numeric dataset (pandas DataFrame or ndarray), one row per time step
    :param y_col_index: positional index of the column which acts as output
    :param time_steps: window length; when None the module-level TIME_STEPS is used
    :return: returns two float ndarrays -- input of shape
    (rows - time_steps, time_steps, columns) and output of shape (rows - time_steps,)
    :raises ValueError: if mat is not 2-D, time_steps is negative, or mat has fewer
    rows than time_steps
    """
    if time_steps is None:
        time_steps = TIME_STEPS

    # Work on a plain float array so DataFrames and ndarrays are handled alike.
    values = np.asarray(mat, dtype=float)
    if values.ndim != 2:
        raise ValueError("mat must be 2-dimensional, got %d dimension(s)" % values.ndim)

    # total number of time-series samples would be len(mat) - time_steps
    dim_0 = values.shape[0] - time_steps
    if time_steps < 0 or dim_0 < 0:
        raise ValueError(
            "cannot build windows of %d time steps from %d rows"
            % (time_steps, values.shape[0])
        )

    # window_rows[i] holds the row numbers i .. i + time_steps - 1 of sample i, so a
    # single indexing operation gathers every window without a Python-level loop.
    window_rows = np.arange(dim_0)[:, None] + np.arange(time_steps)[None, :]
    x = values[window_rows]
    # The target of sample i is the row right after its window (row time_steps + i).
    y = values[time_steps:, y_col_index].copy()

    print("length of time-series i/o",x.shape,y.shape)
    return x, y

In [None]:
def trim_dataset(mat,batch_size):
    """
    Drops trailing rows so the row count becomes a multiple of batch_size.
    :param mat: array-like exposing .shape and supporting slicing along axis 0
    :param batch_size: batch size the row count must be divisible by
    :return: mat itself when it already divides evenly, otherwise mat without
    its last (rows % batch_size) rows
    """
    leftover = mat.shape[0] % batch_size
    # Guard clause: nothing to cut, hand back the very same object.
    if leftover <= 0:
        return mat
    return mat[:-leftover]

In [None]:
def create_model():
    """
    Builds and compiles the stacked LSTM network used for the label prediction.

    Reads module-level state, which must be defined before calling: BATCH_SIZE,
    TIME_STEPS, params["lr"] and x_t (only x_t.shape[2], the feature count per step).

    Layers: LSTM(100, stateful, return_sequences) -> Dropout(0.4) -> LSTM(60)
    -> Dropout(0.4) -> Dense(20, relu) -> Dense(1, sigmoid).

    :return: compiled Sequential model (mean-squared-error loss, RMSprop optimizer
    at learning rate params["lr"], accuracy metric)
    """
    lstm_model = Sequential()
    # (batch_size, timesteps, data_dim) -- the batch size is fixed in the input shape
    # because the first LSTM layer is stateful.
    lstm_model.add(LSTM(100, batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2]),
                        dropout=0.0, recurrent_dropout=0.0, stateful=True, return_sequences=True,
                        kernel_initializer='random_uniform'))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(LSTM(60, dropout=0.0))
    lstm_model.add(Dropout(0.4))
    lstm_model.add(Dense(20,activation='relu'))
    lstm_model.add(Dense(1,activation='sigmoid'))
    # Pass the configured optimizer object to compile(). Previously it was built and
    # then discarded in favour of the string 'adam', so params["lr"] had no effect.
    optimizer = optimizers.RMSprop(learning_rate=params["lr"])
    lstm_model.compile(loss='mean_squared_error', optimizer=optimizer, metrics = ['accuracy'])
    return lstm_model

In [None]:
# Hyper-parameters for the run, collected in one place.
params = dict(
    batch_size=20,   # author's tuning note: 20<16<10, 25 was a bust
    epochs=300,
    lr=1e-4,
    time_steps=10,
)

# Upper-case aliases read by the helper functions in this notebook.
BATCH_SIZE = params["batch_size"]
TIME_STEPS = params["time_steps"]

In [None]:
# Load the Case 1 table (S&P 500 prices with sentiment and label columns) from Drive.
wsj_sentiment_sp = pd.read_csv(
    '/content/drive/My Drive/Data/Case 1/wsj_OOBsent_s&p.csv'
)
# Last expression of the cell: preview the first rows.
wsj_sentiment_sp.head()

Unnamed: 0,Date,Open,Adj Close,Volume,avg'd sentiment,label
0,2008-08-08,1266.290039,1296.319946,4966810000,-0.04498,0
1,2008-08-11,1294.420044,1305.319946,5067310000,-0.03408,1
2,2008-08-12,1304.790039,1289.589966,4711290000,-0.06898,0
3,2008-08-13,1288.640015,1285.829956,4787600000,0.03524,0
4,2008-08-14,1282.109985,1292.930054,4064000000,-0.0362,1


Normalizing

In [None]:
# Scale the price and volume columns to [0, 1]; sentiment and label are joined unscaled.
# NOTE(review): each fit below sees every row, i.e. it happens before the train/test
# split, so the scaling ranges include the later (test) period -- confirm this is intended.
sc = MinMaxScaler(feature_range = (0, 1))

# Named `open_price` rather than `open` so the builtin open() is not shadowed for the
# rest of the notebook.
# The scaler is re-fitted per column, so afterwards `sc` holds the Volume fit only.
open_price = list(sc.fit_transform(wsj_sentiment_sp[['Open']]).flatten())
adj_close = list(sc.fit_transform(wsj_sentiment_sp[['Adj Close']]).flatten())
volume = list(sc.fit_transform(wsj_sentiment_sp[['Volume']]).flatten())

# A 0..n-1 index is set explicitly; the joins below align on the index.
scaled_data = pd.DataFrame(
{"open" : open_price,
"adjusted_close" : adj_close,
"volume" : volume},
index = list(range(len(volume))))

sent = wsj_sentiment_sp[['avg\'d sentiment']]
label = wsj_sentiment_sp[['label']]

# Resulting column order: open, adjusted_close, volume, avg'd sentiment, label.
scaled_data = scaled_data.join(sent)
scaled_data = scaled_data.join(label)

print(scaled_data)

          open  adjusted_close    volume  avg'd sentiment  label
0     0.404533        0.426180  0.377885         -0.04498      0
1     0.423919        0.432369  0.387520         -0.03408      1
2     0.431065        0.421553  0.353390         -0.06898      0
3     0.419935        0.418967  0.360705          0.03524      0
4     0.415435        0.423849  0.291337         -0.03620      1
...        ...             ...       ...              ...    ...
1983  0.981703        0.935769  0.630074         -0.02984      0
1984  0.931837        0.910417  0.422407          0.07774      0
1985  0.914760        0.934862  0.322187          0.01842      1
1986  0.939583        0.958708  0.308376          0.00318      1
1987  0.960588        0.978024  0.344909         -0.04846      1

[1988 rows x 5 columns]


In [None]:
# 80/20 split with shuffling disabled, so the original row order is preserved
# and the test frame is the final 20% of rows.
df_train, df_test = train_test_split(
    scaled_data,
    test_size=0.2,
    train_size=0.8,
    shuffle=False,
)
print("Train--Test size", len(df_train), len(df_test))

Train--Test size 1590 398


In [None]:
# Every column (features and label alike) is kept as model input; the targets are
# taken from these same frames later, when the time-series windows are built.
x_train = df_train.iloc[:, :]
x_test = df_test.iloc[:, :]
# Spot check: row 10, column 4 of the training frame.
print(x_train.iloc[10, 4])

1


In [None]:
# Window the training frame into LSTM samples; column index 4 is passed as the output column.
x_t, y_t = build_timeseries(x_train, 4)

HBox(children=(FloatProgress(value=0.0, max=1580.0), HTML(value='')))


length of time-series i/o (1580, 10, 5) (1580,)


In [None]:
# Cut both training arrays down to a whole number of batches.
x_t, y_t = (trim_dataset(arr, BATCH_SIZE) for arr in (x_t, y_t))
print("Batch trimmed size",x_t.shape, y_t.shape)

Batch trimmed size (1580, 10, 5) (1580,)


In [None]:
# Window the held-out frame. The output column must be index 4 (label), the same
# target the training windows use; index 3 is the sentiment feature, and using it
# as the target makes validation compare predictions against the wrong values.
x_temp, y_temp = build_timeseries(x_test, 4)
# Trim to a whole number of batches, then halve: first half is validation, second half is test.
x_val, x_test_t = np.split(trim_dataset(x_temp, BATCH_SIZE),2)
y_val, y_test_t = np.split(trim_dataset(y_temp, BATCH_SIZE),2)

print("Test size", x_test_t.shape, y_test_t.shape, x_val.shape, y_val.shape)

HBox(children=(FloatProgress(value=0.0, max=388.0), HTML(value='')))


length of time-series i/o (388, 10, 5) (388,)
Test size (190, 10, 5) (190,) (190, 10, 5) (190,)


In [None]:
# Build and compile the network; create_model() reads BATCH_SIZE, TIME_STEPS, params and
# x_t.shape from module scope, so the cells defining them must have run first.
model = create_model()

In [None]:
# Stop once val_loss has failed to improve by at least 0.0001 for 40 epochs in a row.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=40,
    min_delta=0.0001,
)

# Shuffling is disabled so batches are fed in their original order. The validation
# arrays are trimmed to a whole number of batches before being passed in.
history = model.fit(
    x_t,
    y_t,
    epochs=params["epochs"],
    verbose=2,
    batch_size=BATCH_SIZE,
    shuffle=False,
    validation_data=(
        trim_dataset(x_val, BATCH_SIZE),
        trim_dataset(y_val, BATCH_SIZE),
    ),
    callbacks=[es],
)

#print("saving model...")
#pickle.dump(model, open("lstm_model", "wb"))

Epoch 1/300
79/79 - 4s - loss: 0.2486 - accuracy: 0.5399 - val_loss: 0.3371 - val_accuracy: 0.0000e+00
Epoch 2/300
79/79 - 1s - loss: 0.2480 - accuracy: 0.5551 - val_loss: 0.3211 - val_accuracy: 0.0000e+00
Epoch 3/300
79/79 - 1s - loss: 0.2486 - accuracy: 0.5532 - val_loss: 0.3285 - val_accuracy: 0.0000e+00
Epoch 4/300
79/79 - 1s - loss: 0.2479 - accuracy: 0.5551 - val_loss: 0.3296 - val_accuracy: 0.0000e+00
Epoch 5/300
79/79 - 1s - loss: 0.2475 - accuracy: 0.5551 - val_loss: 0.3297 - val_accuracy: 0.0000e+00
Epoch 6/300
79/79 - 1s - loss: 0.2481 - accuracy: 0.5551 - val_loss: 0.3238 - val_accuracy: 0.0000e+00
Epoch 7/300
79/79 - 1s - loss: 0.2474 - accuracy: 0.5551 - val_loss: 0.3317 - val_accuracy: 0.0000e+00
Epoch 8/300
79/79 - 1s - loss: 0.2480 - accuracy: 0.5551 - val_loss: 0.3157 - val_accuracy: 0.0000e+00
Epoch 9/300
79/79 - 1s - loss: 0.2474 - accuracy: 0.5551 - val_loss: 0.3245 - val_accuracy: 0.0000e+00
Epoch 10/300
79/79 - 1s - loss: 0.2472 - accuracy: 0.5551 - val_loss: 0.3

KeyboardInterrupt: ignored