# Define Dynamic Parameters

In [155]:
file_no = 0
my_seed = 41291
lstm_units = 100
no_epochs = 150
shuffle_bool = True

# Import Packages

In [156]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import glob
import os
from sklearn.model_selection import  train_test_split
import tensorflow as tf
import ntpath
import kerastuner as kt
import csv 


tf.random.set_seed(my_seed)
np.random.seed(my_seed)

# Import CSVs

In [157]:
ROOT_DIR = os.path.abspath(os.curdir)
path = ROOT_DIR + '\\..\\..\\Datasets\\processed\\lstm'
print(path)

csvfiles = glob.glob(path + "/*.csv")
dfs = []

for file in csvfiles:
    df = pd.read_csv(file)
    df.name = (ntpath.basename(file)).split(".",1)[0]
    df.filename = ntpath.basename(file)
    dfs.append(df)

df = dfs[file_no]
print("Stock: ")
print(df.name)
print("File: ")
print(df.filename)
df_name = df.name


C:\FYP - Luke Bezzina\Code\mlpLearning\..\..\Datasets\processed\lstm
Stock: 
PKG_3dtrend_google_trd
File: 
PKG_3dtrend_google_trd.csv


# Encoding Out Variable

In [158]:
price_classification = ['Positive', 'Neutral', 'Negative']

price_direction = df.PriceDirection
one_hot_dummies = pd.get_dummies(price_direction)
one_hot_dummies = one_hot_dummies.reindex\
    (columns=price_classification, fill_value=0)

df = df.drop(columns=['PriceDirection'])
df = pd.concat([df, one_hot_dummies], axis=1)

print(df)


            Date  OpenPrice  ClosePrice  Trend  VolumeTrend  Volatility  \
0     06/01/2017      86.37       85.33 -0.165     0.564738    0.583800   
1     09/01/2017      85.37       87.20 -0.415     0.072317    0.765027   
2     10/01/2017      86.90       87.77 -1.220    -0.258524    1.042188   
3     11/01/2017      87.70       87.98 -0.390    -0.072888    0.329545   
4     12/01/2017      88.06       88.52 -0.375     0.334642    0.315911   
...          ...        ...         ...    ...          ...         ...   
1000  22/12/2020     133.32      133.77  0.940     0.043060    0.770757   
1001  23/12/2020     133.62      133.81  0.525     0.575929    0.504667   
1002  24/12/2020     134.12      135.73 -0.980    -0.242168    0.914671   
1003  28/12/2020     136.27      136.10 -1.145     0.483467    1.003737   
1004  29/12/2020     137.94      136.78 -0.525     0.603872    0.434844   

      pandemic  covid  covid-19  coronavirus  ...  deaths  restrictions  \
0         0.54   0.00   

# Splitting dataset in dependent and independent variables

In [159]:
#data = np.array(df.iloc[:, :].values)
data = np.array(df.iloc[:, 1:].values, dtype=np.float)

# Splitting dataset (training + testing)

In [160]:
data_train, data_test= train_test_split(data, test_size=0.1, shuffle=False, random_state=32)

print("Train")
print(data_train)
print("Test")
print(data_test)


Train
[[ 8.637e+01  8.533e+01 -1.650e-01 ...  1.000e+00  0.000e+00  0.000e+00]
 [ 8.537e+01  8.720e+01 -4.150e-01 ...  1.000e+00  0.000e+00  0.000e+00]
 [ 8.690e+01  8.777e+01 -1.220e+00 ...  0.000e+00  1.000e+00  0.000e+00]
 ...
 [ 9.629e+01  9.586e+01  4.500e-02 ...  0.000e+00  1.000e+00  0.000e+00]
 [ 9.625e+01  9.501e+01  1.800e-01 ...  1.000e+00  0.000e+00  0.000e+00]
 [ 9.217e+01  9.547e+01  1.950e-01 ...  1.000e+00  0.000e+00  0.000e+00]]
Test
[[ 95.69   97.56   -1.275 ...   1.      0.      0.   ]
 [ 99.44  100.     -2.265 ...   0.      0.      1.   ]
 [100.62   99.4    -0.92  ...   0.      1.      0.   ]
 ...
 [134.12  135.73   -0.98  ...   1.      0.      0.   ]
 [136.27  136.1    -1.145 ...   1.      0.      0.   ]
 [137.94  136.78   -0.525 ...   0.      1.      0.   ]]


# Split Dataset in LSTM sequence

In [161]:
# split a multivariate sequence into samples
def split_sequences(dataset, n):
	x, y = list(), list()

	for i in range(len(dataset)):
		# find the end of this pattern
		end_ix = i + n
		# last row of dataset considered should allow n number of rows to allocate sequence
		if end_ix > len(dataset):
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = dataset[i:end_ix, :-3], dataset[end_ix-1, -3:]
		x.append(seq_x)
		y.append(seq_y)
	return np.array(x), np.array(y)

no_steps = 10
x_train, y_train = split_sequences(data_train, no_steps)
x_test, y_test = split_sequences(data_test, no_steps)

no_features = x_train.shape[2]
print(x_train.shape)

(895, 10, 20)


# Building LSTM model

In [162]:
# define model
lstm = tf.keras.models.Sequential()
lstm.add(tf.keras.layers.LSTM(units=lstm_units, activation='relu', input_shape=(no_steps, no_features)))
lstm.add(tf.keras.layers.Dense(3, activation='sigmoid'))

optimizer = tf.keras.optimizers.Adam()

lstm.compile(optimizer = optimizer, loss='binary_crossentropy',metrics = ['accuracy'])
lstm.fit(x_train, y_train, batch_size=32, epochs=no_epochs, shuffle=shuffle_bool)

Train on 895 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


<tensorflow.python.keras.callbacks.History at 0x1d331b6f988>

# Predicting Test Set Results

In [163]:
y_pred = lstm.predict(x_test)
# boolean result of whether predicted probability is larger than 0.5
# result > 0.5 = 1, result <= 0.5 = 0
y_pred_binary = []
for i in y_pred:
    max = 0
    elem = 0
    for iter, x in enumerate(i):
        if x > max:
            max = x
            elem = iter
    rec = [False, False, False]
    rec[elem] = True
    y_pred_binary.append(rec)

y_pred_binary = np.array(y_pred_binary)

# Obtaining Confusion Matrix and Accuracy Score for predictions

In [164]:
%%capture cap

# To evaluate accuracy, a vector of labels is needed
# np.argmax(x, axis=1) - to output correct labels
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

# Confusion Matrix: Columns = 0, 1 (Predicted Label)
# Confusion Matrix: Rows = 0, 1 (Actual Label)
cm = multilabel_confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred_binary, axis=1))
print(cm)

print()
print("Accuracy Score: ")
accuracy_score = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred_binary, axis=1))
print(accuracy_score)

[[[ 7 29]
  [10 46]]

 [[85  0]
  [ 7  0]]

 [[52 11]
  [23  6]]]

Accuracy Score: 
0.5652173913043478


# Output Results

In [1]:
output_file_name = df_name + '_' + str(lstm_units) + 'units_' + str(no_epochs) + 'epc_' + str(my_seed) 
data = [output_file_name, accuracy_score]

with open(ROOT_DIR+'\\lstm_results\\results_shuffle_test.csv', 'a', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(data)

with open(ROOT_DIR+'\\lstm_results\\shuffle_test\\'+ output_file_name + str(shuffle_bool) + '.txt', 'w') as f:
    f.write(cap.stdout)




NameError: name 'df_name' is not defined