# Define Dynamic Parameters

In [1]:
file_no = 1 #8
my_seed = 41291

# Import Packages

In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import glob
import os
from sklearn.model_selection import  train_test_split
import tensorflow as tf
import ntpath
import kerastuner as kt


tf.random.set_seed(my_seed)
np.random.seed(my_seed)

# Import CSVs

In [3]:
ROOT_DIR = os.path.abspath(os.curdir)
path = ROOT_DIR + '\\..\\..\\Datasets\\processed\\lstm'
print(path)

csvfiles = glob.glob(path + "/*.csv")
dfs = []

for file in csvfiles:
    df = pd.read_csv(file)
    df.name = (ntpath.basename(file)).split(".",1)[0]
    df.filename = ntpath.basename(file)
    dfs.append(df)

df = dfs[file_no]
print("Stock: ")
print(df.name)
print("File: ")
print(df.filename)
df_name = df.name


C:\FYP - Luke Bezzina\Code\mlpLearning\..\..\Datasets\processed\lstm
Stock: 
AAL_3dtrend
File: 
AAL_3dtrend.csv


# Encoding Out Variable

In [4]:
price_classification = ['Positive', 'Neutral', 'Negative']

price_direction = df.PriceDirection
one_hot_dummies = pd.get_dummies(price_direction)
one_hot_dummies = one_hot_dummies.reindex\
    (columns=price_classification, fill_value=0)

df = df.drop(columns=['PriceDirection'])
df = pd.concat([df, one_hot_dummies], axis=1)

print(df)


            Date  OpenPrice  ClosePrice  Trend  VolumeTrend  Volatility  \
0     2017-01-06      46.52       45.89  0.205    -0.000756    0.330690   
1     2017-01-09      45.85       46.21  0.245    -0.103936    0.333100   
2     2017-01-10      46.01       47.08 -0.595     0.178013    0.502814   
3     2017-01-11      47.00       48.48 -1.135    -0.016378    0.935105   
4     2017-01-12      49.29       48.64 -0.780    -0.150590    0.700730   
...          ...        ...         ...    ...          ...         ...   
997   2020-12-22      15.70       16.10  0.350    -0.324481    0.287170   
998   2020-12-23      16.35       15.48  0.515    -0.006646    0.423399   
999   2020-12-24      15.52       15.89  0.105     0.164744    0.257466   
1000  2020-12-28      16.04       15.66 -0.090     0.254404    0.167796   
1001  2020-12-29      15.96       16.06 -0.085     0.009973    0.163911   

      Positive  Neutral  Negative  
0            1        0         0  
1            1        0    

# Splitting dataset in dependent and independent variables

In [5]:
#data = np.array(df.iloc[:, :].values)
data = np.array(df.iloc[:, 1:].values, dtype=np.float)

# Splitting dataset (training + testing)

In [6]:
data_train, data_test= train_test_split(data, test_size=0.1, shuffle=False, random_state=32)

print("Train")
print(data_train)
print("Test")
print(data_test)


Train
[[46.52  45.89   0.205 ...  1.     0.     0.   ]
 [45.85  46.21   0.245 ...  1.     0.     0.   ]
 [46.01  47.08  -0.595 ...  1.     0.     0.   ]
 ...
 [11.16  11.12   0.14  ...  1.     0.     0.   ]
 [11.03  11.08   0.05  ...  1.     0.     0.   ]
 [11.23  11.47  -0.175 ...  1.     0.     0.   ]]
Test
[[ 1.18500000e+01  1.25600000e+01 -7.40000000e-01 -6.79461128e-01
   6.26329697e-01  1.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.30000000e+01  1.30400000e+01 -7.85000000e-01 -7.92057052e-01
   6.56878139e-01  1.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.28600000e+01  1.30300000e+01 -2.35000000e-01  5.19874883e-01
   2.23954360e-01  1.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.33000000e+01  1.40000000e+01 -4.80000000e-01  4.29902145e-02
   4.54923681e-01  0.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 1.50000000e+01  1.37300000e+01 -3.50000000e-01 -2.52050971e-01
   4.08765078e-01  0.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 1.40000000e+01  1.354

# Split Dataset in LSTM sequence

In [7]:
# split a multivariate sequence into samples
def split_sequences(dataset, n):
	x, y = list(), list()

	for i in range(len(dataset)):
		# find the end of this pattern
		end_ix = i + n
		# last row of dataset considered should allow n number of rows to allocate sequence
		if end_ix > len(dataset):
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = dataset[i:end_ix, :-3], dataset[end_ix-1, -3:]
		x.append(seq_x)
		y.append(seq_y)
	return np.array(x), np.array(y)

no_steps = 10
x_train, y_train = split_sequences(data_train, no_steps)
x_test, y_test = split_sequences(data_test, no_steps)

no_features = x_train.shape[2]
print(x_train.shape)

(892, 10, 5)


# Building LSTM model

In [8]:
# define model
lstm = tf.keras.models.Sequential()
lstm.add(tf.keras.layers.LSTM(units=100, activation='relu', input_shape=(no_steps, no_features)))
lstm.add(tf.keras.layers.Dense(3, activation='sigmoid'))

optimizer = tf.keras.optimizers.Adam()

lstm.compile(optimizer = optimizer, loss='binary_crossentropy',metrics = ['accuracy'])
lstm.fit(x_train, y_train, batch_size=32, epochs=150, shuffle=True)

Train on 892 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150


<tensorflow.python.keras.callbacks.History at 0x1723c807cc8>

# Predicting Test Set Results

In [9]:
y_pred = lstm.predict(x_test)
# boolean result of whether predicted probability is larger than 0.5
# result > 0.5 = 1, result <= 0.5 = 0
y_pred_binary = []
for i in y_pred:
    max = 0
    elem = 0
    for iter, x in enumerate(i):
        if x > max:
            max = x
            elem = iter
    rec = [False, False, False]
    rec[elem] = True
    y_pred_binary.append(rec)

y_pred_binary = np.array(y_pred_binary)

# Obtaining Confusion Matrix and Accuracy Score for predictions

In [10]:
# %%capture cap

# To evaluate accuracy, a vector of labels is needed
# np.argmax(x, axis=1) - to output correct labels
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

# Confusion Matrix: Columns = 0, 1 (Predicted Label)
# Confusion Matrix: Rows = 0, 1 (Actual Label)
cm = multilabel_confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred_binary, axis=1))
print(cm)

print()
print("Accuracy Score: ")
print(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred_binary, axis=1)))

[[[28 15]
  [25 24]]

 [[87  0]
  [ 5  0]]

 [[27 27]
  [12 26]]]

Accuracy Score: 
0.5434782608695652


# Output Results

In [11]:
#with open(ROOT_DIR+'\\classification_results\\'+df_name + '_' + str(my_seed) +'.txt', 'w') as f:
#    f.write(cap.stdout)


