In [1]:
#python
#Written by Jude Saloum

from gettext import npgettext #might be superfluous
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import datasets, layers, models
import pandas as pd
import numpy as np
from keras.layers import LSTM, Dense
#from keras.layers import Sequential
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, Normalizer
from Preprocessing_tools import NDScaler
import matplotlib.pyplot as plt
split_percent = 0.75


2023-02-02 20:21:25.931667: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-02 20:21:26.138801: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /app/lib
2023-02-02 20:21:26.138821: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-02 20:21:26.960426: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No s

In [2]:
def comparison_preproccess(junk) -> list:
    if type(junk) == type(1.0): return np.asarray(junk)
    '''
    the input is supposed to look like "[1,2,3,4]" which is a string
    the function takes this and outputs [1,2,3,4] which is an array of floats

    to do this the function first removes the first and last character, effectively removing the brackets ("[1,2,3,4]" -> "1,2,3,4")
    nan as well as inf appear in the data, we convert it to a value we want it to be (nan = 0, and inf = 99)
    so we replace all values of "nan" and "inf in the string
    it then splits the string by the commas ("1,2,3,4" -> ["1", "2", "3", "4"]  )
    finally, it converts each string element to it's numerical equivalent as a float
    '''
    arr_without_brackets = junk[1:-1]  #removes first and last character
    arr_without_brackets = arr_without_brackets.replace("nan", "0")
    arr_without_brackets = arr_without_brackets.replace('inf', '99')
    arr = arr_without_brackets.split(",") #splits by commas
    for i,e  in enumerate(arr): #converts to numerical equivalent and saves to the array
        try:
            arr[i] = float(e)
        except:
            raise ValueError(f'Could not convert value {e} to float')
    
    return np.asarray(arr)

In [3]:
def get_numpy(csv_name, label, size):
    '''
    Since arrays are stored inside the pandas dataframe cells, to_numpy() is not going to work well
    Instead, this function iterates through the cells, takes them as string items, then converts them into a numpy array
    '''
    df = pd.read_csv(csv_name, usecols=[
        label], sep='\t') #read csv with usecols parameter

    arr = np.ones((size,),dtype=float) #this is to make an array with shape (1,16) so that concatenation can occur

    for i in range(len(df)):
        
        ind = str(df.loc[i].item()) #take the object out of the cell and take as item
        
        #remove the brackets as np.fromstring() does not work with brackets
        ind = ind.replace('[','')
        ind = ind.replace(']','')
        ind = ind.replace(', ', '|')
        
        arr = np.column_stack((arr, np.fromstring(ind, dtype=float, sep='|'))) #concatenate
    arr = arr.swapaxes(0,1)
    arr = np.delete(arr, 0, 0) #delete first index

    return arr

In [4]:
#function to create LSTM
def create_LSTM():
    inputs = layers.Input(shape=(7,22))
    layer1 = LSTM(128, return_sequences=True, activation="relu")(inputs)
    layer2 = LSTM(128, return_sequences=True, activation="relu")(layer1)
    layer3 = LSTM(128, return_sequences=True, activation="relu")(layer2)
    layer4 = LSTM(128, return_sequences=True, activation="relu")(layer3)
    layer5 = tf.keras.layers.Bidirectional(LSTM(658, activation = 'relu', return_sequences=True))(layer4)

    outputs = layers.Dense(1)(layer5)

    return keras.Model(inputs=inputs, outputs=outputs)

In [5]:
'''
the keras.layers.LSTM() object takes inputs in the form of 3D tensor with shape [batch, timesteps, feature].
Thus, the features (statistics and various types of prices etc.) must be reshaped to be in the third axis/dimension
of the tensor, and append the timesteps (the days D1-D7) to the second axis/dimension
Firstly, the prices will be appended to the sentiment features in the features axis, which will now be in the second axis,
then, the tensor will be reshaped to be 3D, before the second and third axis is swapped
this process will occur for the rest of the days before being appended as timesteps
'''
dataset = get_numpy("IDIDIT.csv", "Lead Paragraph D1", 16) #start with first timestep of sentiment statistics
dataset = np.concatenate((dataset, get_numpy("IDIDIT.csv", "Prices D1", 6)), axis=1) #concatenate prices to sentiment
dataset = dataset.reshape(-1,22,1) #reshape to be 3D
dataset = dataset.swapaxes(1, 2) #swap axis for tensor to be in the correct LSTM format
#print(dataset.shape)
#print(get_numpy("IDIDIT.csv", "Prices D1", 6).shape)

#append rest of timesteps
for i in range(2,8):
    col = get_numpy("IDIDIT.csv", "Lead Paragraph D" + str(i), 16)
    col = np.concatenate((col, get_numpy("IDIDIT.csv", "Prices D" + str(i), 6)), axis=1)
    col = col.reshape(-1,22,1)
    col = col.swapaxes(1, 2)

    dataset = np.concatenate((dataset, col), axis=1)

print(dataset.shape)

(354, 7, 22)


In [6]:
#start preprocessing
norm = NDScaler() #class to normalize data
le = LabelEncoder()

norm.fit(dataset)
normalized_dataset = norm.transform(dataset)

comparison_df = pd.read_csv('IDIDIT.csv', sep = '\t')
comparison_df = comparison_df['average of the next 4'].to_numpy()

comparison_norm = NDScaler()
labels = comparison_norm.fit_transform(comparison_df.reshape(-1, 1))

In [7]:
#split dataset into testing and training (75/25)
split_data = int(len(normalized_dataset) * split_percent)
train_X = normalized_dataset[:split_data]
test_X = normalized_dataset[split_data:]

split_labels = int(len(labels) * split_percent)
train_Y = labels[:split_labels]
test_Y = labels[split_labels:]

In [8]:
#create model, start fitting and training it
model = create_LSTM()
model.compile(loss='mse' ,optimizer='adam', metrics='accuracy')
print(model.summary())
model.fit(train_X, train_Y, epochs=500)
model.evaluate(test_X, test_Y)

2023-02-02 20:21:29.107515: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-02 20:21:29.107666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /app/lib
2023-02-02 20:21:29.107705: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /app/lib
2023-02-02 20:21:29.107737: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file o

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 7, 22)]           0         
                                                                 
 lstm (LSTM)                 (None, 7, 128)            77312     
                                                                 
 lstm_1 (LSTM)               (None, 7, 128)            131584    
                                                                 
 lstm_2 (LSTM)               (None, 7, 128)            131584    
                                                                 
 lstm_3 (LSTM)               (None, 7, 128)            131584    
                                                                 
 bidirectional (Bidirectiona  (None, 1316)             4142768   
 l)                                                              
                                                             

[0.03050409071147442, 0.01123595517128706]

In [9]:
print(test_X[5])

[[3.44543965e-01 3.92111671e-01 0.00000000e+00 9.84722488e-01
  1.69219659e-38 3.42199908e-31 1.97981314e-39 1.93448447e-01
  1.93554435e-01 1.93342459e-01 5.57850753e-01 5.57850753e-01
  5.45112323e-01 7.21769070e-01 6.51908798e-01 8.34780325e-01
  1.76825266e-01 1.67245168e-01 1.72435677e-01 1.85356272e-01
  1.88635124e-01 3.07469952e-01]
 [8.85638357e-01 7.79707975e-01 1.32980539e-02 9.85193649e-01
  6.62675316e-36 2.36781515e-27 2.57495042e-34 3.71839939e-01
  4.04297598e-01 3.39382280e-01 7.03867038e-01 7.03867038e-01
  9.74216269e-01 8.91880118e-01 8.82559336e-01 8.95927841e-01
  1.56802830e-01 1.71447140e-01 1.87175999e-01 1.85496087e-01
  1.88821351e-01 2.27672443e-01]
 [8.10422687e-01 6.63585391e-01 1.00000000e+00 9.83281225e-01
  2.70797540e-37 2.30917749e-29 5.86264618e-37 4.25226573e-01
  4.25226573e-01 4.25226573e-01 7.76129411e-01 7.76129411e-01
  9.45817038e-01 8.23994372e-01 8.83396796e-01 9.06914891e-01
  1.85433130e-01 1.79823478e-01 1.72112441e-01 1.81535204e-01
  1.

In [10]:
predictions = model.predict(test_X)
predictions[:,6,0]



IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

In [None]:
test_X.shape[0]

89

In [None]:
evaluation_df = pd.read_csv('IDIDIT.csv', sep = '\t')
evaluation_df['average of the next 4'] = evaluation_df['average of the next 4'].map(comparison_preproccess)
evaluation_df['Prices D7'] = evaluation_df['Prices D7'].map(comparison_preproccess)
last_day = evaluation_df['Prices D7']

lcol = 'Prices D7'
a = 'average of the next 4'

lcol = evaluation_df[lcol]
a  = evaluation_df[a]

n = len(evaluation_df['comparison']) - 1
for i in range(n):
    av = a.loc[i] 
    l = lcol.loc[i][0]
    c = av > l
    print(f'the comparison of {av} > {l} is {c}')
    evaluation_df['comparison'].loc[i] = c
    last_day.loc[i] = l

last_day = last_day.to_numpy()
last_day = last_day[split_labels:]
print(last_day.shape)

the comparison of 316.8274917602539 > 334.8299865722656 is False
the comparison of 313.8999938964844 > 325.8599853515625 is False
the comparison of 312.98499298095703 > 313.1499938964844 is False
the comparison of 312.79249572753906 > 314.1499938964844 is False
the comparison of 314.1725006103516 > 314.1499938964844 is True
the comparison of 315.75250244140625 > 314.1499938964844 is True
the comparison of 314.4425048828125 > 309.489990234375 is True
the comparison of 312.1600036621094 > 313.3800048828125 is False
the comparison of 308.3050003051758 > 319.6700134277344 is False
the comparison of 304.25 > 320.4700012207031 is False
the comparison of 304.2050018310547 > 304.25 is False
the comparison of 304.7150039672852 > 304.25 is True
the comparison of 305.92000579833984 > 304.25 is True
the comparison of 305.5300064086914 > 304.25 is True
the comparison of 305.1850051879883 > 304.07000732421875 is True
the comparison of 304.2850036621094 > 306.2900085449219 is False
the comparison of 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_df['comparison'].loc[i] = c
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_day.loc[i] = l


In [None]:
last_day = last_day/1000
evaluations = []
for i in range(last_day.shape[0]):
    if((predictions[i] > last_day[i]).any()):
        evaluations.append(True)
    else:
        evaluations.append(False)

In [None]:
evaluation_df = evaluation_df['comparison'].to_numpy()
evaluation_df = list(evaluation_df)

In [None]:
print(predictions)
print(last_day)

[[[ 0.21540558]
  [ 0.2015701 ]
  [ 0.2074227 ]
  [ 0.21104312]
  [ 0.21113986]
  [ 0.21224985]
  [ 0.21278426]]

 [[ 0.23799849]
  [ 0.25231457]
  [ 0.24012932]
  [ 0.23169255]
  [ 0.22656786]
  [ 0.22497803]
  [ 0.22361034]]

 [[ 0.31401494]
  [ 0.25935182]
  [ 0.22906035]
  [ 0.22387731]
  [ 0.22813496]
  [ 0.23015505]
  [ 0.23032802]]

 [[ 0.18357942]
  [ 0.1754615 ]
  [ 0.18223047]
  [ 0.19238219]
  [ 0.19995603]
  [ 0.20318931]
  [ 0.20328376]]

 [[ 0.2528281 ]
  [ 0.2391387 ]
  [ 0.2347542 ]
  [ 0.22945231]
  [ 0.22676373]
  [ 0.22479045]
  [ 0.22250906]]

 [[ 0.26247403]
  [ 0.25641477]
  [ 0.22958282]
  [ 0.2200568 ]
  [ 0.21615988]
  [ 0.21445578]
  [ 0.21330804]]

 [[ 0.26090184]
  [ 0.21374142]
  [ 0.20088845]
  [ 0.19911969]
  [ 0.19982779]
  [ 0.20146975]
  [ 0.20261574]]

 [[ 0.21378875]
  [ 0.2111615 ]
  [ 0.20823258]
  [ 0.2062617 ]
  [ 0.20598537]
  [ 0.20758742]
  [ 0.21024   ]]

 [[ 0.20726505]
  [ 0.20046946]
  [ 0.1964787 ]
  [ 0.19402537]
  [ 0.19538385]
  [ 0.19

In [None]:
number_correct = 0
for i in range(len(evaluations)):
    if(evaluations[i] == evaluation_df[i]):
        number_correct += 1

acc = number_correct / len(evaluations)


In [None]:
print(acc)

0.651685393258427


In [None]:
model.save('/home/judesaloum/StockMarketBot/ML_Prediction')



INFO:tensorflow:Assets written to: /home/judesaloum/StockMarketBot/ML_Prediction/assets


INFO:tensorflow:Assets written to: /home/judesaloum/StockMarketBot/ML_Prediction/assets
