In [14]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error 
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
%matplotlib inline

In [2]:
#Testing potential models

LSTM model using Uber Reddit comment sentiment data plus closing prices

In [3]:
# Read the UBER sentiment CSV, using its first column as the row index.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
uber_csv_path = '/Users/krist/OneDrive/Desktop/P2G2-ProjectGit/Notebooks/models/resources/UBER_sent_copy.csv'
uber_sent_df = pd.read_csv(uber_csv_path, index_col=0)

# Preview the first rows
uber_sent_df.head()

Unnamed: 0,sec_compound_sentiment,sec_positive_sentiment,sec_neutral sentiment,sec_negative_sentiment,stockmarket_compound_sentiment,stockmarket_positive_sentiment,stockmarket_neutral_sentiment,stockmarket_negative_sentiment,volume,close
2019-05-10,1.0,0.111,0.846,0.042,0.06023,0.085886,0.841045,0.073045,189836990.0,41.57
2019-05-11,1.0,0.111,0.846,0.042,-0.114408,0.024583,0.885333,0.09,189836990.0,41.57
2019-05-12,1.0,0.111,0.846,0.042,0.069736,0.098,0.844818,0.057091,189836990.0,41.57
2019-05-13,1.0,0.111,0.846,0.042,0.006767,0.069667,0.877,0.053333,79478203.0,37.1
2019-05-14,1.0,0.111,0.846,0.042,0.006767,0.069667,0.877,0.053333,46661187.0,39.96


In [6]:
# Columns kept for modelling, in the order they will appear in the arrays
features = [
    'sec_compound_sentiment',
    'sec_positive_sentiment',
    'sec_neutral sentiment',
    'sec_negative_sentiment',
    'stockmarket_compound_sentiment',
    'stockmarket_positive_sentiment',
    'stockmarket_neutral_sentiment',
    'stockmarket_negative_sentiment',
    'volume',
    'close',
]

# Work from a copy so the loaded frame is left untouched
train_df = uber_sent_df.copy()
data = pd.DataFrame(train_df)
data_filtered = data.loc[:, features]

# Extended frame: the selected columns plus a 'Prediction' column that is
# seeded with the close price as a placeholder ahead of scaling
data_filtered_ext = data_filtered.assign(Prediction=data_filtered['close'])

# Show the last rows of the extended frame
data_filtered_ext.tail()

Unnamed: 0,sec_compound_sentiment,sec_positive_sentiment,sec_neutral sentiment,sec_negative_sentiment,stockmarket_compound_sentiment,stockmarket_positive_sentiment,stockmarket_neutral_sentiment,stockmarket_negative_sentiment,volume,close,Prediction
2022-05-27,1.0,0.126,0.833,0.04,0.3098,0.051,0.859,0.09,29620958.0,23.67,23.67
2022-05-28,1.0,0.126,0.833,0.04,0.2944,0.131,0.803,0.066,29620958.0,23.67,23.67
2022-05-29,1.0,0.126,0.833,0.04,0.0,0.0,1.0,0.0,29620958.0,23.67,23.67
2022-05-31,1.0,0.126,0.833,0.04,0.0,0.0,1.0,0.0,33048952.0,23.2,23.2
2022-06-01,1.0,0.126,0.833,0.04,0.0,0.0,1.0,0.0,26285752.0,22.94,22.94


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Row count of the selected feature table
nrows = data_filtered.shape[0]

# Feature table as a NumPy array; the (rows, columns) reshape is only used
# to report the shape below
np_data_unscaled = np.array(data_filtered)
np_data = np_data_unscaled.reshape(nrows, -1)
print(np_data.shape)

# Rescale every feature column to the range 0..1
# NOTE(review): the scaler is fitted on all rows, including the rows that are
# later split off as test data.
scaler = MinMaxScaler()
np_data_scaled = scaler.fit_transform(np_data_unscaled)

# Second scaler fitted on the single 'close' column, kept separately for
# scaling predictions
scaler_pred = MinMaxScaler()
df_Close = data_filtered_ext[['close']]
np_Close_scaled = scaler_pred.fit_transform(df_Close)

(1118, 10)


In [10]:
import math

# Sequence length: the number of past time steps used to make a single prediction
sequence_length = 50

# Column position of the prediction target ('close').
# It is looked up on data_filtered because np_data_scaled is built from that
# frame; looking it up on the unfiltered frame only gives the right position
# while `features` happens to list every column in the original order.
index_Close = data_filtered.columns.get_loc("close")

# Split the data into train and test data sets.
# The model is trained on the first 80% of the rows (rounded up).
train_data_len = math.ceil(np_data_scaled.shape[0] * 0.8)

# Create the training and test data.
# The test slice starts sequence_length rows before the split point so that the
# first test target has a full input window; the start is clamped at 0 so a
# short data set cannot turn it into a negative (from-the-end) index.
test_start = max(train_data_len - sequence_length, 0)
train_data = np_data_scaled[0:train_data_len, :]
test_data = np_data_scaled[test_start:, :]

# The RNN needs data with the format of [samples, time steps, features].
# Here, we create N samples, sequence_length time steps per sample, and one
# feature per column of the input array.
def partition_dataset(sequence_length, data, target_index=None):
    """Cut a 2-D array into sliding input windows and single-step targets.

    For every row i >= sequence_length, the sample is the preceding
    sequence_length rows (all columns) and the target is the value at row i
    in the target column.

    Parameters
    ----------
    sequence_length : int
        Number of consecutive rows in each input window.
    data : array-like of shape (rows, columns)
        Values to partition.
    target_index : int, optional
        Column position of the prediction target. When omitted, the
        notebook-level ``index_Close`` is used, which keeps existing
        two-argument calls working.

    Returns
    -------
    x : numpy.ndarray of shape (samples, sequence_length, columns)
    y : numpy.ndarray of shape (samples,)
        ``samples`` is ``rows - sequence_length``; both arrays are empty (but
        correctly shaped) when there are not enough rows for one window.
    """
    if target_index is None:
        # Fall back to the global defined alongside the train/test split
        target_index = index_Close

    data = np.asarray(data)
    data_len = data.shape[0]

    # Too few rows for even one window: return empty arrays that still carry
    # the (samples, time steps, features) layout instead of a shapeless (0,)
    if data_len <= sequence_length:
        empty_x = np.empty((0, sequence_length) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_x, empty_y

    # Each window holds the sequence_length rows immediately before row i
    x = np.array([data[i - sequence_length:i, :]
                  for i in range(sequence_length, data_len)])
    # Targets are the value right after each window (single-step prediction)
    y = data[sequence_length:, target_index].copy()
    return x, y

# Build the windowed inputs and targets for the training and test slices
x_train, y_train = partition_dataset(sequence_length, train_data)
x_test, y_test = partition_dataset(sequence_length, test_data)

# Report the shapes: inputs are (rows, training_sequence, features),
# targets are (prediction value, )
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(split_x.shape, split_y.shape)

# Sanity check that inputs and targets line up: the last close value in the
# second input window should equal the first prediction value
print(x_train[1, sequence_length - 1, index_Close])
print(y_train[0])

(845, 50, 10) (845,)
(223, 50, 10) (223,)
0.652605459057072
0.652605459057072


In [12]:
# Input window dimensions, read from the training tensor
timesteps, n_features = x_train.shape[1], x_train.shape[2]

# Width of both LSTM layers: time steps multiplied by variables per step
n_neurons = timesteps * n_features
print(n_neurons, timesteps, n_features)

# Network: two stacked LSTM layers (the first returns the full sequence so the
# second can consume it), then two Dense layers ending in a single output
model = Sequential()
model.add(LSTM(n_neurons, return_sequences=True, input_shape=(timesteps, n_features)))
model.add(LSTM(n_neurons, return_sequences=False))
model.add(Dense(5))
model.add(Dense(1))

# Compile with the Adam optimizer and mean-squared-error loss
model.compile(optimizer='adam', loss='mse')

500 50 10


In [16]:
# Training the model
epochs = 10
# NOTE(review): batch_size=1 means one gradient update per training sample; the
# recorded run of this cell was interrupted during the first epoch, so a
# larger batch size may be worth trying.
batch_size = 1

# Stop training once the training loss has not improved for 5 epochs
early_stop = EarlyStopping(monitor='loss', patience=5, verbose=1)

# The callback must be passed to fit() to have any effect; previously it was
# created but never used, so early stopping never ran.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(x_test, y_test),
                    callbacks=[early_stop]
                   )


Epoch 1/10

KeyboardInterrupt: 