In [None]:
#### Preamble ####
# Purpose: Model the data to predict BTC/USTD's moving direction
# Author: Jiazhou(Justin) Bi
# Date: 14 Nov 2024
# Contact: justin.bi@mail.utoronto.ca
# License: None
# Pre-requisites: see requirements.txt
# Any other information needed? None

# Loading the Test Data

In [3]:
import pandas as pd
df = pd.read_parquet('../data/01-raw_data/raw_data.parquet')

# RNN

In [4]:

# importing necessary packages
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
import matplotlib.pyplot as plt

In [5]:
# creating a copy of the original data to be used by RNN
df_RNN = df.drop(columns=['timestamp',])

# defining the target variable
df_RNN ['target'] = df_RNN['close'].shift(-1)
df_RNN.dropna(inplace=True) # dropping the last row as it is missing the target variable

# defining features and target
X = df_RNN[['close', 'open', 'high', 'low', 'volume']]
y = df_RNN['target']

# Scale the features and target
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_RNN)

# Create sequences for RNN (60 rows at the moment)
def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i + sequence_length, :-1])  # Use all columns except the last one (target)
        y.append(data[i + sequence_length, -1])     # The target column is the last one
    return np.array(X), np.array(y)
sequence_length = 60
X, y = create_sequences(df_scaled, sequence_length)

# Split data into training and test sets
split_ratio = 0.7
split_index = int(len(X) * split_ratio)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# defining the model
model = Sequential()
model.add(SimpleRNN(128, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(SimpleRNN(64, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')


  super().__init__(**kwargs)


In [6]:
# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

Epoch 1/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m753s[0m 9ms/step - loss: 2.7066e-04 - val_loss: 0.0022
Epoch 2/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m726s[0m 9ms/step - loss: 1.1640e-05 - val_loss: 0.0064
Epoch 3/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m724s[0m 9ms/step - loss: 6.6797e-06 - val_loss: 0.0061
Epoch 4/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m726s[0m 9ms/step - loss: 6.3167e-06 - val_loss: 0.0064
Epoch 5/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m729s[0m 9ms/step - loss: 5.9226e-06 - val_loss: 0.0081
Epoch 6/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m731s[0m 9ms/step - loss: 5.7484e-06 - val_loss: 0.0075
Epoch 7/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m732s[0m 9ms/step - loss: 5.9153e-06 - val_loss: 0.0062
Epoch 8/50
[1m83183/83183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m733s[0m 9ms/ste

In [None]:
# Evaluate the model
test_loss = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")

# Make predictions
predictions = model.predict(X_test)

dummy_column = np.zeros((X_test.shape[0], 1))
X_test_last_step = np.concatenate([X_test[:, -1, :-1], dummy_column], axis=1)

# Reconstruct predictions for inverse scaling
predicted_prices = scaler.inverse_transform(
    np.concatenate([X_test_last_step[:, :-1], predictions], axis=1)
)[:, -1]

actual_prices = scaler.inverse_transform(
    np.concatenate([X_test_last_step[:, :-1], y_test.reshape(-1, 1)], axis=1)
)[:, -1]

# plt.figure(figsize=(14, 5))
# plt.plot(actual_prices, color='blue', label='Actual BTC Price')
# plt.plot(predicted_prices, color='red', label='Predicted BTC Price')
# plt.xlabel('Time')
# plt.ylabel('BTC Price')
# plt.title('Actual vs Predicted BTC Price')
# plt.legend()
# plt.show()


[1m35650/35650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 3ms/step - loss: 0.0062
Test Loss: 0.005694008432328701
[1m35650/35650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 3ms/step


ValueError: operands could not be broadcast together with shapes (1140785,5) (6,) (1140785,5) 