In [29]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
technical_indicators_scaled = pd.read_csv('/content/drive/My Drive/technical_indicators_scaled.csv', index_col=0)
ticker_data = pd.read_csv('/content/drive/My Drive/ticker_data.csv', index_col=0)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
# Define a function to label the data based on the fixed number of hours (d)
def label_data_fixed_days(row_index, d, close_prices, binary=True):
    if row_index + d >= len(close_prices):
        return None
    current_price = close_prices.iloc[row_index]
    future_price = close_prices.iloc[row_index + d]
    if binary:
        return 1 if future_price > current_price else 0
    else:
        return 1 if future_price > current_price else -1


In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report

d_values = list(range(1, 15))  # Test different values of d
best_d, best_model, best_accuracy, best_report = None, None, 0, None

for d in d_values:
    # Label the data using the current value of d
    technical_indicators_scaled['Label'] = 0
    for idx in range(len(technical_indicators_scaled)):
        binary_label = label_data_fixed_days(idx, d, ticker_data['Close'], binary=True)
        if binary_label is not None:
            technical_indicators_scaled.loc[technical_indicators_scaled.index[idx], 'Label'] = binary_label

    y = technical_indicators_scaled['Label'].values
    X = technical_indicators_scaled.drop(columns=['Label'])

    # Remove the last d rows
    X = X.iloc[:-d]
    y = y[:-d]

    # Reshape the input data into a 3D array as required by LSTM
    n_features = X.shape[1]
    X = X.to_numpy().reshape(-1, 1, n_features)

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(units=150, activation='tanh', input_shape=(1, n_features)))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    tscv = TimeSeriesSplit(n_splits=5)
    accuracy_scores = []
    y_true_all = []
    y_pred_all = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the model
        model.fit(X_train, y_train, epochs=200, batch_size=64, verbose=0)

        # Evaluate the model
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)

        y_true_all.extend(y_test)
        y_pred_all.extend(y_pred.flatten())

    # Calculate the average accuracy
    average_accuracy = np.mean(accuracy_scores)

    if average_accuracy > best_accuracy:
        best_d = d
        best_accuracy = average_accuracy
        best_model = model
        best_report = classification_report(y_true_all, y_pred_all)

    print(f"d = {d}, Average accuracy: {average_accuracy}")

print(f"Best d value: {best_d}, with average accuracy: {best_accuracy}")
print("\nClassification report:")
print(best_report)


d = 1, Average accuracy: 0.5060532687651332
d = 2, Average accuracy: 0.5234866828087167
d = 3, Average accuracy: 0.5406779661016948
d = 4, Average accuracy: 0.5454545454545455
d = 5, Average accuracy: 0.5507878787878788
d = 6, Average accuracy: 0.5578181818181818
d = 7, Average accuracy: 0.5658181818181818
d = 8, Average accuracy: 0.5612121212121212
d = 9, Average accuracy: 0.5735757575757575
d = 10, Average accuracy: 0.5771844660194174
d = 11, Average accuracy: 0.5764563106796116
d = 12, Average accuracy: 0.5968446601941748
d = 13, Average accuracy: 0.5910194174757282
d = 14, Average accuracy: 0.6157766990291262
Best d value: 14, with average accuracy: 0.6157766990291262

Classification report:
              precision    recall  f1-score   support

           0       0.59      0.62      0.60      1929
           1       0.65      0.62      0.63      2191

    accuracy                           0.62      4120
   macro avg       0.62      0.62      0.62      4120
weighted avg       0.62

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
