ENGINEERING CNN-LSTM BASED ON COMBINED_DATASET

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

combined_data = pd.read_csv('combined_dataset.csv')

combined_data = combined_data[::-1].reset_index(drop=True)

combined_data = combined_data.applymap(
    lambda x: float(str(x).lstrip("'")) if isinstance(x, (int, float, str)) and str(x).startswith("'") else x
)

column_to_move = 'change'
columns = [col for col in combined_data.columns if col != column_to_move] + [column_to_move]
combined_data = combined_data[columns]

combined_data['directional_change'] = combined_data['change'].apply(lambda x: x > 0)

combined_data.head()

  combined_data = combined_data.applymap(


Unnamed: 0,Date,OJ_Open,OJ_High,OJ_Low,OJ_Close,OJ_adj_close,OJ_Volume,Open,High,Low,...,Corn_adj_close,Corn_Volume,Soybean_Open,Soybean_High,Soybean_Low,Soybean_Close,Soybean_adj_close,Soybean_Volume,change,directional_change
0,2007-07-30,141.75,142.0,139.0,139.7,139.7,1836.0,112.5,113.45,111.6,...,323.5,25315.0,818.0,825.0,815.5,822.75,822.75,19018.0,1.35,True
1,2007-07-31,139.0,141.15,138.65,140.0,140.0,1746.0,112.95,114.4,112.5,...,325.75,30891.0,823.5,838.0,823.5,834.0,834.0,11921.0,-1.2,False
2,2007-08-01,140.1,141.8,139.0,139.15,139.15,2290.0,113.8,114.5,112.75,...,319.0,25569.0,834.5,842.75,823.5,825.75,825.75,10742.0,3.1,True
3,2007-08-02,140.0,141.75,139.2,141.25,141.25,2153.0,113.15,116.6,113.15,...,324.0,20120.0,827.0,835.5,826.75,831.0,831.0,5663.0,1.45,True
4,2007-08-03,140.0,142.5,135.1,142.45,142.45,2476.0,116.5,118.4,116.2,...,326.5,24814.0,833.0,841.25,833.0,839.0,839.0,4781.0,0.75,True


In [43]:
combined_data = combined_data.fillna(method="ffill").fillna(method="bfill")

features_for_pca = combined_data.drop(columns=["Date", "change", "directional_change"])

scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features_for_pca)

# Apply PCA
pca = PCA(n_components=0.95)
reduced_features = pca.fit_transform(normalized_features)

reduced_features_df = pd.DataFrame(
    reduced_features, columns=[f"PC{i+1}" for i in range(reduced_features.shape[1])]
)

# Add the Date and target column ('change') back into the new dataset
reduced_features_df["Date"] = combined_data["Date"].values
reduced_features_df["change"] = combined_data["change"].values
reduced_features_df["directional_change"] = combined_data["directional_change"].values

  combined_data = combined_data.fillna(method="ffill").fillna(method="bfill")


In [44]:
# TRUE MEANS POSITIVE CHANGE, FALSE MEANS NEGATIVE --> WILL MAP TO 1 AND 0 LATER
reduced_features_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,Date,change,directional_change
0,-1.81683,-0.804564,-0.041428,0.047606,0.574845,0.302695,-0.469479,-0.078834,0.068924,0.111088,-0.098367,-0.073523,-0.085615,-0.147709,0.101647,-0.343325,0.2831,2007-07-30,1.35,True
1,-1.783443,-0.819261,-0.053107,0.079708,0.594746,0.302641,-0.216791,-0.046196,0.115104,0.029543,-0.053936,-0.075321,-0.131259,-0.162499,0.024478,-0.32954,0.307166,2007-07-31,-1.2,False
2,-1.784261,-0.776942,-0.080797,0.102393,0.532047,0.316983,0.04904,0.146209,0.096331,0.094213,-0.053582,-0.120693,-0.117246,-0.144571,-0.057293,-0.100491,0.193179,2007-08-01,3.1,True
3,-1.769021,-0.764975,-0.076466,0.099041,0.520718,0.324865,0.301018,0.176992,0.114158,0.086755,-0.009093,-0.084593,-0.065193,-0.154241,-0.11228,-0.075273,0.166903,2007-08-02,1.45,True
4,-1.750549,-0.785042,-0.084596,0.097922,0.520101,0.336285,0.549084,0.152172,0.163222,0.060858,-0.053994,-0.096616,-0.106836,-0.150627,-0.077779,-0.107441,0.179963,2007-08-03,0.75,True


DIRECTIONAL CHANGES

In [45]:
X = reduced_features_df.drop(columns=["Date", "directional_change"]).values
y = reduced_features_df["directional_change"].astype(int).values  # Ensure y is binary (0/1)

split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

TIME_STEPS = 1
FEATURES = X_train.shape[1]

X_train_reshaped = X_train.reshape(X_train.shape[0], TIME_STEPS, FEATURES)
X_test_reshaped = X_test.reshape(X_test.shape[0], TIME_STEPS, FEATURES)

In [46]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout, Reshape
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# CNN-LSTM Model (Binary Classification)
def build_cnn_lstm(input_shape):
    model = Sequential([
        Conv1D(filters=128, kernel_size=1, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Flatten(),
        Reshape((1, 128)),
        LSTM(100, return_sequences=False),
        Dropout(0.3),
        Dense(100, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# LSTM-CNN Model (Binary Classification)
def build_lstm_cnn(input_shape):
    model = Sequential([
        LSTM(100, return_sequences=True, input_shape=input_shape),
        Dropout(0.3),
        Conv1D(filters=128, kernel_size=1, activation='relu'),
        MaxPooling1D(pool_size=1),
        Flatten(),
        Dense(100, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and Evaluate Function
def train_and_evaluate(model, X_train, y_train, X_test, y_test, batch_size=32, epochs=25):
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, verbose=1)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")
    return model

print("Training CNN-LSTM Model...")
cnn_lstm_model = build_cnn_lstm(input_shape=(TIME_STEPS, FEATURES))
cnn_lstm_model = train_and_evaluate(cnn_lstm_model, X_train_reshaped, y_train, X_test_reshaped, y_test)

print("\nTraining LSTM-CNN Model...")
lstm_cnn_model = build_lstm_cnn(input_shape=(TIME_STEPS, FEATURES))
lstm_cnn_model = train_and_evaluate(lstm_cnn_model, X_train_reshaped, y_train, X_test_reshaped, y_test)

y_pred_cnn_lstm = (cnn_lstm_model.predict(X_test_reshaped).flatten() > 0.5).astype(int)
y_pred_lstm_cnn = (lstm_cnn_model.predict(X_test_reshaped).flatten() > 0.5).astype(int)

# Create seperate test set so that we can see the results easier
testing_set = reduced_features_df.iloc[split_idx:].copy()
testing_set['predicted_cnn_lstm'] = y_pred_cnn_lstm
testing_set['predicted_lstm_cnn'] = y_pred_lstm_cnn
testing_set['actual_directional_change'] = y_test

testing_set['cnn_lstm_correct'] = testing_set['predicted_cnn_lstm'] == testing_set['actual_directional_change']
testing_set['lstm_cnn_correct'] = testing_set['predicted_lstm_cnn'] == testing_set['actual_directional_change']

cnn_lstm_correct_rows = testing_set[testing_set['cnn_lstm_correct']]
cnn_lstm_incorrect_rows = testing_set[~testing_set['cnn_lstm_correct']]

lstm_cnn_correct_rows = testing_set[testing_set['lstm_cnn_correct']]
lstm_cnn_incorrect_rows = testing_set[~testing_set['lstm_cnn_correct']]

# summary
print("\nSummary:")
print(f"CNN-LSTM Correct Predictions: {len(cnn_lstm_correct_rows)}")
print(f"CNN-LSTM Incorrect Predictions: {len(cnn_lstm_incorrect_rows)}")
print(f"LSTM-CNN Correct Predictions: {len(lstm_cnn_correct_rows)}")
print(f"LSTM-CNN Incorrect Predictions: {len(lstm_cnn_incorrect_rows)}")

# Display examples of correct and incorrect predictions
print("\nExamples of CNN-LSTM Correct Predictions:\n", cnn_lstm_correct_rows.head())
print("\nExamples of CNN-LSTM Incorrect Predictions:\n", cnn_lstm_incorrect_rows.head())
print("\nExamples of LSTM-CNN Correct Predictions:\n", lstm_cnn_correct_rows.head())
print("\nExamples of LSTM-CNN Incorrect Predictions:\n", lstm_cnn_incorrect_rows.head())


Training CNN-LSTM Model...
Epoch 1/25


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9017 - loss: 0.4056 - val_accuracy: 0.9889 - val_loss: 0.0471
Epoch 2/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9791 - loss: 0.0458 - val_accuracy: 0.9952 - val_loss: 0.0273
Epoch 3/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9860 - loss: 0.0335 - val_accuracy: 0.9905 - val_loss: 0.0275
Epoch 4/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9914 - loss: 0.0202 - val_accuracy: 0.9952 - val_loss: 0.0199
Epoch 5/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9905 - loss: 0.0230 - val_accuracy: 0.9905 - val_loss: 0.0237
Epoch 6/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9930 - loss: 0.0186 - val_accuracy: 0.9984 - val_loss: 0.0125
Epoch 7/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━

  super().__init__(**kwargs)


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9174 - loss: 0.4431 - val_accuracy: 0.9842 - val_loss: 0.0559
Epoch 2/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9876 - loss: 0.0355 - val_accuracy: 0.9937 - val_loss: 0.0297
Epoch 3/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9913 - loss: 0.0226 - val_accuracy: 0.9984 - val_loss: 0.0174
Epoch 4/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9916 - loss: 0.0206 - val_accuracy: 0.9937 - val_loss: 0.0195
Epoch 5/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9966 - loss: 0.0136 - val_accuracy: 0.9937 - val_loss: 0.0185
Epoch 6/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9965 - loss: 0.0099 - val_accuracy: 0.9905 - val_loss: 0.0193
Epoch 7/25
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━

In [24]:
testing_set.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC16,PC17,Date,change,directional_change,predicted_cnn_lstm,predicted_lstm_cnn,actual_directional_change,cnn_lstm_correct,lstm_cnn_correct
3152,1.054851,0.034722,0.206307,-0.21756,0.486899,-0.836237,-0.487225,0.188518,-0.048016,-0.303726,...,0.140101,-0.300833,2021-05-24,1.5,True,1,1,1,True,True
3153,1.076072,0.09469,0.215062,-0.16704,0.454212,-0.736391,-0.252613,-0.068867,0.005042,-0.216872,...,-0.034141,-0.174238,2021-05-25,5.2,True,1,1,1,True,True
3154,1.019462,0.116346,0.238971,-0.279298,0.495184,-0.678906,0.004204,-0.046611,0.033794,-0.23626,...,0.370548,-0.480484,2021-05-26,-0.35,False,0,0,0,True,True
3155,1.111319,0.0254,0.188051,-0.084522,0.457223,-0.776876,0.253034,0.211066,0.064799,-0.481039,...,-0.108287,-0.262874,2021-05-27,7.0,True,1,1,1,True,True
3156,1.200448,0.044285,0.195144,-0.150152,0.194009,-0.752799,0.522288,0.38764,-0.050363,0.061684,...,-0.207577,-0.20986,2021-05-28,-1.3,False,0,0,0,True,True
