# Overview

This notebook contains the process of finding the best configuration and learning rate for the GRU model that is used as one of the base learners in this study. The dataset consists of the selected features from the financial statements (based on the feature selection result) together with the best sentiment representation from the sentiment experiment result. The BBRI dataset is used.

# Base Model Search for Stock Prediction using fundamental and embedding sentiment feature (GRU)

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import L2
from matplotlib.ticker import MultipleLocator
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# DEFINE CONSTANT
# Fractions used for the chronological train / validation / test split.
# create_train_val_test reads TRAIN_SIZE and VAL_SIZE; the test set is whatever
# rows remain after the validation cut.
TRAIN_SIZE = 0.70
VAL_SIZE = 0.15
TEST_SIZE = 0.15

# EXPERIMENT VARIABLE
# used learning rate -> 0.001, 0.005, 0.0001, 0.0005
# NOTE(review): the value assigned below (0.05) is not one of the rates listed
# above -- confirm it is the intended setting before running the experiments.
LEARNING_RATE = 0.05

# Number of train/evaluate repetitions averaged for each model configuration.
N_ITERATION = 20
# Passed to PCA(n_components=...); scikit-learn interprets a float in (0, 1) as
# the fraction of variance to retain.
N_COMPONENT_PCA = 0.80
# Column holding the prediction target.
TARGET_COLUMN_NAME = "Closing Price"
# Column holding the stringified sentence-embedding vectors.
EMBEDDING_COLUMN_NAME = "text_embedding_multilingual_mpnet"
# When True the PCA-reduced embedding is used as model input instead of the raw one.
USE_PCA = True

# Number of past time steps in every input window.
CONTEXT_WINDOW = 5
# Upper bound on training epochs (early stopping may end training sooner).
NUM_EPOCHS = 100
# Loss and metric names handed to model.compile.
LOSS = "mse"
METRICS = "mape"

In [None]:
# Financial-statement columns kept after the feature-selection step.
fundamental_features = [
    "Financing Cash Flow",
    "P/B Ratio",
    "P/S Ratio",
    "Capital Adequacy Ratio",
    "Debt to Assets Ratio",
    "Debt to Equity Ratio",
    "Investing Cash Flow",
    "Operating Cash Flow",
    "Return on Assets",
    "Operating Profit",
    "Loan to Deposit Ratio",
]

# Price/volume columns fed to the model alongside the fundamentals.
historic_price_feature = [
    "Opening Price",
    "Highest Price",
    "Lowest Price",
    "Volume",
    "Change",
]

# Only the fundamental features are counted here.
print(f"Number of selected features: {len(fundamental_features)}")

In [None]:
def parse_embedding_from_df(df, embedding_column_name, n_comp, target_column_name="Closing Price"):
  """Select the model columns and turn the stringified embeddings into arrays.

  Args:
    df: source DataFrame; must contain the target column, the embedding column
      and every name in ``fundamental_features`` and ``historic_price_feature``.
    embedding_column_name: column whose cells are embeddings stored as text
      (for example ``"[ 0.12 -3.4e-02 -1. ]"``).
    n_comp: value forwarded to ``PCA(n_components=...)``.
    target_column_name: name of the target column.

  Returns:
    A copy of the selected columns with two extra columns: ``embedding``
    (1-D float arrays; all zeros for rows that could not be parsed) and
    ``embedding_pca`` (PCA-reduced embeddings as lists). Feature columns are
    coerced to numeric, so unparseable feature values become NaN.

  NOTE(review): PCA is fitted on every row of ``df``, i.e. before any
  train/validation/test split is made downstream -- confirm this is acceptable.
  """
  feature_columns = fundamental_features + historic_price_feature

  column = [target_column_name, embedding_column_name] + feature_columns
  df_processed = df[column].copy()

  # One decimal number: optional sign, then either digits with an optional
  # fractional part (covers "3", "-1." and "0.25") or a bare ".5", then an
  # optional exponent. The sign is part of every alternative so that forms such
  # as "-1." (the way whole-valued floats are commonly printed) or "-3" are not
  # silently turned into positive values.
  number_pattern = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")

  def parse_embedding_string(embedding_str):
      """Return the floats found in one cell, or None if the cell is unusable."""
      if not isinstance(embedding_str, str):
          return None # Handle non-string inputs (like NaN)

      numbers = number_pattern.findall(embedding_str.strip().strip('[]'))
      if not numbers:
          # A string without any number is treated like a failed parse so it is
          # zero-filled below instead of becoming a zero-length embedding.
          return None

      try:
          return [float(num) for num in numbers]
      except ValueError:
          return None # Return None if conversion to float fails for any number

  # Apply the parsing function to the embedding column; failed rows hold None
  df_processed['embedding'] = df_processed[embedding_column_name].apply(parse_embedding_string)

  # Determine the embedding dimension from the first successfully parsed embedding
  embedding_dim = next(
      (len(parsed) for parsed in df_processed['embedding'] if parsed is not None),
      None,
  )

  if embedding_dim is None:
      # Every embedding was missing or invalid: fall back to a fixed dimension
      embedding_dim = 768 # Default BERT base dimension
      print(f"Warning: Could not determine embedding dimension from data. Using a default embedding dimension of {embedding_dim}.")

  # Rows where parsing failed are replaced by an all-zero vector
  df_processed['embedding'] = df_processed['embedding'].apply(
      lambda x: np.array(x, dtype=float) if x is not None else np.zeros(embedding_dim)
  )

  # Stack the per-row arrays into a 2-D matrix and reduce it with PCA
  embedding_matrix = np.array(df_processed['embedding'].tolist())
  df_processed['embedding_pca'] = PCA(n_components=n_comp).fit_transform(embedding_matrix).tolist()

  # Convert feature columns to numeric; values that cannot be parsed become NaN
  # (they are not filled here)
  for col in feature_columns:
      df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

  return df_processed

In [None]:
def create_train_val_test(df_processed, use_pca, embedding_column_name, target_column_price="Closing Price"):
    """Scale the data and build chronological train/validation/test generators.

    The columns of the combined matrix are ordered as: scaled target (index 0),
    embedding (PCA-reduced when ``use_pca`` is true, raw otherwise), scaled
    remaining features. Embeddings are not scaled.

    Both MinMaxScalers are fitted on the training rows only and then applied to
    the whole series, so validation and test statistics do not leak into the
    scaling. Scaled values outside [0, 1] can therefore occur after the
    training segment.

    Args:
        df_processed: frame holding the target column plus ``embedding`` and
            ``embedding_pca`` columns and the numeric feature columns.
        use_pca: choose ``embedding_pca`` (True) or ``embedding`` (False).
        embedding_column_name: original text embedding column, excluded from
            the feature matrix.
        target_column_price: name of the target column.

    Returns:
        ``(train_generator, val_generator, test_generator, scaler,
        combined_data)`` where ``scaler`` is the fitted target scaler and
        ``combined_data`` is the full 2-D input matrix.

    Raises:
        ValueError: if there are not more rows than ``CONTEXT_WINDOW``.
    """
    # Everything that is neither target, embedding nor 'Date' is a plain feature
    excluded_columns = {target_column_price, embedding_column_name, 'Date', 'embedding', 'embedding_pca'}
    feature_columns = [col for col in df_processed.columns if col not in excluded_columns]

    target_data = df_processed[target_column_price].values.reshape(-1, 1)
    feature_data = df_processed[feature_columns].values

    embedding_source = 'embedding_pca' if use_pca else 'embedding'
    embedding_data = np.array(df_processed[embedding_source].tolist())

    # Split indices are computed on the number of predictable steps: the first
    # CONTEXT_WINDOW rows can only serve as context, never as a target
    n_total = len(df_processed) - CONTEXT_WINDOW
    if n_total <= 0:
        raise ValueError(
            f"Need more than {CONTEXT_WINDOW} rows to build windows, got {len(df_processed)}."
        )
    train_split_index = int(n_total * TRAIN_SIZE)
    val_split_index = int(n_total * (TRAIN_SIZE + VAL_SIZE))
    train_end = train_split_index + CONTEXT_WINDOW

    # Fit the scalers on the training segment only, then transform every row
    scaler = MinMaxScaler()
    scaler.fit(target_data[:train_end])
    scaled_target_data = scaler.transform(target_data)

    feature_scaler = MinMaxScaler()
    feature_scaler.fit(feature_data[:train_end])
    scaled_feature_data = feature_scaler.transform(feature_data)

    # Target variable is at index 0 of the combined matrix
    combined_data = np.concatenate((scaled_target_data, embedding_data, scaled_feature_data), axis=1)

    # Sequential split. Validation and test segments start CONTEXT_WINDOW rows
    # early so their first target has a full window of context; the targets of
    # the three segments do not overlap.
    train_data = combined_data[:train_end]
    val_data = combined_data[train_split_index:val_split_index + CONTEXT_WINDOW]
    test_data = combined_data[val_split_index:]

    train_generator = TimeseriesGenerator(train_data, train_data[:, 0],
                                          length=CONTEXT_WINDOW, batch_size=24)

    val_generator = TimeseriesGenerator(val_data, val_data[:, 0],
                                        length=CONTEXT_WINDOW, batch_size=24)

    test_generator = TimeseriesGenerator(test_data, test_data[:, 0],
                                         length=CONTEXT_WINDOW, batch_size=24)

    return train_generator, val_generator, test_generator, scaler, combined_data

## GRU with Configuration 1

In [None]:
def create_model_gru_1(combined_data, print_summary=True):
  """Build and compile GRU configuration 1: a single GRU layer with a dense head.

  Args:
    combined_data: 2-D input matrix; only its column count is read, to size
      the input layer as (CONTEXT_WINDOW, n_columns).
    print_summary: print the Keras model summary when true.

  Returns:
    The compiled Sequential model (Adam with LEARNING_RATE, LOSS, METRICS).
  """
  n_columns = combined_data.shape[1]

  layer_stack = [
      Input(shape=(CONTEXT_WINDOW, n_columns)),
      GRU(64, activation='relu'),
      Dense(32, activation='relu'),
      Dense(1, activation='linear'),
  ]
  model = Sequential(layer_stack)

  model.compile(
      optimizer=Adam(learning_rate=LEARNING_RATE),
      loss=LOSS,
      metrics=[METRICS],
  )

  if print_summary:
    model.summary()

  return model

## GRU with Configuration 2

In [None]:
def create_model_gru_2(combined_data, print_summary=True):
  """Build and compile GRU configuration 2: two stacked GRU layers plus dropout.

  The first GRU returns the full sequence and carries an L2 kernel penalty;
  the second uses recurrent dropout and is followed by a Dropout layer before
  the dense head.

  Args:
    combined_data: 2-D input matrix; only its column count is read, to size
      the input layer as (CONTEXT_WINDOW, n_columns).
    print_summary: print the Keras model summary when true.

  Returns:
    The compiled Sequential model (Adam with LEARNING_RATE, LOSS, METRICS).
  """
  n_columns = combined_data.shape[1]

  first_gru = GRU(
      units=64,
      activation='relu',
      return_sequences=True,
      kernel_regularizer=L2(0.001),
  )
  second_gru = GRU(units=32, activation='relu', recurrent_dropout=0.25)

  layer_stack = [
      Input(shape=(CONTEXT_WINDOW, n_columns)),
      first_gru,
      second_gru,
      Dropout(0.25),
      Dense(32, activation='relu'),
      Dense(1, activation='linear'),
  ]
  model = Sequential(layer_stack)

  model.compile(
      optimizer=Adam(learning_rate=LEARNING_RATE),
      loss=LOSS,
      metrics=[METRICS],
  )

  if print_summary:
    model.summary()

  return model

## GRU with Configuration 3

In [None]:
def create_model_gru_3(combined_data, print_summary=True):
  """Build and compile GRU configuration 3: two stacked GRU layers, no Dropout layer.

  The first GRU returns the full sequence and carries an L2 kernel penalty;
  the second uses recurrent dropout and feeds the dense head directly.

  Args:
    combined_data: 2-D input matrix; only its column count is read, to size
      the input layer as (CONTEXT_WINDOW, n_columns).
    print_summary: print the Keras model summary when true.

  Returns:
    The compiled Sequential model (Adam with LEARNING_RATE, LOSS, METRICS).
  """
  n_columns = combined_data.shape[1]

  first_gru = GRU(
      units=64,
      activation='relu',
      return_sequences=True,
      kernel_regularizer=L2(0.001),
  )
  second_gru = GRU(units=32, activation='relu', recurrent_dropout=0.25)

  layer_stack = [
      Input(shape=(CONTEXT_WINDOW, n_columns)),
      first_gru,
      second_gru,
      Dense(32, activation='relu'),
      Dense(1, activation='linear'),
  ]
  model = Sequential(layer_stack)

  model.compile(
      optimizer=Adam(learning_rate=LEARNING_RATE),
      loss=LOSS,
      metrics=[METRICS],
  )

  if print_summary:
    model.summary()

  return model

In [None]:
def evaluate_and_visualize_model(model, train_generator, val_generator, test_generator, scaler, embedding_column_name, print_test_result=False, visualize=True):
  """Train ``model`` with early stopping and score it on the test generator.

  Args:
    model: compiled Keras model; it is fitted in place.
    train_generator: generator used for fitting.
    val_generator: generator used for validation and early stopping.
    test_generator: generator used for prediction; its ``targets`` array holds
      the scaled true values.
    scaler: fitted target scaler used to map predictions and targets back to
      the original price scale.
    embedding_column_name: only used in the plot legend and title.
    print_test_result: print MSE / MAE / MAPE when true.
    visualize: plot actual versus predicted values when true.

  Returns:
    ``(mse, mae, mape)`` computed on the original scale; ``mape`` is a
    fraction (it is multiplied by 100 only for printing).
  """
  # Stop when the validation loss has not improved for 10 epochs and roll back
  # to the best weights seen
  early_stopping = EarlyStopping(
      monitor='val_loss',
      patience=10,
      restore_best_weights=True
  )

  model.fit(
      train_generator,
      validation_data=val_generator,
      epochs=NUM_EPOCHS,
      verbose=0,
      callbacks=[early_stopping]
  )

  # Get predictions from the test set
  predictions_scaled = model.predict(test_generator, verbose=0)

  # Inverse transform the scaled predictions and actual values to their original scale
  predicted_actual = scaler.inverse_transform(predictions_scaled)
  true_actual = scaler.inverse_transform(test_generator.targets.reshape(-1, 1))

  # The generator's first sample predicts the target that follows the first
  # full window, so the leading targets have no prediction. Drop those leading
  # entries (not the trailing ones) to keep each prediction paired with the
  # value it actually forecasts.
  n_without_prediction = len(true_actual) - len(predicted_actual)
  true_actual = true_actual[n_without_prediction:]

  # Calculate evaluation metrics
  mse = mean_squared_error(true_actual, predicted_actual)
  mae = mean_absolute_error(true_actual, predicted_actual)
  mape = mean_absolute_percentage_error(true_actual, predicted_actual)

  if print_test_result:
    print("TEST RESULT")
    print(f"Test MSE: {mse:.3f}")
    print(f"Test MAE: {mae:.3f}")
    print(f"Test MAPE: {(mape*100):.2f}%")

  if visualize:
    # Plot the results
    plt.figure(figsize=(12, 6))
    plt.plot(true_actual, label='Actual close')
    plt.plot(predicted_actual, label=f'Predicted close with ({embedding_column_name})')

    # Add dots for each data point
    plt.scatter(range(len(true_actual)), true_actual, color='darkblue', s=12, label='Actual Points')
    plt.scatter(range(len(predicted_actual)), predicted_actual, color='red', s=12, label='Predicted Points')

    plt.title(f'{embedding_column_name} Predictions vs Actual Close Price')
    plt.xlabel('Time Step')
    plt.ylabel('Price')
    plt.legend()
    plt.grid(True)

    # Place a major x-axis tick every 10 time steps
    plt.gca().xaxis.set_major_locator(MultipleLocator(10))

    plt.show()

  return mse, mae, mape

In [None]:
def find_average_test_score(df, embedding_name, use_pca, n_iter, model_number):
  """Train one GRU configuration ``n_iter`` times and print the mean test metrics.

  The data is preprocessed and split once; every iteration builds a fresh
  model, trains it and evaluates it on the same generators.

  Args:
    df: raw dataset passed to ``parse_embedding_from_df``.
    embedding_name: name of the text embedding column.
    use_pca: use the PCA-reduced embedding as model input when true.
    n_iter: number of train/evaluate repetitions to average over.
    model_number: 1, 2 or 3, selecting ``create_model_gru_1`` /
      ``create_model_gru_2`` / ``create_model_gru_3``.

  Raises:
    ValueError: if ``model_number`` is not 1, 2 or 3.
  """
  # Resolve the builder first so an invalid choice fails immediately with a
  # clear message instead of after preprocessing with an unbound-variable error
  if model_number == 1:
    build_model = create_model_gru_1
  elif model_number == 2:
    build_model = create_model_gru_2
  elif model_number == 3:
    build_model = create_model_gru_3
  else:
    raise ValueError(f"model_number must be 1, 2 or 3, got {model_number!r}")

  processed_df = parse_embedding_from_df(df, embedding_name, n_comp=N_COMPONENT_PCA)
  train_generator, val_generator, test_generator, scaler, combined_data = create_train_val_test(processed_df, use_pca, embedding_name)
  arr_mse = []
  arr_mae = []
  arr_mape = []

  for _ in tqdm(range(n_iter), desc=f"Processing Model Number {model_number}"):
    # A new model per iteration so every run starts from fresh weights
    model = build_model(combined_data, print_summary=False)

    mse, mae, mape = evaluate_and_visualize_model(model, train_generator, val_generator, test_generator, scaler, embedding_name, print_test_result=False, visualize=False)
    arr_mse.append(mse)
    arr_mae.append(mae)
    arr_mape.append(mape)

  print("\n\n")
  print("Final Result:")
  print(f"Average MSE: {np.mean(arr_mse):.3f}")
  print(f"Average MAE: {np.mean(arr_mae):.3f}")
  print(f"Average MAPE: {(np.mean(arr_mape)*100):.2f}%")

In [None]:
# Load the dataset from a CSV file resolved relative to the working directory.
# This frame is the `df` argument of every find_average_test_score call below.
bbri_merged_df = pd.read_csv("final_bbri_dataset.csv")

## Experiment

In [None]:
# Experiment with GRU configuration 1 (create_model_gru_1): run N_ITERATION
# train/evaluate repetitions and print the averaged test MSE / MAE / MAPE.
MODEL_NUMBER_GRU_SIMPLE = 1

find_average_test_score(
    df=bbri_merged_df,
    embedding_name=EMBEDDING_COLUMN_NAME,
    use_pca=USE_PCA,
    n_iter=N_ITERATION,
    model_number=MODEL_NUMBER_GRU_SIMPLE
)

In [None]:
# Experiment with GRU configuration 2 (create_model_gru_2): run N_ITERATION
# train/evaluate repetitions and print the averaged test MSE / MAE / MAPE.
MODEL_NUMBER_GRU_COMPLICATED = 2

find_average_test_score(
    df=bbri_merged_df,
    embedding_name=EMBEDDING_COLUMN_NAME,
    use_pca=USE_PCA,
    n_iter=N_ITERATION,
    model_number=MODEL_NUMBER_GRU_COMPLICATED
)

In [None]:
# Experiment with GRU configuration 3 (create_model_gru_3): run N_ITERATION
# train/evaluate repetitions and print the averaged test MSE / MAE / MAPE.
MODEL_NUMBER_GRU_SPECIAL = 3

find_average_test_score(
    df=bbri_merged_df,
    embedding_name=EMBEDDING_COLUMN_NAME,
    use_pca=USE_PCA,
    n_iter=N_ITERATION,
    model_number=MODEL_NUMBER_GRU_SPECIAL
)