# Overview

This notebook contains the feature-selection process used to find which financial-statement (fundamental) features most strongly influence the model's predictions. The sentiment data used in the dataset is the best sentiment representation identified in the previous experiment (the sentiment-representation experiment). The dataset used for this experiment is the BBRI dataset.

# Fundamental Feature Importance Scoring

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Experiment configuration constants.

# Fractions used for the chronological train / validation split.
# NOTE(review): TEST_SIZE is not referenced in the visible cells; the test
# partition is simply whatever rows remain after the train and validation split.
TRAIN_SIZE = 0.70
VAL_SIZE = 0.15
TEST_SIZE = 0.15

# Passed to sklearn's PCA as n_components when reducing the text embeddings.
# A float in (0, 1) is interpreted by PCA as the fraction of variance to keep.
N_COMPONENT_PCA = 0.8
# Column holding the prediction target.
TARGET_COLUMN_NAME = "Closing Price"
# Column holding the embedding stored as a string.
EMBEDDING_COLUMN_NAME = "text_embedding_finbert"

# True -> feed the PCA-reduced embedding to the model; False -> the full embedding.
USE_PCA = True
# Number of independent train-and-permute repetitions that are averaged.
N_ITERATION = 20
# Number of consecutive rows in each model input window.
CONTEXT_WINDOW = 5
# Upper bound on training epochs (early stopping may end training sooner).
NUM_EPOCHS = 100
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Loss identifier handed to model.compile.
LOSS = "mse"

In [None]:
# Financial-statement columns whose permutation importance is scored.
# The ORDER matters: the permutation loop shuffles model-input column j+1 for
# fundamental_features[j], so this list must stay in the same order as the
# feature columns placed right after the target column.
fundamental_features = [
    'Cash',
    'Revenue',
    'Net Income',
    'Operating Profit',
    'Total Assets',
    'Total Liabilities',
    'Total Equity',
    'Earnings per Share',
    'Operating Cash Flow',
    'Financing Cash Flow',
    'Investing Cash Flow',
    'Loan to Deposit Ratio',
    'Capital Adequacy Ratio',
    'Gross NPL',
    'Net NPL',
    'Net Interest Margin',
    'P/E Ratio',
    'P/B Ratio',
    'P/S Ratio',
    'Operating Margin',
    'Debt to Equity Ratio',
    'Debt to Assets Ratio',
    'Return on Equity',
    'Return on Assets'
]

# Price/volume columns that are fed to the model after the fundamental
# features; they are model inputs but are not permuted in this notebook.
historic_price_feature = [
    'Opening Price',
    'Highest Price',
    'Lowest Price',
    'Volume',
    'Change'
]

print(f"Length of fundamental features: {len(fundamental_features)}")

In [None]:
def parse_embedding_from_df(df, embedding_column_name, n_comp, target_column_name="Closing Price"):
  """Build the modelling frame: selected columns, parsed embeddings, PCA embeddings.

  The returned frame contains, in this order: the target column, the raw
  embedding column, every name in ``fundamental_features`` followed by every
  name in ``historic_price_feature`` (both module-level lists), and two new
  columns:

  * ``embedding``     - the embedding string parsed into a float numpy array
    (a zero vector when the cell is not a string or cannot be parsed);
  * ``embedding_pca`` - the PCA-reduced embedding as a Python list.

  Args:
    df: Source DataFrame; must contain the target, embedding and feature columns.
    embedding_column_name: Name of the column holding the embedding as a string.
    n_comp: Forwarded to ``PCA(n_components=...)``.
    target_column_name: Name of the prediction-target column.

  Returns:
    A copy of the selected columns with ``embedding`` and ``embedding_pca``
    added and the feature columns coerced to numeric. Values that cannot be
    coerced become NaN and are NOT filled here.

  Raises:
    KeyError: If any required column is missing from ``df``.
  """
  feature_columns = fundamental_features + historic_price_feature
  column = [target_column_name, embedding_column_name] + feature_columns
  df_processed = df[column].copy()

  # One signed decimal number with optional fraction and optional exponent.
  # Written as a single pattern so the sign is never dropped and tokens with a
  # trailing dot (e.g. "-1." or "0.", as numpy prints whole floats) are kept
  # intact instead of being truncated or split.
  number_pattern = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")

  def parse_embedding_string(embedding_str):
    """Return the floats found in ``embedding_str`` or None for non-strings."""
    if not isinstance(embedding_str, str):
      return None  # e.g. NaN for a missing embedding

    numbers = number_pattern.findall(embedding_str.strip().strip('[]'))
    try:
      return [float(num) for num in numbers]
    except ValueError:
      return None  # defensive: treat an unparsable token as a missing embedding

  df_processed['embedding'] = df_processed[embedding_column_name].apply(parse_embedding_string)

  # The embedding width is taken from the first row that parsed successfully;
  # it is only needed to size the zero vectors used for failed rows.
  embedding_dim = next(
      (len(parsed) for parsed in df_processed['embedding'] if parsed is not None),
      None,
  )

  if embedding_dim is None:
    # No row could be parsed: fall back to the BERT-base width.
    embedding_dim = 768 # Default BERT base dimension
    print(f"Warning: Could not determine embedding dimension from data. Using a default embedding dimension of {embedding_dim}.")

  df_processed['embedding'] = df_processed['embedding'].apply(
      lambda x: np.array(x, dtype=float) if x is not None else np.zeros(embedding_dim)
  )

  # NOTE(review): PCA is fitted on every row passed in, i.e. before any
  # train/validation/test split is made - confirm this is acceptable.
  embedding_matrix = np.array(df_processed['embedding'].tolist())
  embedding_reduced = PCA(n_components=n_comp).fit_transform(embedding_matrix)
  df_processed['embedding_pca'] = embedding_reduced.tolist()

  # Coerce feature columns to numeric; unparsable values become NaN and are
  # left as NaN for the caller to handle.
  for col in feature_columns:
    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

  return df_processed

In [None]:
def create_static_dataset(series_data, look_back):
  """Slice a 2-D array into sliding input windows and next-step targets.

  Window ``k`` holds rows ``k .. k + look_back - 1`` (all columns) and its
  target is column 0 of row ``k + look_back``.

  Args:
    series_data: 2-D array indexed as ``[row, column]``.
    look_back: Number of consecutive rows per window.

  Returns:
    Tuple ``(windows, targets)`` of numpy arrays; both are empty when there
    are not more than ``look_back`` rows.
  """
  n_windows = len(series_data) - look_back
  windows = [series_data[start:(start + look_back), :] for start in range(n_windows)]
  targets = [series_data[start + look_back, 0] for start in range(n_windows)]
  return np.array(windows), np.array(targets)

In [None]:
# Load the merged BBRI dataset; the path is relative to the working directory.
bbri_merged_df = pd.read_csv("final_bbri_dataset.csv")
# Keep the target, embedding and feature columns, parse the embedding strings
# and add the PCA-reduced embedding column.
df_processed = parse_embedding_from_df(bbri_merged_df, EMBEDDING_COLUMN_NAME, n_comp=N_COMPONENT_PCA)

In [None]:
# List the processed frame's columns with 1-based positions.
print("Column in the dataset")
for position, column_name in enumerate(df_processed.columns.tolist(), start=1):
  print(f"{position}. {column_name}")

In [None]:
target_data = df_processed[TARGET_COLUMN_NAME].values

# Numeric feature columns: everything except the target, the raw embedding
# string, the parsed embedding, the PCA embedding and (if present) the date.
feature_columns = [col for col in df_processed.columns if col not in ([TARGET_COLUMN_NAME, EMBEDDING_COLUMN_NAME, 'Date', 'embedding', 'embedding_pca'])]
feature_data = df_processed[feature_columns].values

# USE_PCA selects the PCA-reduced embedding; otherwise the full parsed embedding is used.
embedding_column = 'embedding_pca' if USE_PCA else 'embedding'
embedding_data = np.array(df_processed[embedding_column].tolist())

print(f"Length of embedding data: {len(embedding_data[0])}")

# Fit the scalers on the training rows only, then apply them to every row.
# Fitting on the full series would leak the validation/test min and max into
# the training inputs and targets. The row count mirrors the chronological
# training slice: int((n_rows - CONTEXT_WINDOW) * TRAIN_SIZE) samples plus the
# CONTEXT_WINDOW rows needed to form the last training window.
n_train_rows = int((len(df_processed) - CONTEXT_WINDOW) * TRAIN_SIZE) + CONTEXT_WINDOW

scaler = MinMaxScaler()
scaler.fit(target_data[:n_train_rows].reshape(-1, 1))
scaled_target_data = scaler.transform(target_data.reshape(-1, 1))

feature_scaler = MinMaxScaler()
feature_scaler.fit(feature_data[:n_train_rows])
scaled_feature_data = feature_scaler.transform(feature_data)

# Column layout of the model input: [target | features | embedding].
# The target must stay in column 0 because the windowing helper reads the
# label from column 0 and the permutation step offsets feature indices by 1.
combined_data = np.concatenate((scaled_target_data, scaled_feature_data, embedding_data), axis=1)

print(f"Length of combined data: {len(combined_data[0])}")

In [None]:
# Number of (window, target) samples obtainable from the full series: each
# sample needs CONTEXT_WINDOW input rows followed by one target row.
n_total = len(combined_data) - CONTEXT_WINDOW
# Sample-count boundaries of the chronological train / validation partitions.
train_split_index = int(n_total * TRAIN_SIZE)
val_split_index = int(n_total * (TRAIN_SIZE + VAL_SIZE))

# Each slice carries CONTEXT_WINDOW extra rows so that windowing yields exactly
# the intended number of samples. Validation and test slices start at the split
# index, so their first input windows reuse rows that were training/validation
# targets, but no target row is shared between partitions.
train_data = combined_data[:train_split_index + CONTEXT_WINDOW]
val_data = combined_data[train_split_index:val_split_index + CONTEXT_WINDOW]
# The test partition is everything that remains after the validation boundary.
test_data = combined_data[val_split_index:]

# Turn each partition into sliding input windows and next-step targets.
X_train, y_train = create_static_dataset(train_data, CONTEXT_WINDOW)
X_val, y_val = create_static_dataset(val_data, CONTEXT_WINDOW)
X_test, y_test = create_static_dataset(test_data, CONTEXT_WINDOW)

In [None]:
# Train a fresh model N_ITERATION times and, for each trained model, measure
# how much the test MAPE grows when one fundamental feature is shuffled.
#
# baseline_error_store:         "baseline_error_<k>" -> test MAPE (in percent)
# permutation_importance_store: "permutation_importance_<k>" ->
#                               {feature name -> permuted MAPE - baseline MAPE}

baseline_error_store = {}
permutation_importance_store = {}

# Ground-truth test targets in original units; identical for every iteration
# and every permuted feature, so compute them once.
true_actual = scaler.inverse_transform(y_test.reshape(-1, 1))

for i in tqdm(range(N_ITERATION), desc='Processing Permutation Importance for Fundamental Feature'):
  # Stop on the validation loss: validation data is passed to fit(), and
  # monitoring the training loss would leave it without any effect.
  early_stopping = EarlyStopping(
      monitor='val_loss',
      patience=10,
      restore_best_weights=True
  )

  model = Sequential([
      Input(shape=(CONTEXT_WINDOW, combined_data.shape[1])),
      LSTM(64, activation='relu'),
      Dense(32, activation='relu'),
      Dense(1, activation='linear')
  ])

  OPTIMIZER = Adam(learning_rate=LEARNING_RATE)
  model.compile(optimizer=OPTIMIZER, loss=LOSS)

  history = model.fit(
      X_train,
      y_train,
      epochs=NUM_EPOCHS,
      verbose=0,
      batch_size=32,
      callbacks=[early_stopping],
      validation_data=(X_val, y_val)
  )

  # Baseline: error of the trained model on the untouched test windows.
  predictions_scaled = model.predict(X_test, verbose=0)
  predictions_actual = scaler.inverse_transform(predictions_scaled)

  baseline_error = mean_absolute_percentage_error(true_actual, predictions_actual)

  baseline_key = "baseline_error_" + str(i+1)
  baseline_error = baseline_error * 100
  baseline_error_store[baseline_key] = baseline_error

  importances_dict_store = {}

  for j, feature_name in enumerate(fundamental_features):
    X_test_shuffled = X_test.copy()

    # Feature j lives in column j+1 because column 0 holds the target.
    # The slice is a view, so shuffling it reorders that feature's windows
    # across test samples in place while all other columns stay aligned.
    np.random.shuffle(X_test_shuffled[:, :, j+1])
    predictions_scaled = model.predict(X_test_shuffled, verbose=0)

    predictions_actual = scaler.inverse_transform(predictions_scaled)

    permuted_error = mean_absolute_percentage_error(true_actual, predictions_actual)
    permuted_error = permuted_error * 100
    # Positive score: the model got worse without this feature's information.
    importance_score = (permuted_error - baseline_error)

    importances_dict_store[feature_name] = importance_score

  permutation_importance_key = "permutation_importance_" + str(i+1)
  permutation_importance_store[permutation_importance_key] = importances_dict_store

In [None]:
# Architecture summary of the model left over from the last loop iteration.
model.summary()

In [None]:
# Mean of the per-iteration baseline test errors (values are MAPE in percent).

sum_baseline_error = sum(baseline_error_store.values())
average_baseline_error = sum_baseline_error / len(baseline_error_store)

print(f"Average baseline error: {average_baseline_error:.3f}")

In [None]:
# Collect the per-iteration {feature -> importance} dictionaries in run order.
list_perm_importance = list(permutation_importance_store.values())

# Average every feature's importance over all iterations; the feature names
# are taken from the first iteration's dictionary.
n_runs = len(list_perm_importance)
averages = {}
for feature in list_perm_importance[0]:
  averages[feature] = sum(run[feature] for run in list_perm_importance) / n_runs

# (feature, mean importance) pairs, most important first.
averages_sorted = sorted(averages.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Keep the descending-importance order as the dictionary (and CSV column) order.
feature_dict = dict(averages_sorted)
for ranked_item in averages_sorted:
  print(ranked_item)

# Single-row frame: one column per feature, holding its mean importance.
feature_importance_df = pd.DataFrame([feature_dict])
# NOTE(review): the file name says "50_iteration" while N_ITERATION is set to
# 20 in the configuration cell - confirm which one is intended.
feature_importance_df.to_csv("fundamental_feature_importance_50_iteration.csv", index=False)

In [None]:
features, scores = zip(*averages_sorted)
# Order the pairs by ascending score: barh draws the first entry at the
# bottom, so the most important feature ends up at the top of the chart.
ascending_pairs = sorted(zip(features, scores), key=lambda pair: pair[1])
sorted_features = [name for name, _ in ascending_pairs]
sorted_scores = [score for _, score in ascending_pairs]

plt.figure(figsize=(10, 6))
plt.barh(sorted_features, sorted_scores, color='mediumseagreen')
plt.xlabel("Performance Drop (Increase in MAPE)")
plt.title("Permutation Importance All Fundamental Features")
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.show()