In [52]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib

import torch
import torch.nn as nn

# import tensorflow as tf
# from innvestigate.utils.keras import checks
# from innvestigate.utils.keras import checks as kchecks
# from innvestigate.utils.keras import backend as kb
# from innvestigate.utils.keras import applications as kapp
# from innvestigate import create_analyzer

## Functions for Feature Importance

In [53]:
def get_linear_regression_feature_importance(model_file, feature_names):
    """
    Extracts feature importance from a linear regression model stored in a .sav file.
    
    Parameters:
    - model_file (str): File path to the .sav file containing the linear regression model.
    - feature_names (list): List of feature names.
    
    Returns:
    - DataFrame: DataFrame containing the feature names and their corresponding importance.
    """

    model = joblib.load(model_file)

    coefficients = model.coef_

    absolute_coefficients = np.abs(coefficients)

    scaler = StandardScaler()
    normalized_coefficients = scaler.fit_transform(absolute_coefficients.reshape(-1, 1)).flatten()

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': normalized_coefficients})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    return feature_importance_df


def get_svm_feature_importance(model_file, feature_names, X_train=[]):
    """
    Extracts feature importance from an SVM model stored in a .sav file.
    
    Parameters:
    - model_file (str): File path to the .sav file containing the SVM model.
    - feature_names (list): List of feature names.
    - X_train (DataFrame or array-like): Training data used to fit the SVM model.
    
    Returns:
    - DataFrame: DataFrame containing the feature names and their corresponding importance.
    """
    model = joblib.load(model_file)

    if model.kernel == 'linear':
        coefficients = model.coef_.flatten()
        importance = np.abs(coefficients)
    else:
        print("Feature importance for non-linear SVMs is not implemented yet. (Hard and Need to be done during the training processes)")
        return 
    
    scaler = StandardScaler()
    normalized_importance = scaler.fit_transform(importance.reshape(-1, 1)).flatten()

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': normalized_importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    return feature_importance_df


def get_lstm_feature_importance_from_file(model_file, feature_names, device="cpu"):
    """
    Extracts feature importance from the weights of an LSTM model stored in a .pth.tar file.
    
    Parameters:
    - model_file (str): File path to the .pth.tar file containing the weights of the LSTM model.
    - feature_names (list): List of feature names.
    
    Returns:
    - DataFrame: DataFrame containing the feature names and their corresponding importance.
    """
    state_dict = torch.load(model_file, map_location=torch.device(device))
    model_state_dict = state_dict['model']

    hidden_size = 256
    input_size = 1
    num_layers = 4
    lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    linear1 = nn.Linear(hidden_size, 64)
    linear2 = nn.Linear(64, 1)

    lstm_weight_ih = model_state_dict['layer1.weight_ih_l0'].T  # Shape: (hidden_size*4, input_size)
    lstm_weight_hh = model_state_dict['layer1.weight_hh_l0'].T  # Shape: (hidden_size*4, hidden_size)
    lstm_bias_ih = model_state_dict['layer1.bias_ih_l0']  # Shape: (hidden_size*4,)
    lstm_bias_hh = model_state_dict['layer1.bias_hh_l0']  # Shape: (hidden_size*4,)
    linear1_weight = model_state_dict['layer2.0.weight'].T  # Shape: (64, hidden_size)
    linear1_bias = model_state_dict['layer2.0.bias']  # Shape: (64,)
    linear2_weight = model_state_dict['layer3.weight'].T  # Shape: (1, 64)
    linear2_bias = model_state_dict['layer3.bias']  # Shape: (1,)

    importance1 = np.abs(np.matmul(lstm_weight_ih, np.diag(lstm_weight_hh.flatten())) + lstm_bias_ih + lstm_bias_hh)
    importance2 = np.abs(np.matmul(linear1_weight, linear2_weight.flatten())) + linear1_bias + linear2_bias
    importance = np.concatenate((importance1.flatten(), importance2))

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    return feature_importance_df


def get_xgboost_feature_importance(model, feature_names):
    """
    Extracts feature importance from an XGBoost model.
    
    Parameters:
    - model: Trained XGBoost model.
    - feature_names (list): List of feature names.
    
    Returns:
    - DataFrame: DataFrame containing the feature names and their corresponding importance.
    """
    importance = model.feature_importances_

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    return feature_importance_df



## Linear Regression

In [54]:
path = "models/LinearRegression.sav"
feature_names = ["Open", "High", "Low", "Close", "Volume"]

feature_importance_df = get_linear_regression_feature_importance(path, feature_names)
print(feature_importance_df)

  Feature  Importance
3   Close    1.997547
2     Low   -0.435193
1    High   -0.457862
0    Open   -0.531928
4  Volume   -0.572564


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [55]:
path = "models/LinearRegression_log.sav"
feature_names = ["Open", "High", "Low", "Close", "Volume"]

feature_importance_df = get_linear_regression_feature_importance(path, feature_names)
print(feature_importance_df)

  Feature  Importance
3   Close    1.182225
1    High    0.615189
0    Open    0.202764
2     Low   -0.231189
4  Volume   -1.768990


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## SVM

In [56]:
model_file_path = 'models/SVM.sav'
feature_names =  ["Open", "High", "Low", "Close", "Volume"]
X_train = [] 
feature_importance = get_svm_feature_importance(model_file_path, feature_names, X_train)
print(feature_importance)

Feature importance for non-linear SVMs is not implemented yet. (Hard and Need to be done during the training processes)
None


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## XGBoost

In [None]:
# parse an XGBoost model directly in the function 
# feature_importance = get_xgboost_feature_importance(xgb_model, feature_names)
# print("Ranked Feature Importance:")
# print(feature_importance)

## LSTM

In [57]:

model_file_path = 'models/LSTM1.pth.tar'
feature_names = ["Open", "High", "Low", "Close", "Volume"]

feature_importance = get_lstm_feature_importance_from_file(model_file_path, feature_names)
print("Ranked Feature Importance:")
print(feature_importance)

MemoryError: Unable to allocate 256. GiB for an array with shape (262144, 262144) and data type float32