In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from google.colab import files
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Ask for the two CSV files through Colab's upload widget, one prompt per file.
print("First CSV file (phase1Model.csv):")
uploaded_1 = files.upload()
print("Second {enhanced} CSV file (m76r.csv):")
uploaded_2 = files.upload()

# The first key of each upload result is handed to read_csv as the file path.
csv_name_1 = list(uploaded_1)[0]
df1 = pd.read_csv(csv_name_1)
csv_name_2 = list(uploaded_2)[0]
df2 = pd.read_csv(csv_name_2)

print("both dataframes captured. Let's check these datatypes.")

First CSV file (phase1Model.csv):


Saving phase1Model.csv to phase1Model.csv
Second {enhanced} CSV file (m76r.csv):


Saving m76r.csv to m76r.csv
both dataframes captured. Let's check these datatypes.


In [None]:
# Column names of the first dataframe as a plain Python list.
df1.columns.to_list()

['TOTBKCR',
 'DFF',
 'OEHRENWBSHNO',
 'CORESTICKM159SFRBATL',
 'PSAVERT',
 'CCSA',
 'STICKCPIM157SFRBATL',
 'GDP',
 'UNRATE',
 'CPALTT01USM657N',
 'UNEMPLOY',
 'GDPC1',
 'PRICE_SP500',
 'CHANGE_SP500',
 'WM1NS',
 'CPIAUCSL',
 'REAINTRATREARAT10Y',
 'MEDCPIM158SFRBCLE',
 'WM2NS',
 'FPCPITOTLZGUSA',
 'USREC']

In [None]:
# Column names of the second (enhanced) dataframe as a plain Python list.
df2.columns.to_list()

['TOTBKCR',
 'DFF',
 'OEHRENWBSHNO',
 'CORESTICKM159SFRBATL',
 'PSAVERT',
 'CCSA',
 'STICKCPIM157SFRBATL',
 'GDP',
 'UNRATE',
 'CPALTT01USM657N',
 'UNEMPLOY',
 'GDPC1',
 'WM1NS',
 'CPIAUCSL',
 'FPCPITOTLZGUSA',
 'is_inverted',
 'CCSA_Rolling3',
 'CPIAUCSL_Rolling3',
 'GDP_Rolling3',
 'STICKCPIM157SFRBATL_Rolling3',
 'PSAVERT_Rolling3',
 'DFF_Rolling3',
 'TOTBKCR_Rolling3',
 'CORESTICKM159SFRBATL_Rolling3',
 'GDPC1_Rolling3',
 'UNEMPLOY_Rolling3',
 'OEHRENWBSHNO_Rolling3',
 'CPALTT01USM657N_Rolling3',
 'WM1NS_Rolling3',
 'UNRATE_Rolling3',
 'FPCPITOTLZGUSA_Rolling3',
 'CCSA_Rolling12',
 'CPIAUCSL_Rolling12',
 'GDP_Rolling12',
 'STICKCPIM157SFRBATL_Rolling12',
 'PSAVERT_Rolling12',
 'DFF_Rolling12',
 'TOTBKCR_Rolling12',
 'CORESTICKM159SFRBATL_Rolling12',
 'GDPC1_Rolling12',
 'UNEMPLOY_Rolling12',
 'OEHRENWBSHNO_Rolling12',
 'CPALTT01USM657N_Rolling12',
 'WM1NS_Rolling12',
 'UNRATE_Rolling12',
 'FPCPITOTLZGUSA_Rolling12',
 'USREC']

In [None]:
## Use this block to pretty up the data types
print("Data types for df1:")
print(df1.dtypes)

print("\nData types for df2:")
print(df2.dtypes)
## The only offender across the board is DATE in df2 (dtype object), so drop it.
## errors='ignore' keeps this cell re-runnable: once DATE is gone, a plain
## drop would raise KeyError on the second execution.
df2 = df2.drop(columns=['DATE'], errors='ignore')

## and now, ALL cells in these two dataframes contain either float64s or int64s.

Data types for df1:
TOTBKCR                 float64
DFF                     float64
OEHRENWBSHNO            float64
CORESTICKM159SFRBATL    float64
PSAVERT                 float64
CCSA                    float64
STICKCPIM157SFRBATL     float64
GDP                     float64
UNRATE                  float64
CPALTT01USM657N         float64
UNEMPLOY                float64
GDPC1                   float64
PRICE_SP500             float64
CHANGE_SP500            float64
WM1NS                   float64
CPIAUCSL                float64
REAINTRATREARAT10Y      float64
MEDCPIM158SFRBCLE       float64
WM2NS                   float64
FPCPITOTLZGUSA          float64
USREC                   float64
dtype: object

Data types for df2:
DATE                               object
TOTBKCR                           float64
DFF                               float64
OEHRENWBSHNO                      float64
CORESTICKM159SFRBATL              float64
PSAVERT                           float64
CCSA                 

In [None]:
# Every (sequence_length, batch_size) combination to try, sequence length varying slowest.
pairs = [[seq_len, batch] for seq_len in (6, 12, 18) for batch in (16, 32, 64)]

for elem in pairs:
  sequence_length, batch_size = elem

  # Settings shared by every run; sequence_length and batch_size come from `pairs`.
  # sequence_length is one of 6, 12, 18; batch_size is one of 16, 32, 64.
  hidden_size = 64
  num_epochs = 200
  output_size = 1  # Binary classification

  # Preprocess function
  def preprocess_dataframe(df, sequence_length):
    """Turn a dataframe into train/test DataLoaders of sliding-window sequences.

    The last column of ``df`` is taken as the label and every other column as
    a feature. Features are min-max scaled, cut into overlapping windows of
    ``sequence_length`` consecutive rows, and each window is paired with the
    label of the row that immediately follows it.

    Args:
      df: dataframe whose last column is the label.
      sequence_length: number of consecutive rows in each input window.

    Returns:
      ``(train_loader, test_loader, n_features)`` where ``n_features`` is the
      size of the last axis of the window tensor.

    Raises:
      ValueError: if ``df`` does not have more than ``sequence_length`` rows,
        so that no window with a following label can be formed.

    NOTE: the batch size is not a parameter. It is read from the variable
    ``batch_size`` in the enclosing scope at call time, so that variable must
    be set before calling.
    NOTE(review): the scaler is fit on all rows before the split, and the
    overlapping windows are split at random, so information from test windows
    can reach training. Consider a chronological split with the scaler fit on
    the training portion only.
    """
    # Separate features and labels
    label_col = df.columns[-1]
    X = df.drop(label_col, axis=1).values
    y = df[label_col].values

    if len(X) <= sequence_length:
        raise ValueError(
            f"Need more than {sequence_length} rows to build sequences, got {len(X)}"
        )

    # Scale features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Create sequences
    def create_sequences(X, y, seq_length):
        """Return (windows, next-row labels) as float32 tensors."""
        n_windows = len(X) - seq_length
        # Stack into one ndarray first: building a tensor from a Python list of
        # ndarrays is the slow path that PyTorch warns about.
        X_seq = np.stack([X[i:i + seq_length] for i in range(n_windows)])
        y_seq = np.asarray(y[seq_length:])
        return torch.tensor(X_seq, dtype=torch.float32), torch.tensor(y_seq, dtype=torch.float32)

    X_seq, y_seq = create_sequences(X_scaled, y, sequence_length)

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

    # Create DataLoaders; labels get a trailing axis so each target is a length-1 row.
    train_data = TensorDataset(X_train, y_train.unsqueeze(1))
    test_data = TensorDataset(X_test, y_test.unsqueeze(1))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, X_seq.shape[2]

  # Define RNN Model
  class RNNModel(nn.Module):
    """Plain RNN whose final hidden state feeds a single linear layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first input to the linear layer's output on the final hidden state."""
        # nn.RNN returns (per-step outputs, final hidden state); only the latter is used.
        final_hidden = self.rnn(x)[1]
        return self.fc(final_hidden[-1])

  # Define LSTM Model
  class LSTMModel(nn.Module):
    """LSTM whose final hidden state feeds a single linear layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first input to the linear layer's output on the final hidden state."""
        # nn.LSTM returns (per-step outputs, (hidden state, cell state)); the cell state is unused.
        final_hidden, _cell_state = self.lstm(x)[1]
        return self.fc(final_hidden[-1])

  # Training and evaluation function
  def train_and_evaluate_with_metrics(train_loader, test_loader, model_class, input_size, hidden_size, output_size, num_epochs):
    """Train a freshly constructed model and score it on the test loader.

    Args:
      train_loader: iterable of ``(X_batch, y_batch)`` used for training.
      test_loader: iterable of ``(X_batch, y_batch)`` used for scoring.
      model_class: callable building the model as
        ``model_class(input_size, hidden_size, output_size)``.
      input_size, hidden_size, output_size: forwarded to ``model_class``.
      num_epochs: number of full passes over ``train_loader``.

    Returns:
      ``(accuracy, precision, recall, f1)`` computed on the test loader with
      predictions thresholded at a sigmoid probability of 0.5.

    Both the model output and ``y_batch`` are squeezed on axis 1 before use,
    so each is expected to carry a size-1 second axis.
    """
    model = model_class(input_size, hidden_size, output_size)
    # BCEWithLogitsLoss applies the sigmoid itself, so the model emits raw logits.
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop (nothing switches the mode in between, so set it once).
    model.train()
    for _epoch in range(num_epochs):
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            logits = model(X_batch).squeeze(1)  # drop the size-1 axis to match the targets
            loss = criterion(logits, y_batch.squeeze(1))
            loss.backward()
            optimizer.step()

    # Evaluation loop
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            probabilities = torch.sigmoid(model(X_batch).squeeze(1))
            y_true.extend(y_batch.squeeze(1).numpy())
            y_pred.extend((probabilities.numpy() > 0.5).astype(int))  # Convert to binary predictions

    # Calculate metrics. zero_division=0 keeps the 0.0 score scikit-learn already
    # reports when a class is never predicted or never present, without emitting
    # an UndefinedMetricWarning for every such run.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return accuracy, precision, recall, f1

  # Train and score both architectures on each dataset for the current settings.
  datasets = [df1, df2]
  results = []
  for i, df in enumerate(datasets):
    print(f"Processing Dataset {i+1}...")
    train_loader, test_loader, input_size = preprocess_dataframe(df, sequence_length)

    # Both models are built and trained with the same loaders and sizes.
    shared_args = (input_size, hidden_size, output_size, num_epochs)
    rnn_metrics = train_and_evaluate_with_metrics(train_loader, test_loader, RNNModel, *shared_args)
    lstm_metrics = train_and_evaluate_with_metrics(train_loader, test_loader, LSTMModel, *shared_args)

    # Each metrics tuple is ordered (accuracy, precision, recall, f1).
    rnn_accuracy, rnn_precision, rnn_recall, rnn_f1 = rnn_metrics
    lstm_accuracy, lstm_precision, lstm_recall, lstm_f1 = lstm_metrics

    row = {
        'Dataset': f'Dataset_{i+1}',
        'RNN Accuracy': rnn_accuracy,
        'RNN Precision': rnn_precision,
        'RNN Recall': rnn_recall,
        'RNN F1': rnn_f1,
        'LSTM Accuracy': lstm_accuracy,
        'LSTM Precision': lstm_precision,
        'LSTM Recall': lstm_recall,
        'LSTM F1': lstm_f1,
    }
    results.append(row)

  # One table per (sequence_length, batch_size) combination.
  results_df = pd.DataFrame(results)
  print(f"SEQUENCE LENGTH = {sequence_length};\tBATCH SIZE = {batch_size}")
  print(results_df)
  print("\n\n")

Processing Dataset 1...


  return torch.tensor(X_seq, dtype=torch.float32), torch.tensor(y_seq, dtype=torch.float32)


Processing Dataset 2...
SEQUENCE LENGTH = 6;	BATCH SIZE = 16
     Dataset  RNN Accuracy  RNN Precision  RNN Recall    RNN F1  \
0  Dataset_1      0.911111       0.400000    0.666667  0.500000   
1  Dataset_2      0.971154       0.769231    1.000000  0.869565   

   LSTM Accuracy  LSTM Precision  LSTM Recall   LSTM F1  
0       0.955556        0.666667     0.666667  0.666667  
1       0.980769        0.833333     1.000000  0.909091  



Processing Dataset 1...
Processing Dataset 2...
SEQUENCE LENGTH = 6;	BATCH SIZE = 32
     Dataset  RNN Accuracy  RNN Precision  RNN Recall    RNN F1  \
0  Dataset_1      0.955556       0.666667    0.666667  0.666667   
1  Dataset_2      0.971154       0.888889    0.800000  0.842105   

   LSTM Accuracy  LSTM Precision  LSTM Recall   LSTM F1  
0       0.955556        0.750000          0.5  0.600000  
1       0.980769        0.833333          1.0  0.909091  



Processing Dataset 1...
Processing Dataset 2...
SEQUENCE LENGTH = 6;	BATCH SIZE = 64
     Datase

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing Dataset 2...
SEQUENCE LENGTH = 18;	BATCH SIZE = 64
     Dataset  RNN Accuracy  RNN Precision  RNN Recall  RNN F1  LSTM Accuracy  \
0  Dataset_1      0.863636           0.00        0.00    0.00       0.909091   
1  Dataset_2      0.941176           0.75        0.75    0.75       0.931373   

   LSTM Precision  LSTM Recall  LSTM F1  
0        0.750000         0.50     0.60  
1        0.692308         0.75     0.72  





In [None]:
##Feature Importance##

def permutation_importance(df, model_class, sequence_length, batch_size, hidden_size, num_epochs, seed=42):
    """Rank features by the F1 drop observed when each one is shuffled.

    A baseline model is trained on ``df``. Then, for every feature column, a
    copy of ``df`` with that column randomly permuted is used to train and
    score a new model. The importance of a feature is
    ``baseline_f1 - shuffled_f1``.

    Args:
      df: dataframe whose last column is the label.
      model_class: model constructor passed to ``train_and_evaluate_with_metrics``.
      sequence_length: window length passed to ``preprocess_dataframe``.
      batch_size: batch size used for the train and test loaders.
      hidden_size: hidden size passed to the model.
      num_epochs: number of training epochs per model.
      seed: seed for the column shuffles and for torch before each training
        run. With the same seed every run starts from the same torch RNG
        state, so score differences are not dominated by random
        initialisation. Pass ``None`` to leave the RNGs unseeded.

    Returns:
      List of ``(feature, importance)`` tuples sorted by importance, descending.

    NOTE: when ``seed`` is not ``None`` this reseeds torch's global RNG.
    NOTE(review): the model is retrained for every shuffled column rather than
    re-scored on shuffled held-out data, so this costs one full training per
    feature.
    """
    label_col = df.columns[-1]  # Assuming label column is the last
    features = [col for col in df.columns if col != label_col]
    importances = []
    rng = np.random.default_rng(seed)

    def _fit_and_score(frame):
        """Train a new model on ``frame`` and return its test-set F1 score."""
        train_loader, test_loader, input_size = preprocess_dataframe(frame, sequence_length)
        # preprocess_dataframe takes its batch size from an outer variable, not
        # from this function's argument; re-wrap the datasets so the requested
        # ``batch_size`` is the one actually used.
        train_loader = DataLoader(train_loader.dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_loader.dataset, batch_size=batch_size, shuffle=False)
        if seed is not None:
            torch.manual_seed(seed)
        metrics = train_and_evaluate_with_metrics(train_loader, test_loader, model_class, input_size, hidden_size, 1, num_epochs)
        return metrics[3]  # F1 score

    # Baseline performance
    print("Calculating baseline performance...")
    baseline_f1 = _fit_and_score(df)
    print(f"Baseline F1 Score: {baseline_f1}\n")

    # Permutation Loop
    for feature in features:
        print(f"Shuffling Feature: {feature}...")
        df_permuted = df.copy()
        df_permuted[feature] = rng.permutation(df_permuted[feature].values)  # Shuffle one feature

        # Preprocess and evaluate after shuffling
        shuffled_f1 = _fit_and_score(df_permuted)

        # Calculate importance as drop in F1 score
        importance = baseline_f1 - shuffled_f1
        importances.append((feature, importance))
        print(f"F1 Score Drop for {feature}: {importance}\n")

    # Sort features by importance
    importances.sort(key=lambda x: x[1], reverse=True)
    return importances

# Settings for the permutation-importance runs. These assignments overwrite the
# notebook-level variables of the same names.
sequence_length, batch_size = 18, 16
hidden_size, num_epochs = 64, 200

for i, df in enumerate((df1, df2)):
    # RNN ranking for this dataset
    print(f"\nPermutation Feature Importance for Dataset {i+1} using RNN:")
    rnn_importances = permutation_importance(
        df=df,
        model_class=RNNModel,
        sequence_length=sequence_length,
        batch_size=batch_size,
        hidden_size=hidden_size,
        num_epochs=num_epochs,
    )
    print("RNN Feature Importances (Descending):")
    for feature, importance in rnn_importances:
        print(f"{feature}: {importance:.4f}")

    # LSTM ranking for this dataset
    print(f"\nPermutation Feature Importance for Dataset {i+1} using LSTM:")
    lstm_importances = permutation_importance(
        df=df,
        model_class=LSTMModel,
        sequence_length=sequence_length,
        batch_size=batch_size,
        hidden_size=hidden_size,
        num_epochs=num_epochs,
    )
    print("LSTM Feature Importances (Descending):")
    for feature, importance in lstm_importances:
        print(f"{feature}: {importance:.4f}")

print("\nFeature importance analysis complete.")



Permutation Feature Importance for Dataset 1 using RNN:
Calculating baseline performance...
Baseline F1 Score: 0.6666666666666666

Shuffling Feature: TOTBKCR...
F1 Score Drop for TOTBKCR: 0.0

Shuffling Feature: DFF...
F1 Score Drop for DFF: 0.0

Shuffling Feature: OEHRENWBSHNO...
F1 Score Drop for OEHRENWBSHNO: 0.07843137254901955

Shuffling Feature: CORESTICKM159SFRBATL...
F1 Score Drop for CORESTICKM159SFRBATL: -0.07017543859649122

Shuffling Feature: PSAVERT...
F1 Score Drop for PSAVERT: -0.07017543859649122

Shuffling Feature: CCSA...
F1 Score Drop for CCSA: 0.07843137254901955

Shuffling Feature: STICKCPIM157SFRBATL...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


F1 Score Drop for STICKCPIM157SFRBATL: 0.6666666666666666

Shuffling Feature: GDP...
F1 Score Drop for GDP: -0.13333333333333341

Shuffling Feature: UNRATE...
F1 Score Drop for UNRATE: -0.1515151515151516

Shuffling Feature: CPALTT01USM657N...
F1 Score Drop for CPALTT01USM657N: -0.09523809523809523

Shuffling Feature: UNEMPLOY...
F1 Score Drop for UNEMPLOY: -0.2898550724637682

Shuffling Feature: GDPC1...
F1 Score Drop for GDPC1: 0.07843137254901955

Shuffling Feature: PRICE_SP500...
F1 Score Drop for PRICE_SP500: -0.2898550724637682

Shuffling Feature: CHANGE_SP500...
F1 Score Drop for CHANGE_SP500: 0.0

Shuffling Feature: WM1NS...
F1 Score Drop for WM1NS: -0.29333333333333333

Shuffling Feature: CPIAUCSL...
F1 Score Drop for CPIAUCSL: -0.2898550724637682

Shuffling Feature: REAINTRATREARAT10Y...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


F1 Score Drop for REAINTRATREARAT10Y: 0.6666666666666666

Shuffling Feature: MEDCPIM158SFRBCLE...
F1 Score Drop for MEDCPIM158SFRBCLE: -0.07017543859649122

Shuffling Feature: WM2NS...
F1 Score Drop for WM2NS: 0.196078431372549

Shuffling Feature: FPCPITOTLZGUSA...
F1 Score Drop for FPCPITOTLZGUSA: -0.19047619047619047

RNN Feature Importances (Descending):
STICKCPIM157SFRBATL: 0.6667
REAINTRATREARAT10Y: 0.6667
WM2NS: 0.1961
OEHRENWBSHNO: 0.0784
CCSA: 0.0784
GDPC1: 0.0784
TOTBKCR: 0.0000
DFF: 0.0000
CHANGE_SP500: 0.0000
CORESTICKM159SFRBATL: -0.0702
PSAVERT: -0.0702
MEDCPIM158SFRBCLE: -0.0702
CPALTT01USM657N: -0.0952
GDP: -0.1333
UNRATE: -0.1515
FPCPITOTLZGUSA: -0.1905
UNEMPLOY: -0.2899
PRICE_SP500: -0.2899
CPIAUCSL: -0.2899
WM1NS: -0.2933

Permutation Feature Importance for Dataset 1 using LSTM:
Calculating baseline performance...
Baseline F1 Score: 0.6666666666666666

Shuffling Feature: TOTBKCR...
F1 Score Drop for TOTBKCR: -0.2564102564102565

Shuffling Feature: DFF...
F1 Score Drop