### LSTM NN 

In this notebook, we implement an LSTM neural network for our classification task to see whether it can outperform our traditional ML models (Random Forest, XGBoost, etc.).
Our motivation for testing this type of model comes from its ability to capture long-term dependencies in time series data, which is a key feature of our dataset. Additionally, many of the papers we read on the topic aimed to assess whether LSTM models perform better than traditional ML models, and they reached opposing conclusions. To settle the question for our own data, we conduct the experiment ourselves. 

"""

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.callbacks import EarlyStopping

# Device configuration.
# NOTE: set_visible_devices([], 'GPU') HIDES every GPU from TensorFlow, so all
# ops in this notebook run on the CPU. It does not enable Metal/MPS acceleration;
# delete the call below if GPU execution is wanted.
try:
    tf.config.set_visible_devices([], 'GPU')
except RuntimeError as err:
    # Raised when the cell is re-run after TensorFlow has already initialised
    # its devices; the visibility chosen on the first run stays in effect.
    print("Device visibility unchanged:", err)

# Physical devices are still listed even when hidden, so report both counts.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("Num GPUs Visible: ", len(tf.config.get_visible_devices('GPU')))

Num GPUs Available:  1


### Import and prepare the data

In [4]:
# import our data
import pickle

# Load the pre-split train/test arrays from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load a file that was produced by this project's own preprocessing step.
with open("train_test_data.pkl", "rb") as f:
    X_train_final, X_test_final, y_train_encoded, y_test = pickle.load(f)

# Fail fast if features and labels are misaligned, instead of surfacing the
# problem later as an obscure shape error during resampling or training.
assert X_train_final.shape[0] == y_train_encoded.shape[0], "train features/labels row mismatch"
assert X_test_final.shape[0] == y_test.shape[0], "test features/labels row mismatch"
assert X_train_final.shape[1] == X_test_final.shape[1], "train/test feature count mismatch"

# Check the shapes of the imported data
print("X_train_final shape:", X_train_final.shape)
print("X_test_final shape:", X_test_final.shape)
print("y_train_encoded shape:", y_train_encoded.shape)
print("y_test shape:", y_test.shape)

X_train_final shape: (73271, 68)
X_test_final shape: (18318, 68)
y_train_encoded shape: (73271,)
y_test shape: (18318,)


In [6]:
import numpy as np

# Count how many training samples fall into each encoded class.
unique_classes, class_counts = np.unique(y_train_encoded, return_counts=True)

# Map every class label to its number of occurrences.
class_distribution = {
    label: n_occurrences
    for label, n_occurrences in zip(unique_classes, class_counts)
}

print("Class Distribution in y_train_encoded:", class_distribution)

Class Distribution in y_train_encoded: {0: 8145, 1: 25621, 2: 39505}


In [7]:
# Optional step: balance the classes with SMOTE-style interpolation. Training does
# not require resampling, but it may perform better; we compare against the
# non-resampled data later.
# NOTE(review): the validation split is carved out of the balanced data further
# down, so it will contain synthetic points interpolated from training rows.
# Splitting before oversampling would give a cleaner validation estimate.

import numpy as np
from sklearn.neighbors import NearestNeighbors
from collections import Counter
import pandas as pd


def _interpolate_minority(X_minority, n_new, k_neighbors, rng):
    """Create ``n_new`` synthetic rows by interpolating between minority samples.

    Each synthetic row lies on the segment between a randomly chosen minority
    sample and one of its ``k_neighbors`` nearest minority neighbours.

    Parameters
    ----------
    X_minority : 2-D array of the samples belonging to one class.
    n_new : number of synthetic rows to generate.
    k_neighbors : number of true neighbours to choose from (capped at
        ``len(X_minority) - 1``).
    rng : ``numpy.random.Generator`` supplying all randomness.

    Returns
    -------
    2-D array with ``n_new`` rows and the same number of columns as ``X_minority``.
    """
    n_minority = X_minority.shape[0]
    base_idx = rng.integers(0, n_minority, size=n_new)

    # A class with a single sample has no neighbour to interpolate towards;
    # fall back to plain duplication rather than failing.
    k_eff = min(k_neighbors, n_minority - 1)
    if k_eff < 1:
        return X_minority[base_idx]

    # Query k_eff + 1 neighbours: when querying the fitted points themselves,
    # column 0 is normally the point itself, so columns 1..k_eff are the real
    # neighbours. (The original fitted k_neighbors and therefore only ever
    # sampled from k_neighbors - 1 neighbours.)
    nbrs = NearestNeighbors(n_neighbors=k_eff + 1).fit(X_minority)
    neighbor_table = nbrs.kneighbors(X_minority, return_distance=False)

    neighbor_col = rng.integers(1, k_eff + 1, size=n_new)
    neighbor_idx = neighbor_table[base_idx, neighbor_col]

    # One interpolation factor in [0, 1) per synthetic row, broadcast over features.
    gaps = rng.random((n_new, 1))
    base_points = X_minority[base_idx]
    return base_points + gaps * (X_minority[neighbor_idx] - base_points)


# Seeded generator so the balanced dataset is identical on every Run All.
OVERSAMPLING_SEED = 42
oversampling_rng = np.random.default_rng(OVERSAMPLING_SEED)

# Step 1: Define the dataset (copies, so the loaded arrays stay untouched)
X_train = X_train_final.copy()
y_train = y_train_encoded.copy()

print("Original class distribution:")
print(Counter(y_train))  # Check initial distribution

# Step 2: Identify classes and their counts
classes, counts = np.unique(y_train, return_counts=True)
max_count = counts.max()  # Count of the majority class

# Step 3: Oversample every class that is smaller than the majority class
k_neighbors = 5  # Number of neighbours available for interpolation
X_synthetic = []
y_synthetic = []
for cls, count in zip(classes, counts):
    n_samples_to_generate = int(max_count - count)
    if n_samples_to_generate <= 0:
        continue  # Majority class: nothing to add

    X_minority = X_train[y_train == cls]
    X_synthetic.append(
        _interpolate_minority(X_minority, n_samples_to_generate, k_neighbors, oversampling_rng)
    )
    y_synthetic.append(np.full(n_samples_to_generate, cls))

# Step 4: Combine synthetic samples with the original dataset.
# The empty fallbacks keep the label dtype so integer labels are never upcast to float.
X_synthetic = np.vstack(X_synthetic) if X_synthetic else np.empty((0, X_train.shape[1]))
y_synthetic = np.hstack(y_synthetic) if y_synthetic else np.empty((0,), dtype=y_train.dtype)

X_train_balanced = np.vstack([X_train, X_synthetic])
y_train_balanced = np.hstack([y_train, y_synthetic])

# Step 5: Verify the new class distribution
assert X_train_balanced.shape[0] == y_train_balanced.shape[0], "features/labels row mismatch"
assert len(set(Counter(y_train_balanced).values())) == 1, "classes are not balanced"

print("\nBalanced class distribution:")
print(Counter(y_train_balanced))

# Step 6: Output shapes
print("\nBalanced dataset shapes:")
print(f"Features shape: {X_train_balanced.shape}")
print(f"Target shape: {y_train_balanced.shape}")

Original class distribution:
Counter({2: 39505, 1: 25621, 0: 8145})

Balanced class distribution:
Counter({2: 39505, 0: 39505, 1: 39505})

Balanced dataset shapes:
Features shape: (118515, 68)
Target shape: (118515,)


In [8]:
from sklearn.utils import shuffle

# Apply one shared, seeded permutation to features and labels so that every row
# stays paired with its label (the synthetic rows were appended at the end).
X_train_balanced, y_train_balanced = shuffle(
    X_train_balanced,
    y_train_balanced,
    random_state=42,
)

# Shuffling must not change the shapes; print them as a quick check.
print("Shuffled dataset shapes:")
print(f"Features shape: {X_train_balanced.shape}")
print(f"Target shape: {y_train_balanced.shape}")



Shuffled dataset shapes:
Features shape: (118515, 68)
Target shape: (118515,)


In [None]:
# Verify the shapes before reshaping
print("Original shapes:")
print(f"Features shape: {X_train_balanced.shape}")
print(f"Target shape: {y_train_balanced.shape}")

# LSTM layers expect 3-D input (samples, timesteps, features); each row is
# treated as a sequence with a single timestep.
n_samples, n_features = X_train_balanced.shape
X_train_lstm = X_train_balanced.reshape(n_samples, 1, n_features)
y_train_lstm = y_train_balanced  # Target shape doesn't need to change

# `X` and `y` are the names the model-definition and training cells consume.
# The model reads y.shape[1] and is compiled with categorical_crossentropy, so
# the integer labels are one-hot encoded here (labels are assumed to be encoded
# as 0..K-1, as shown by the class distribution printed earlier).
num_classes = int(y_train_lstm.max()) + 1
X = X_train_lstm
y = np.eye(num_classes, dtype="float32")[y_train_lstm.astype(int)]

# No validation set is reshaped here: it does not exist yet. It is split from
# `X` in the training cell and therefore already has the 3-D layout.
assert X.ndim == 3 and y.shape == (n_samples, num_classes)

# Verify the shapes after reshaping
print("Reshaped dataset shapes:")
print(f"LSTM Features shape: {X_train_lstm.shape}")
print(f"Target shape: {y_train_lstm.shape}")

### LSTM model


In [None]:
# Define LSTM model
# Expects `X` to be the 3-D LSTM input (samples, timesteps, features) and `y` the
# 2-D one-hot target (samples, classes): the code below reads X.shape[1],
# X.shape[2] and y.shape[1], and the loss is categorical_crossentropy.

# Seed Python, NumPy and TensorFlow so weight initialisation and dropout masks
# (and therefore the training curve) are reproducible across Run All.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Masking skips any timestep whose features are all equal to mask_value.
    Masking(mask_value=-1, input_shape=(X.shape[1], X.shape[2])),
    LSTM(128, return_sequences=False),  # Only the final hidden state is passed on
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y.shape[1], activation='softmax')  # One probability per class
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # Requires one-hot encoded targets
    metrics=['accuracy']
)

model.summary()

### Training

In [None]:
from sklearn.model_selection import train_test_split

# Stop once the validation loss has not improved for 5 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor='val_loss',
)

# Hold out 20% of the data for validation (fixed seed for a repeatable split).
# NOTE(review): this split runs after the oversampling step, so the validation
# part also contains synthetic samples. Confirm that this is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit for at most 50 epochs in mini-batches of 128 samples.
training_options = {
    'validation_data': (X_val, y_val),
    'epochs': 50,
    'batch_size': 128,
    'callbacks': [early_stopping],
}
history = model.fit(X_train, y_train, **training_options)

### Evaluation



In [None]:
# Score the trained model on the held-out validation split; evaluate() returns
# the loss followed by the compiled metrics (accuracy only, here).
val_loss, val_accuracy = model.evaluate(x=X_val, y=y_val)

print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

In [None]:
from tensorflow.keras.models import load_model

# Persist the trained model to disk (HDF5 file, as selected by the .h5 suffix) ...
model.save(filepath='lstm_readmission_model.h5')

# ... and read it straight back to confirm that the saved file is loadable.
loaded_model = load_model(filepath='lstm_readmission_model.h5')