# Bidirectional Long Short-Term Memory (BiLSTM)

A Bidirectional Long Short-Term Memory (BiLSTM) network is an extension of the standard LSTM network that improves the ability to learn from sequential data by processing it in both the forward and the backward direction. In a BiLSTM, two LSTM networks are used:

- Forward LSTM: Processes the sequence in the normal order (from start to end).
- Backward LSTM: Processes the sequence in the reverse order (from end to start).

**How It Works**
1. Data Flow:
    - Forward Pass: The forward LSTM reads the sequence from the beginning to the end, capturing the dependencies in that direction.
    - Backward Pass: The backward LSTM reads the sequence from the end to the beginning, capturing the dependencies in the reverse direction.
2. Concatenation: The outputs of the forward and backward LSTMs are usually concatenated or combined in some way. This combined representation contains information from both directions, allowing the model to understand context from both past and future relative to each point in the sequence.



Further documentation:
- [Wikipedia - LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory)
- [PyTorch - LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)


# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Device: prefer the GPU when CUDA is usable, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Model
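In this application we treat each 28x28 image from the MNIST dataset as a sequence of 28 time steps with 28 features each, so that it can be fed to an RNN. Of course, normally we wouldn't use an RNN on images; there are far better architectures for image-related tasks. Note that the `nn.LSTM` layers below run in a single direction unless `bidirectional=True` is passed to them.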

In [21]:
# Hyperparameters
# -- data layout: every 28x28 image is read as 28 time steps of 28 features
sequence_length = 28
input_size = 28
num_classes = 10
# -- LSTM architecture
hidden_size = 256
num_layers = 2
# -- optimization
batch_size = 64
learning_rate = 1e-3
num_epochs = 2
# -- model choice: 1 selects RNN (all hidden states), any other value RNN_last
use_all_hidden_layers = 0
    
# Create the model - Using all hidden states
class RNN(nn.Module):
    """LSTM classifier that feeds the hidden states of every time step to the output layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in each LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of scores produced per sample.
        seq_len: Number of time steps in every input sequence; it fixes the
            input width of the linear layer. When omitted, the module-level
            ``sequence_length`` is used.
        bidirectional: When True the LSTM also reads each sequence from end
            to start and the states of both directions reach the linear
            layer. Defaults to False (a single, forward direction).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes,
                 *, seq_len=None, bidirectional=False):
        super().__init__()
        if seq_len is None:
            # Fall back to the notebook-wide setting when no length is given
            seq_len = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=bidirectional)
        # Every time step contributes hidden_size features per direction
        self.fc = nn.Linear(hidden_size * self.num_directions * seq_len, num_classes)

    def forward(self, x):
        """Return scores of shape (N, num_classes) for x of shape (N, seq_len, input_size)."""
        # Zero initial states, created next to the input so the model works
        # on whichever device (and dtype) the batch actually lives on
        state_shape = (self.num_layers * self.num_directions, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        # forward prop
        out, _ = self.lstm(x, (h0, c0))
        # Flatten (N, seq_len, features) into (N, seq_len * features)
        out = out.reshape(out.shape[0], -1)
        return self.fc(out)
    
# Create the model - Using only the last hidden state
class RNN_last(nn.Module):
    """LSTM classifier that feeds only the final hidden state to the output layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in each LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of scores produced per sample.
        bidirectional: When True the LSTM also reads each sequence from end
            to start, and the final state of each direction is passed to the
            linear layer. Defaults to False (a single, forward direction).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes,
                 *, bidirectional=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size * self.num_directions, num_classes)

    def forward(self, x):
        """Return scores of shape (N, num_classes) for x of shape (N, seq_len, input_size)."""
        # Zero initial states, created next to the input so the model works
        # on whichever device (and dtype) the batch actually lives on
        state_shape = (self.num_layers * self.num_directions, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        # forward prop
        out, _ = self.lstm(x, (h0, c0))
        # The forward direction has seen the whole sequence at the last time step
        last = out[:, -1, :self.hidden_size]
        if self.num_directions == 2:
            # The backward direction runs end -> start, so it finishes at step 0
            last = torch.cat((last, out[:, 0, self.hidden_size:]), dim=1)
        return self.fc(last)

We may prefer to use only the last hidden state, because it already carries information about all the previous hidden states. The result will probably be somewhat worse, but it could be more efficient compute-wise.

# Load Data
- MNIST: 28x28 pixels
- When we load the dataset, the shape will be (batch_size, 1, 28, 28)

In [22]:
# Both splits share one transform and one download directory
to_tensor = transforms.ToTensor()
train_dataset = datasets.MNIST(root='dataset/', train=True, transform=to_tensor, download=True)
test_dataset = datasets.MNIST(root='dataset/', train=False, transform=to_tensor, download=True)

# Shuffle the training batches every epoch
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Accuracy does not depend on sample order, so the evaluation split is read
# in its fixed order, which keeps the test batches reproducible
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Training

In [23]:
# Initialize the network selected by use_all_hidden_layers
if use_all_hidden_layers == 1:
    model_cls, banner = RNN, "Using all hidden layers"
else:
    model_cls, banner = RNN_last, "Using only the last hidden layers"
model = model_cls(input_size, hidden_size, num_layers, num_classes).to(device)
print(banner)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the network
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Move the batch to the device; squeeze(1) removes the size-1
        # dimension: (N, 1, 28, 28) -> (N, 28, 28)
        data = data.to(device=device).squeeze(1)
        targets = targets.to(device=device)

        # Clear the gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass and loss
        scores = model(data)
        loss = criterion(scores, targets)

        # Backward pass, then one Adam update
        loss.backward()
        optimizer.step()

Using only the last hidden layers


# Performance

In [24]:
# Check accuracy on training and test sets
def check_accuracy(loader, model):
    """Print how many samples from ``loader`` the model classifies correctly.

    Args:
        loader: Iterable of ``(inputs, labels)`` batches whose ``dataset``
            has a ``train`` attribute (used only to label the printout).
        model: Module mapping a batch of inputs to per-class scores.

    The model is switched to eval mode and run without gradient tracking;
    whatever train/eval mode it was in before the call is restored
    afterwards, even if evaluation raises.
    """
    if loader.dataset.train:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on test data")

    # Evaluate on the device the model lives on; a model without parameters
    # gives no hint, so its batches are left where they are
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                if target_device is not None:
                    x = x.to(device=target_device)
                    y = y.to(device=target_device)
                x = x.squeeze(1)  # drop the size-1 dimension at index 1

                scores = model(x)
                # scores is (N, num_classes); the index of the row maximum is the predicted class
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        # Put the model back in the mode the caller had it in
        model.train(was_training)

    num_correct = int(num_correct)
    # An empty loader reports 0.00 instead of dividing by zero
    accuracy = num_correct / num_samples * 100 if num_samples else 0.0
    print(f'got {num_correct} / {num_samples} with accuracy {accuracy:.2f}')

In [25]:
# Report accuracy on the training split first, then on the test split
for eval_loader in (train_loader, test_loader):
    check_accuracy(eval_loader, model)

Checking accuracy on training data
got 58613 / 60000 with accuracy 97.69
Checking accuracy on test data
got 9767 / 10000 with accuracy 97.67
