### Simple RNN

In [2]:
! pip install torch

Collecting torch
  Downloading torch-2.4.1-cp38-cp38-win_amd64.whl (199.4 MB)
Installing collected packages: torch
Successfully installed torch-2.4.1


Here:

    We define sequences, a tensor containing sequences of numbers, and targets, which are the next numbers in each sequence. This is a basic sequence prediction task.

    sequences.unsqueeze(-1) adds a feature dimension. This makes each number in the sequence a feature vector, even though it’s just a scalar (dimension = 1).

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

# Sample input: Sequence of numbers
# Task: Predict the next number in the sequence

# Let's use a synthetic dataset: sequences of numbers where the task is to predict the next number
sequences = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6]], dtype=torch.float32)
targets = torch.tensor([5, 6, 7], dtype=torch.float32)


print("Input shape : ",sequences.shape) 

#many sequence models, such as RNNs, LSTMs, or transformers, expect the input to have a 3D shape: 
#(batch_size, sequence_length, input_features). The input_features dimension is required because 
#these models often expect each token in the sequence to have a feature vector.

sequences = sequences.unsqueeze(-1)  # Add a feature dimension for input
print("Input shape : ",sequences.shape) 

Input shape :  torch.Size([3, 4])
Input shape :  torch.Size([3, 4, 1])


In [11]:
# __init__ function:
# self.rnn = nn.RNN(input_size, hidden_size, batch_first=True) creates an RNN layer. 
# The input_size is the number of features (1 in this case, since each number in the sequence is a scalar), 
# the hidden_size defines the size of the hidden state. 
# batch_first=True ensures that the input shape is (batch_size, sequence_length, input_size), making it more intuitive to work with.
# self.fc = nn.Linear(hidden_size, output_size) defines a fully connected (linear) layer that takes the RNN's hidden state output and maps it to the desired output (here, predicting the next number).

# forward function:
# The input x is passed through the RNN layer, which returns two values: out (the output at each time step) and hidden (the hidden state at the final time step). For simplicity, we don’t use the hidden state here, but it can be useful for more advanced tasks.
# out[:, -1, :] extracts the output from the last time step (we are interested in the prediction for the last token in the sequence).
# Finally, this output is passed through the fully connected layer (self.fc) to get the predicted next number.

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out, hidden = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Take the last time step's output
        return out

    

# Model instantiation

# input_size=1 because each element in the sequence is a scalar.
# hidden_size=10 is the size of the hidden state, a tunable parameter that controls the capacity of the model.
# output_size=1 because the model predicts a single number (the next number in the sequence).
rnn_model = RNNModel(input_size=1, hidden_size=10, output_size=1)

# sets up the optimizer (Adam), which updates the model's parameters during training to minimize the loss.
optimizer = optim.Adam(rnn_model.parameters(), lr=0.01)

#  We use Mean Squared Error (MSE) since this is a regression task (predicting a number).    
criterion = nn.MSELoss()

# Training loop with prediction display
for epoch in range(200):
    rnn_model.train()  # Set model to training mode
    optimizer.zero_grad()  # Clear gradients
    
    # Forward pass
    outputs = rnn_model(sequences)
    
    # Calculate loss
    loss = criterion(outputs, targets.unsqueeze(1))
    
    # Backpropagation and optimization
    loss.backward()
    optimizer.step()
    
    # Every 20 epochs, print loss and predictions
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")
        
        # Switch model to evaluation mode for prediction
        rnn_model.eval()
        with torch.no_grad():  # Disable gradient calculations for predictions
            predictions = rnn_model(sequences)
            print(f"Predictions: {predictions.squeeze().numpy()}")
            print(f"True Values: {targets.numpy()}")


Epoch 0, Loss: 37.076839447021484
Predictions: [0.1150898  0.11644439 0.11835956]
True Values: [5. 6. 7.]
Epoch 20, Loss: 13.387279510498047
Predictions: [2.5198996 2.5413985 2.5526867]
True Values: [5. 6. 7.]
Epoch 40, Loss: 3.3877270221710205
Predictions: [4.424144  4.4299173 4.4332113]
True Values: [5. 6. 7.]
Epoch 60, Loss: 0.8467133045196533
Predictions: [5.6048865 5.6096697 5.6123714]
True Values: [5. 6. 7.]
Epoch 80, Loss: 0.6623954176902771
Predictions: [6.0319357 6.036786  6.0395117]
True Values: [5. 6. 7.]
Epoch 100, Loss: 0.665977418422699
Predictions: [6.0610423 6.066118  6.0689626]
True Values: [5. 6. 7.]
Epoch 120, Loss: 0.6614015698432922
Predictions: [6.0105906 6.01597   6.0189743]
True Values: [5. 6. 7.]
Epoch 140, Loss: 0.6607268452644348
Predictions: [5.9903936 5.996175  5.999388 ]
True Values: [5. 6. 7.]
Epoch 160, Loss: 0.6601809859275818
Predictions: [5.991784  5.9981093 6.0015993]
True Values: [5. 6. 7.]
Epoch 180, Loss: 0.6593821048736572
Predictions: [5.9939537

### LSTM

In [12]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out, (hidden, cell) = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Last time step
        return out

# Model instantiation
lstm_model = LSTMModel(input_size=1, hidden_size=10, output_size=1)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.01)
criterion = nn.MSELoss()

# Training loop with prediction display
for epoch in range(200):
    lstm_model.train()
    optimizer.zero_grad()
    
    # Forward pass
    outputs = lstm_model(sequences)
    loss = criterion(outputs, targets.unsqueeze(1))
    
    # Backpropagation and optimization
    loss.backward()
    optimizer.step()
    
    # Every 20 epochs, print loss and predictions
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")
        
        # Prediction step
        lstm_model.eval()
        with torch.no_grad():
            predictions = lstm_model(sequences)
            print(f"Predictions: {predictions.squeeze().numpy()}")
            print(f"True Values: {targets.numpy()}")


Epoch 0, Loss: 37.32528305053711
Predictions: [0.06150491 0.05338822 0.05241326]
True Values: [5. 6. 7.]
Epoch 20, Loss: 15.670123100280762
Predictions: [2.1711187 2.2454102 2.2832065]
True Values: [5. 6. 7.]
Epoch 40, Loss: 2.666585922241211
Predictions: [4.686811  4.7192626 4.7290587]
True Values: [5. 6. 7.]
Epoch 60, Loss: 0.6852652430534363
Predictions: [6.2105045 6.233781  6.2452507]
True Values: [5. 6. 7.]
Epoch 80, Loss: 0.6673634052276611
Predictions: [6.115544  6.137893  6.1488557]
True Values: [5. 6. 7.]
Epoch 100, Loss: 0.6457936763763428
Predictions: [5.9306   5.95434  5.965931]
True Values: [5. 6. 7.]
Epoch 120, Loss: 0.6410209536552429
Predictions: [5.9748173 6.0011587 6.013957 ]
True Values: [5. 6. 7.]
Epoch 140, Loss: 0.6374949812889099
Predictions: [5.982035  6.0121717 6.026728 ]
True Values: [5. 6. 7.]
Epoch 160, Loss: 0.6318579316139221
Predictions: [5.9666634 6.002833  6.0201893]
True Values: [5. 6. 7.]
Epoch 180, Loss: 0.6214486956596375
Predictions: [5.956663  6.0

### BiLSTM

In [13]:
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BiLSTMModel, self).__init__()
        self.bilstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # BiLSTM has 2x hidden size
    
    def forward(self, x):
        out, _ = self.bilstm(x)
        out = self.fc(out[:, -1, :])  # Last time step
        return out

# Model instantiation
bilstm_model = BiLSTMModel(input_size=1, hidden_size=10, output_size=1)
optimizer = optim.Adam(bilstm_model.parameters(), lr=0.01)
criterion = nn.MSELoss()

# Training loop with prediction display
for epoch in range(200):
    bilstm_model.train()
    optimizer.zero_grad()
    
    # Forward pass
    outputs = bilstm_model(sequences)
    loss = criterion(outputs, targets.unsqueeze(1))
    
    # Backpropagation and optimization
    loss.backward()
    optimizer.step()
    
    # Every 20 epochs, print loss and predictions
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")
        
        # Prediction step
        bilstm_model.eval()
        with torch.no_grad():
            predictions = bilstm_model(sequences)
            print(f"Predictions: {predictions.squeeze().numpy()}")
            print(f"True Values: {targets.numpy()}")


Epoch 0, Loss: 36.85272216796875
Predictions: [0.07471006 0.08703961 0.097192  ]
True Values: [5. 6. 7.]
Epoch 20, Loss: 11.1943998336792
Predictions: [2.8022923 2.9271617 3.0100799]
True Values: [5. 6. 7.]
Epoch 40, Loss: 0.4933995306491852
Predictions: [5.997852  6.166354  6.2835183]
True Values: [5. 6. 7.]
Epoch 60, Loss: 0.577017068862915
Predictions: [6.1009264 6.2671776 6.3828936]
True Values: [5. 6. 7.]
Epoch 80, Loss: 0.505304217338562
Predictions: [5.7356434 5.9035044 6.0209093]
True Values: [5. 6. 7.]
Epoch 100, Loss: 0.4696296751499176
Predictions: [5.87276   6.0650563 6.200328 ]
True Values: [5. 6. 7.]
Epoch 120, Loss: 0.4311773478984833
Predictions: [5.7507224 5.98088   6.1493607]
True Values: [5. 6. 7.]
Epoch 140, Loss: 0.35201308131217957
Predictions: [5.66101   5.9833117 6.2254677]
True Values: [5. 6. 7.]
Epoch 160, Loss: 0.21636372804641724
Predictions: [5.507787  6.003061  6.3923345]
True Values: [5. 6. 7.]
Epoch 180, Loss: 0.08031509071588516
Predictions: [5.267324  

### Encoder-Decoder (LSTM)

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim

# Sample input: Sequence of numbers
# Task: Predict the next number in the sequence
sequences = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6]], dtype=torch.float32)
targets = torch.tensor([5, 6, 7], dtype=torch.float32)
sequences = sequences.unsqueeze(-1)  # Add a feature dimension for input

# Encoder LSTM
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
    
    def forward(self, x):
        outputs, (hidden, cell) = self.lstm(x)
        return hidden, cell  # Return the final hidden and cell states

# Decoder LSTM
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden, cell):
        outputs, (hidden, cell) = self.lstm(x, (hidden, cell))
        prediction = self.fc(outputs[:, -1, :])  # Take the output of the last time step
        return prediction, hidden, cell

# Hyperparameters
input_size = 1  # Since the input is just a single number per step
hidden_size = 10  # Size of the hidden state
output_size = 1   # Single value output
learning_rate = 0.01

# Instantiate models
encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
decoder = Decoder(hidden_size=hidden_size, output_size=output_size)

# Optimizer and loss function
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)
criterion = nn.MSELoss()

# Training loop
for epoch in range(200):
    encoder.train()
    decoder.train()
    optimizer.zero_grad()
    
    # Encoder forward pass
    encoder_hidden, encoder_cell = encoder(sequences)
    
    # Decoder initial input (Use the hidden state as input to the decoder)
    # We'll use a zero tensor for the first input to the decoder (like a <start> token)
    decoder_input = torch.zeros((sequences.size(0), 1, hidden_size))  # Shape: (batch_size, 1, hidden_size)
    
    # Decoder forward pass (start with the hidden and cell states from the encoder)
    decoder_output, _, _ = decoder(decoder_input, encoder_hidden, encoder_cell)
    
    # Compute loss
    loss = criterion(decoder_output, targets.unsqueeze(1))
    
    # Backpropagation
    loss.backward()
    optimizer.step()
    
    # Every 20 epochs, print loss and predictions
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")
        
        # Evaluation mode for prediction
        encoder.eval()
        decoder.eval()
        with torch.no_grad():
            encoder_hidden, encoder_cell = encoder(sequences)
            decoder_input = torch.zeros((sequences.size(0), 1, hidden_size))  # Initial decoder input
            decoder_output, _, _ = decoder(decoder_input, encoder_hidden, encoder_cell)
            print(f"Predictions: {decoder_output.squeeze().numpy()}")
            print(f"True Values: {targets.numpy()}")


Epoch 0, Loss: 35.594730377197266
Predictions: [0.1967281  0.17286184 0.14373879]
True Values: [5. 6. 7.]
Epoch 20, Loss: 14.406082153320312
Predictions: [2.4157283 2.444249  2.4600184]
True Values: [5. 6. 7.]
Epoch 40, Loss: 1.5141925811767578
Predictions: [5.168911  5.1777763 5.1827106]
True Values: [5. 6. 7.]
Epoch 60, Loss: 0.7221739292144775
Predictions: [6.2545414 6.2597113 6.262142 ]
True Values: [5. 6. 7.]
Epoch 80, Loss: 0.6752507090568542
Predictions: [6.097909  6.1026077 6.104924 ]
True Values: [5. 6. 7.]
Epoch 100, Loss: 0.6631235480308533
Predictions: [5.9584937 5.9633665 5.9658446]
True Values: [5. 6. 7.]
Epoch 120, Loss: 0.6614049077033997
Predictions: [5.9885497 5.9937882 5.996615 ]
True Values: [5. 6. 7.]
Epoch 140, Loss: 0.6608798503875732
Predictions: [6.0018063 6.007462  6.0106163]
True Values: [5. 6. 7.]
Epoch 160, Loss: 0.6602396368980408
Predictions: [5.994605  6.0007887 6.004321 ]
True Values: [5. 6. 7.]
Epoch 180, Loss: 0.6594054698944092
Predictions: [5.994455

### Encoder-Decoder with Attention

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim

# Sample input: Sequence of numbers
# Task: Predict the next number in the sequence
sequences = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6]], dtype=torch.float32)
targets = torch.tensor([5, 6, 7], dtype=torch.float32)
sequences = sequences.unsqueeze(-1)  # Add a feature dimension for input

# Encoder LSTM
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
    
    def forward(self, x):
        encoder_outputs, (hidden, cell) = self.lstm(x)
        return encoder_outputs, hidden, cell  # Return all encoder outputs, hidden and cell states

# Attention Mechanism
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)  # Linear layer for attention weights
        self.v = nn.Parameter(torch.rand(hidden_size))  # Context vector v
    
    def forward(self, hidden, encoder_outputs):
        # Repeat the hidden state across the sequence length to compute attention scores
        hidden = hidden[-1].unsqueeze(1).repeat(1, encoder_outputs.size(1), 1)  # Shape: (batch_size, seq_len, hidden_size)
        
        # Calculate energy scores (similarity between hidden state and encoder outputs)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # Shape: (batch_size, seq_len, hidden_size)
        
        # Compute attention scores
        attention = torch.sum(self.v * energy, dim=2)  # Shape: (batch_size, seq_len)
        attention_weights = torch.softmax(attention, dim=1)  # Normalize with softmax
        
        # Compute context vector (weighted sum of encoder outputs)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)  # Shape: (batch_size, 1, hidden_size)
        return context, attention_weights

# Decoder LSTM with Attention
class DecoderWithAttention(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderWithAttention, self).__init__()
        self.lstm = nn.LSTM(hidden_size * 2, hidden_size, batch_first=True)  # Input is concatenation of context + decoder input
        self.attention = Attention(hidden_size)  # Attention mechanism
        self.fc = nn.Linear(hidden_size, output_size)  # Fully connected layer for predictions
    
    def forward(self, x, hidden, cell, encoder_outputs):
        # Calculate the context vector using the attention mechanism
        context, attention_weights = self.attention(hidden, encoder_outputs)
        
        # Concatenate the context vector and the input to the decoder
        lstm_input = torch.cat((x, context), dim=2)  # Shape: (batch_size, 1, hidden_size * 2)
        
        # Pass through the LSTM
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        
        # Generate the final prediction
        prediction = self.fc(output[:, -1, :])  # Shape: (batch_size, output_size)
        return prediction, hidden, cell, attention_weights

# Hyperparameters
input_size = 1  # Since the input is just a single number per step
hidden_size = 10  # Size of the hidden state
output_size = 1   # Single value output
learning_rate = 0.01

# Instantiate models
encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
decoder = DecoderWithAttention(hidden_size=hidden_size, output_size=output_size)

# Optimizer and loss function
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)
criterion = nn.MSELoss()

# Training loop
for epoch in range(200):
    encoder.train()
    decoder.train()
    optimizer.zero_grad()
    
    # Encoder forward pass
    encoder_outputs, encoder_hidden, encoder_cell = encoder(sequences)
    
    # Initial decoder input (a zero tensor)
    decoder_input = torch.zeros((sequences.size(0), 1, hidden_size))  # Shape: (batch_size, 1, hidden_size)
    
    # Decoder forward pass with attention
    decoder_output, _, _, _ = decoder(decoder_input, encoder_hidden, encoder_cell, encoder_outputs)
    
    # Compute loss
    loss = criterion(decoder_output, targets.unsqueeze(1))
    
    # Backpropagation
    loss.backward()
    optimizer.step()
    
    # Every 20 epochs, print loss and predictions
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")
        
        # Evaluation mode for prediction
        encoder.eval()
        decoder.eval()
        with torch.no_grad():
            encoder_outputs, encoder_hidden, encoder_cell = encoder(sequences)
            decoder_input = torch.zeros((sequences.size(0), 1, hidden_size))  # Initial decoder input
            decoder_output, _, _, _ = decoder(decoder_input, encoder_hidden, encoder_cell, encoder_outputs)
            print(f"Predictions: {decoder_output.squeeze().numpy()}")
            print(f"True Values: {targets.numpy()}")


Epoch 0, Loss: 38.947322845458984
Predictions: [-0.13270167 -0.10514794 -0.08190268]
True Values: [5. 6. 7.]
Epoch 20, Loss: 11.377039909362793
Predictions: [2.8632393 2.8837392 2.8905833]
True Values: [5. 6. 7.]
Epoch 40, Loss: 1.543578028678894
Predictions: [5.140058  5.1433935 5.144601 ]
True Values: [5. 6. 7.]
Epoch 60, Loss: 0.6688856482505798
Predictions: [6.0871305 6.0900364 6.091098 ]
True Values: [5. 6. 7.]
Epoch 80, Loss: 0.6841444373130798
Predictions: [6.1327615 6.135635  6.136684 ]
True Values: [5. 6. 7.]
Epoch 100, Loss: 0.66417396068573
Predictions: [6.0043592 6.007227  6.008275 ]
True Values: [5. 6. 7.]
Epoch 120, Loss: 0.6643080115318298
Predictions: [5.9811273 5.9840503 5.9851174]
True Values: [5. 6. 7.]
Epoch 140, Loss: 0.6639390587806702
Predictions: [5.997347  6.0003533 6.0014496]
True Values: [5. 6. 7.]
Epoch 160, Loss: 0.6638632416725159
Predictions: [5.999854  6.0029507 6.00408  ]
True Values: [5. 6. 7.]
Epoch 180, Loss: 0.6637632846832275
Predictions: [5.997430