In [1]:
import pandas as pd
import pyarrow
import matplotlib.pyplot as plt

import sentencepiece
import transformers
from transformers import AutoTokenizer, AutoModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchsummary import summary

from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

print(pyarrow.__version__, transformers.__version__, sentencepiece.__version__)

19.0.1 4.50.3 0.2.0


In [2]:
vocab_size = 30522  # Example vocab size (from tokenizer)
embedding_dim = 768  # Embedding dimension
hidden_dim = 512  # Hidden dimension for GRU
MAX_LENGTH = 50  # Define max length for your sequences
SOS_token = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [4]:
def read_dataset(split):
    return pd.read_parquet(f"iwslt2015-en-vi/{split}-00000-of-00001.parquet")

train = read_dataset("train")
validation = read_dataset("validation")
test = read_dataset("test")

# full_train = pd.concat([train, validation], ignore_index=True)

# train.shape, validation.shape, full_train.shape, test.shape

In [4]:
class Seq2SeqDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_len=50):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]

        # Tokenize source text and target text with padding
        source_encoding = self.tokenizer(
            source_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=False
        )

        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_len,
            padding='max_length',  # Pad to max_len
            truncation=True,
            return_tensors='pt',
            return_attention_mask=False
        )

        # Move the tensors to the device (cuda or cpu)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Return tokenized input and target sequences moved to the proper device
        return {
            'source_input_ids': source_encoding['input_ids'].squeeze(0).to(device),
            'target_input_ids': target_encoding['input_ids'].squeeze(0).to(device),
            'target_labels': target_encoding['input_ids'].squeeze(0).to(device)  # For training targets
        }


### Bidirectional RNNs (Recurrent Newral Networks)

Process the input sequence in two directions:

    1. Forward direction (l-r): as normal RNN works
    
    2. Backward direction (r-l): the sequence is processed from the last token and going back to the beginning

#### h_t = [h_t^f, h_t^b]



In [5]:
class EncoderGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2, dropout_p=0.1, bidirectional=True):
        """
        Encoder GRU with optional bidirectional support.

        Parameters:
        - vocab_size (int): The size of the vocabulary for the input text.
        - embedding_dim (int): The size of the embedding vector.
        - hidden_dim (int): The number of hidden units in the GRU layer.
        - num_layers (int): The number of layers in the GRU. Default is 2.
        - dropout_p (float): Dropout rate for regularization.
        - bidirectional (bool): Whether to use a bidirectional GRU. Default is True.

        Shapes:
        - embedding.shape = [batch_size, seq_len, embedding_dim]
        - gru_out.shape = [batch_size, seq_len, hidden_dim * num_directions (2 if bidirectional else 1)]
        - hidden.shape (before bidirectional) = [num_layers * num_directions, batch_size, hidden_dim]
        - If bidirectional: hidden.shape = [batch_size, hidden_dim * 2] (concat two directions)
        - If not bidirectional: hidden.shape = [batch_size, hidden_dim] (last hidden state only)
        - hidden.shape (after fc): [batch_size, hidden_dim] (optional if having bidirectional)
        """

        super(EncoderGRU, self).__init__()

        # Embedding layer: Converts word indices to dense vectors of size embedding_dim
        # Input shape: [batch_size, seq_len] -> Output shape: [batch_size, seq_len, embedding_dim]:Reason
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # GRU Layer (with optional bidirectionality and dropout)
        # Input shape: [batch_size, seq_len, embedding_dim] -> Output shape: [batch_size, seq_len, hidden_dim * num_directions] 
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers,
                          batch_first=True, dropout=dropout_p, bidirectional=bidirectional)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_p)

        # Fully connected layer to adjust hidden state size (optional)
        # If bidirectional, input size is hidden_dim * 2, else hidden_dim
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim)

    def forward(self, x):
        """
        Forward pass through the Encoder GRU.

        Args:
        - x (Tensor): The input tensor of shape [batch_size, seq_len], where each element is a token ID.

        Returns:
        - gru_out (Tensor): The output of the GRU for each time step, shape [batch_size, seq_len, hidden_dim * num_directions]
        - hidden (Tensor): The final hidden state after the GRU, shape [batch_size, hidden_dim]
        """

        # Pass input through embedding layer and apply dropout
        embedded = self.dropout(self.embedding(x))  # Shape: [batch_size, seq_len, embedding_dim]

        # Pass the embedded input through the GRU
        gru_out, hidden = self.gru(embedded)  # gru_out: [batch_size, seq_len, hidden_dim * num_directions]

        # If the GRU is bidirectional, concatenate the final hidden states from both directions
        if self.gru.bidirectional:
            # Hidden state shape before bidirectional handling: [num_layers * 2, batch_size, hidden_dim]
            # We concatenate the last hidden state from the forward and backward directions
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)  # Shape: [batch_size, hidden_dim * 2]
        else:
            # If not bidirectional, just take the final hidden state (last time step)
            hidden = hidden[-1,:,:]  # Shape: [batch_size, hidden_dim]

        # Optionally apply a fully connected layer to the final hidden state (for dimensionality adjustment)
        hidden = self.fc(hidden)  # Shape: [batch_size, hidden_dim]

        # Optionally apply a fully connected layer to the GRU output (to project it to another space)
        # You can decide if you need this, but it can help adjust the shape if needed
        gru_out = self.fc(gru_out)  # Shape: [batch_size, seq_len, hidden_dim]

        return gru_out, hidden  # Return the GRU outputs and the final hidden state


In [6]:
# question 1: what output size should I use

In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # Change to your tokenizer

def get_dataloader(data, batch_size=64, shuffle=True):
    # Create Dataset and DataLoader
    dataset = Seq2SeqDataset(data["en"], data["vi"], tokenizer)
    # Use a more typical batch size (adjust based on your memory constraints)
    return DataLoader(dataset, batch_size, shuffle)

train_loader = get_dataloader(train)
# dir(train_loader)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
vocab_size = len(tokenizer)  # 30522 for BERT's vocab size
print(f"Tokenizer vocab size: {vocab_size}")

model = AutoModel.from_pretrained('bert-base-uncased')
embedding_dim = model.config.hidden_size
print(f"Embedding dimension: {embedding_dim}")

# Now, ensure your model has the correct vocab_size in the embedding layer
encoder = EncoderGRU(vocab_size, embedding_dim=embedding_dim, hidden_dim=512)
encoder, next(encoder.parameters()).device

Tokenizer vocab size: 30522


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Embedding dimension: 768


(EncoderGRU(
   (embedding): Embedding(30522, 768)
   (gru): GRU(768, 512, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
   (dropout): Dropout(p=0.1, inplace=False)
   (fc): Linear(in_features=1024, out_features=512, bias=True)
 ),
 device(type='cpu'))

In [9]:
encoder.to(device)
next(encoder.parameters()).device

device(type='cuda', index=0)

In [10]:
# for batch in train_loader:
#     batch
#     first_batch = batch["source_input_ids"]
#     break
# print(first_batch.shape)
# gru_out, hidden = encoder(first_batch)

In [11]:
# gru_out.shape, hidden.shape

In [12]:
class LuongAttention(nn.Module):
    def __init__(self, hidden_dim):
        """
        Initialize the Luong Attention mechanism.

        Parameters:
        - hidden_dim (int): The dimension of the hidden states from the encoder and decoder.
        """
        super(LuongAttention, self).__init__()
        # Attention layer that learns a linear transformation for the decoder hidden state
        self.attn = nn.Linear(hidden_dim, hidden_dim)  # Projection layer for the decoder hidden state

    def forward(self, encoder_outputs, decoder_hidden):
        """
        Forward pass for the attention mechanism.

        Args:
        - encoder_outputs (Tensor): The encoder outputs of shape [batch_size, seq_len, hidden_dim * 2]
        - decoder_hidden (Tensor): The decoder's current hidden state of shape [batch_size, hidden_dim]

        Returns:
        - context_vector (Tensor): The weighted sum of encoder outputs, shaped [batch_size, hidden_dim * 2]
        - attention_weights (Tensor): The attention weights for each encoder output, shaped [batch_size, seq_len]
        """
        batch_size = encoder_outputs.shape[0]
        seq_len = encoder_outputs.shape[1]

        # Project decoder hidden state into the same dimensionality as the encoder outputs
        query = self.attn(decoder_hidden).unsqueeze(1)  # Shape: [batch_size, 1, hidden_dim]

        # Compute attention scores by performing a dot-product between the query (decoder hidden state)
        # and the encoder outputs (keys). This results in energy scores.
        energy = torch.bmm(query, encoder_outputs.transpose(1, 2))  # Shape: [batch_size, 1, seq_len]

        # Apply softmax to compute the attention weights, normalizing them over the sequence length.
        attention_weights = torch.softmax(energy.squeeze(1), dim=1)  # Shape: [batch_size, seq_len]

        # Compute the context vector as the weighted sum of encoder outputs based on the attention weights
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)  # Shape: [batch_size, 1, hidden_dim * 2]
        context_vector = context_vector.squeeze(1)  # Shape: [batch_size, hidden_dim * 2]

        # Return the context vector and attention weights
        return context_vector, attention_weights


In [13]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        """
        Initialize the Bahdanau Attention mechanism.

        Parameters:
        - hidden_size (int): The dimension of the hidden state in the encoder and decoder.
        """
        super(BahdanauAttention, self).__init__()

        # Linear transformations for the query (decoder hidden state) and keys (encoder outputs)
        self.Wa = nn.Linear(hidden_size, hidden_size)  # For the query (decoder hidden state)
        self.Ua = nn.Linear(hidden_size, hidden_size)  # For the keys (encoder outputs)
        self.Va = nn.Linear(hidden_size, 1)            # Final layer for the attention score

    def forward(self, keys, query):
        """
        Compute the attention scores based on the decoder hidden state (query) and encoder outputs (keys).

        Args:
        - query (Tensor): The decoder hidden state of shape [batch_size, hidden_size].
        - keys (Tensor): The encoder outputs of shape [batch_size, seq_len, hidden_size].

        Returns:
        - context_vector (Tensor): The weighted sum of encoder outputs, shape [batch_size, hidden_size].
        - attention_weights (Tensor): The attention weights for each encoder output, shape [batch_size, seq_len].
        """

        # Transform the query (decoder hidden state) and keys (encoder outputs)
        query_transformed = self.Wa(query).unsqueeze(1)  # Shape: [batch_size, 1, hidden_size]
        keys_transformed = self.Ua(keys)  # Shape: [batch_size, seq_len, hidden_size]

        # Compute the attention scores using the transformed query and keys
        # This adds the query and keys, passes through a tanh activation, and then a final linear layer
        scores = self.Va(torch.tanh(query_transformed + keys_transformed))  # Shape: [batch_size, seq_len, 1]

        # Squeeze the last dimension to get attention scores of shape [batch_size, seq_len]
        scores = scores.squeeze(2)  # Shape: [batch_size, seq_len]

        # Apply softmax to get attention weights, normalizing them over the sequence length (seq_len)
        attention_weights = F.softmax(scores, dim=-1)  # Shape: [batch_size, seq_len]

        # Compute the context vector as the weighted sum of encoder outputs based on attention weights
        # Attention weights are multiplied with the encoder outputs
        context_vector = torch.bmm(attention_weights.unsqueeze(1), keys)  # Shape: [batch_size, 1, hidden_size]

        # Remove the extra dimension to get the final context vector shape: [batch_size, hidden_size]
        context_vector = context_vector.squeeze(1)  # Shape: [batch_size, hidden_size]

        # Return the context vector and the attention weights
        return context_vector, attention_weights


In [14]:
# class AttnDecoderGRU(nn.Module):
#     def __init__(self, hidden_size, vocab_size, attention_type="Bahdanau", dropout_p=0.1, num_layers=2, bidirectional=False):
#         super(AttnDecoderGRU, self).__init__()

#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.bidirectional = bidirectional

#         # Embedding layer for target tokens
#         self.embedding = nn.Embedding(vocab_size, hidden_size)
#         # Attention mechanism (Bahdanau or Luong)
#         if attention_type == 'Luong':
#             self.attention = LuongAttention(hidden_size)
#         elif attention_type == 'Bahdanau':
#             self.attention = BahdanauAttention(hidden_size)
#         else:
#             raise ValueError("Unknown attention type")

#         # GRU for generating output sequence
#         self.gru = nn.GRU(hidden_size * 2, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_p, bidirectional=bidirectional)


#         # self.gru = nn.GRU(hidden_size + hidden_size * 2, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_p, bidirectional=bidirectional)

#         # Linear layer to project GRU output to vocab size (adjusted for bidirectional GRU)
#         # If bidirectional GRU, hidden_size * 2 (1024) should be used
#         self.out = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, vocab_size)

#         # Dropout layer for regularization
#         self.dropout = nn.Dropout(dropout_p)

#         self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim)

#     def forward(self, decoder_input, decoder_hidden, encoder_outputs, target_tensor=None):
#         batch_size = encoder_outputs.size(0)
#         device = encoder_outputs.device

#         # Store outputs and attention weights for visualization
#         decoder_outputs = []
#         attentions = []

#         # Initialize the first decoder input (SOS token)
#         decoder_input = torch.full((batch_size, 1), SOS_token, dtype=torch.long, device=device)

#         for i in range(MAX_LENGTH):
#             print(f"Decoder hidden shape at timestep {i}:", decoder_hidden.shape)
#             print(encoder_outputs.shape)

#             # Compute the context vector and attention weights
#             context, attn_weights = self.attention(encoder_outputs, decoder_hidden)
#             print("Context shape", context.shape)
#             # Embed the current decoder input token
#             embedded = self.embedding(decoder_input)
#             print("Decoder_input shape:", decoder_input.shape)
#             embedded = self.dropout(embedded)

#             context = context.unsqueeze(1)  # Adding sequence dimension to context
#             print("E C", embedded.shape, context.shape)

#             # Concatenate context vector with embedded token input
#             input_gru = torch.cat((embedded, context), dim=2)
#             print(input_gru.shape)
#             # Ensure the hidden state is 3D for the GRU
#             if self.bidirectional:
#                 decoder_hidden = decoder_hidden.unsqueeze(0).repeat(self.num_layers * 2, 1, 1)
#             else:
#                 decoder_hidden = decoder_hidden.unsqueeze(0).repeat(self.num_layers, 1, 1)
#             print(input_gru.shape, decoder_hidden.shape)
#             # Pass through GRU
#             output, decoder_hidden = self.gru(input_gru, decoder_hidden)
#             print(f"After GRU, decoder_hidden shape: {decoder_hidden.shape}")  # Track shape

#             # After GRU, recover `decoder_hidden` to 2D (for attention)
#             if self.bidirectional:
#                 # For bidirectional GRU, take the last two hidden states, concatenate them, and flatten
#                 decoder_hidden = decoder_hidden[-2:, :, :].transpose(0, 1).contiguous().view(batch_size, -1)
#             else:
#                 # For unidirectional GRU, just take the last hidden state
#                 decoder_hidden = decoder_hidden[-1, :, :]

#             print(f"Recovered decoder_hidden shape for attention: {decoder_hidden.shape}")

#             # Apply FC layer to get the output logits (adjusted for bidirectional GRU)
#             output = self.out(output.squeeze(1))  # (batch_size, vocab_size)

#             # Apply FC layer to decoder_hidden
#             decoder_hidden = self.fc(decoder_hidden)
#             print(f"After FC layer, decoder_hidden shape: {decoder_hidden.shape}")

#             # Store output and attention weights for each timestep
#             decoder_outputs.append(output)
#             attentions.append(attn_weights)

#             if target_tensor is not None:
#                 # Teacher forcing: Use true token as the next input
#                 decoder_input = target_tensor[:, i].unsqueeze(1)
#             else:
#                 # Use predicted token as the next input (inference)
#                 _, topi = output.topk(1)
#                 decoder_input = topi.squeeze(-1).unsqueeze(1).detach()  # Unsqueeze to get shape [batch_size, 1]

#         # Concatenate decoder outputs for all timesteps
#         decoder_outputs = torch.stack(decoder_outputs, dim=1)  # (batch_size, MAX_LENGTH, vocab_size)
#         attentions = torch.stack(attentions, dim=1)  # (batch_size, MAX_LENGTH, seq_len)

#         return decoder_outputs, decoder_hidden, attentions


In [1]:
class AttnDecoderGRU(nn.Module):
    def __init__(self, hidden_size, vocab_size, attention_type, dropout_p=0.1, num_layers=2, bidirectional=True):
        super(AttnDecoderGRU, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Embedding layer for target tokens
        # This will map token indices to dense vectors of size `hidden_size`
        self.embedding = nn.Embedding(vocab_size, hidden_size)

        # Choose the attention mechanism (Bahdanau or Luong)
        if attention_type == 'Luong':
            self.attention = LuongAttention(hidden_size)
        elif attention_type == 'Bahdanau':
            self.attention = BahdanauAttention(hidden_size)
        else:
            raise ValueError("Unknown attention type")

        # GRU layer for generating output sequence
        # Input size to the GRU is hidden_size + context_size
        self.gru = nn.GRU(hidden_size * 2, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_p, bidirectional=bidirectional)

        # Linear layer to project GRU output to vocab size
        # If bidirectional GRU, we need to use hidden_size * 2 because GRU output will have twice the hidden size
        self.out = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, vocab_size)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_p)

        # Fully connected layer for adjusting decoder hidden state dimension (optional)
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, hidden_size)

    def forward(self, decoder_input, decoder_hidden, encoder_outputs, target_tensor=None, device='cuda'):

        decoder_input = decoder_input.to(device)
        decoder_hidden = decoder_hidden.to(device)
        encoder_outputs = encoder_outputs.to(device)

        batch_size = encoder_outputs.size(0)  # Get batch size from encoder outputs

        # Store outputs and attention weights for visualization or further processing
        decoder_outputs = []
        attentions = []

        # Initialize the first decoder input (SOS token)
        decoder_input = torch.full((batch_size, 1), SOS_token, dtype=torch.long, device=device)

        for i in range(MAX_LENGTH):  # Iterate over all timesteps (max length of output)

            # Compute the context vector and attention weights
            context, attn_weights = self.attention(encoder_outputs, decoder_hidden)

            # Embed the current decoder input token (converted to hidden_size dimension)
            embedded = self.embedding(decoder_input)
            embedded = self.dropout(embedded)  # Apply dropout for regularization

            # Ensure context has the correct shape for concatenation
            context = context.unsqueeze(1)  # Add sequence dimension to context: (batch_size, 1, hidden_size * 2)

            # Concatenate context vector with embedded token input
            input_gru = torch.cat((embedded, context), dim=2)  # Concatenate along the feature dimension

            # Ensure the hidden state is 3D for the GRU (num_layers, batch_size, hidden_size)
            if self.bidirectional:
                decoder_hidden = decoder_hidden.unsqueeze(0).repeat(self.num_layers * 2, 1, 1)  # For bidirectional
            else:
                decoder_hidden = decoder_hidden.unsqueeze(0).repeat(self.num_layers, 1, 1)  # For unidirectional

            # Pass through the GRU layer
            output, decoder_hidden = self.gru(input_gru, decoder_hidden)

            # After GRU, recover `decoder_hidden` to 2D for attention (batch_size, hidden_size * 2 if bidirectional)
            if self.bidirectional:
                # For bidirectional GRU, concatenate the last two hidden states (forward and backward)
                decoder_hidden = decoder_hidden[-2:, :, :].transpose(0, 1).contiguous().view(batch_size, -1)
            else:
                # For unidirectional GRU, just take the last hidden state
                decoder_hidden = decoder_hidden[-1, :, :]

            # Apply FC layer to adjust the decoder hidden state (optional)
            decoder_hidden = self.fc(decoder_hidden)

            # Apply output layer (projection to vocab size)
            output = self.out(output.squeeze(1))  # (batch_size, vocab_size)

            # Store output and attention weights for each timestep
            decoder_outputs.append(output)
            attentions.append(attn_weights)

            # Handle teacher forcing: Use true token as the next input during training
            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # In inference mode: Use predicted token as the next input
                _, topi = output.topk(1)  # Get top predicted token
                decoder_input = topi.squeeze(-1).unsqueeze(1).detach()  # Unsqueeze to get shape [batch_size, 1]

        # Concatenate decoder outputs for all timesteps
        decoder_outputs = torch.stack(decoder_outputs, dim=1)  # (batch_size, MAX_LENGTH, vocab_size)
        attentions = torch.stack(attentions, dim=1)  # (batch_size, MAX_LENGTH, seq_len)

        return decoder_outputs, decoder_hidden, attentions


NameError: name 'nn' is not defined

In [16]:
decoder = AttnDecoderGRU(hidden_dim, vocab_size, "Luong")
decoder, next(decoder.parameters()).device

(AttnDecoderGRU(
   (embedding): Embedding(30522, 512)
   (attention): LuongAttention(
     (attn): Linear(in_features=512, out_features=512, bias=True)
   )
   (gru): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.1)
   (out): Linear(in_features=512, out_features=30522, bias=True)
   (dropout): Dropout(p=0.1, inplace=False)
   (fc): Linear(in_features=512, out_features=512, bias=True)
 ),
 device(type='cpu'))

In [17]:
decoder.to(device), next(decoder.parameters()).device

(AttnDecoderGRU(
   (embedding): Embedding(30522, 512)
   (attention): LuongAttention(
     (attn): Linear(in_features=512, out_features=512, bias=True)
   )
   (gru): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.1)
   (out): Linear(in_features=512, out_features=30522, bias=True)
   (dropout): Dropout(p=0.1, inplace=False)
   (fc): Linear(in_features=512, out_features=512, bias=True)
 ),
 device(type='cuda', index=0))

In [18]:
# input = batch['source_input_ids'].to(device)
# decoder_outputs, decoder_hidden, attentions = decoder(input, hidden, gru_out)

In [19]:
# decoder_outputs.shape, decoder_hidden.shape, attentions.shape

In [20]:
# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder, device):
#         super(Seq2Seq, self).__init__()

#         self.encoder = encoder
#         self.decoder = decoder
#         self.device = device

#     def forward(self, source, target, teacher_forcing_ratio=0.5):
#         # Encoder step: Pass the source sequence through the encoder
#         encoder_outputs, hidden = self.encoder(source)

#         # Decoder step: Initialize decoder input (SOS token)
#         input = target[:, 0]
#         outputs = []

#         for t in range(1, target.size(1)):  # Iterate over each time step in the target sequence
#             # Pass encoder outputs and hidden state to the decoder
#             decoder_output, hidden, attentions_timestep = self.decoder(input, hidden, encoder_outputs)

#             outputs.append(decoder_output)
#             input = target[:, t] if torch.rand(1).item() < teacher_forcing_ratio else decoder_output.argmax(dim=2)

#         # Concatenate all outputs for the full sequence
#         outputs = torch.cat(outputs, dim=1)
#         return outputs


In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        # Encoder step: Pass the source sequence through the encoder
        encoder_outputs, hidden = self.encoder(source)

        # Decoder step: Initialize decoder input (SOS token)
        input = target[:, 0]  # Starting with the first token from the target sequence
        batch_size = source.size(0)
        trg_len = target.size(1)
        output_dim = vocab_size

        # Initialize an empty tensor to hold the outputs for each time step
        outputs = torch.zeros(batch_size, trg_len, output_dim).to(self.device)

        for t in range(1, trg_len):  # Iterate over each time step in the target sequence
            # Pass encoder outputs and hidden state to the decoder
            decoder_output, hidden, _ = self.decoder(input, hidden, encoder_outputs)

            # Since decoder_output is already of shape [batch_size, trg_len, output_dim],
            # we can assign it to the outputs tensor without any additional steps.
            outputs[:, t, :] = decoder_output[:, t, :]  # Assigning one token's output at a time

            # Decide whether to use teacher forcing or not
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = decoder_output.argmax(dim=2)  # Get the most likely next token

            # Use teacher forcing or predicted token
            input = target[:, t] if teacher_force else top1.squeeze(1)

        return outputs


In [22]:
seq2seq = Seq2Seq(encoder, decoder, device)
seq2seq, next(seq2seq.parameters()).device

(Seq2Seq(
   (encoder): EncoderGRU(
     (embedding): Embedding(30522, 768)
     (gru): GRU(768, 512, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
     (dropout): Dropout(p=0.1, inplace=False)
     (fc): Linear(in_features=1024, out_features=512, bias=True)
   )
   (decoder): AttnDecoderGRU(
     (embedding): Embedding(30522, 512)
     (attention): LuongAttention(
       (attn): Linear(in_features=512, out_features=512, bias=True)
     )
     (gru): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.1)
     (out): Linear(in_features=512, out_features=30522, bias=True)
     (dropout): Dropout(p=0.1, inplace=False)
     (fc): Linear(in_features=512, out_features=512, bias=True)
   )
 ),
 device(type='cuda', index=0))

In [23]:
seq2seq.to(device), next(seq2seq.parameters()).device

(Seq2Seq(
   (encoder): EncoderGRU(
     (embedding): Embedding(30522, 768)
     (gru): GRU(768, 512, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
     (dropout): Dropout(p=0.1, inplace=False)
     (fc): Linear(in_features=1024, out_features=512, bias=True)
   )
   (decoder): AttnDecoderGRU(
     (embedding): Embedding(30522, 512)
     (attention): LuongAttention(
       (attn): Linear(in_features=512, out_features=512, bias=True)
     )
     (gru): GRU(1024, 512, num_layers=2, batch_first=True, dropout=0.1)
     (out): Linear(in_features=512, out_features=30522, bias=True)
     (dropout): Dropout(p=0.1, inplace=False)
     (fc): Linear(in_features=512, out_features=512, bias=True)
   )
 ),
 device(type='cuda', index=0))

In [24]:
# outputs = seq2seq(batch['source_input_ids'].to('cuda'),  batch['target_input_ids'].to('cuda'), 0.5)

In [25]:
# outputs.shape

In [26]:
def compute_bleu(predictions, targets):
    # If predictions and targets are numpy arrays, ensure you iterate over each sequence.
    # Convert numpy arrays to lists of strings if needed.
    pred = [[str(token).split()] for token in predictions]  # Ensure token is a string before applying split
    ref = [[str(token).split()] for token in targets]       # Same for targets
    return corpus_bleu(ref, pred)

In [27]:
def plot_metrics(train_losses, val_losses, bleu_scores):
    epochs = len(train_losses)

    # Plot Train Loss vs Validation Loss
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(range(epochs), train_losses, label='Train Loss', color='blue')
    plt.plot(range(epochs), val_losses, label='Validation Loss', color='red')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()

    # Plot BLEU score over epochs
    plt.subplot(1, 2, 2)
    plt.plot(range(epochs), bleu_scores, label='Validation BLEU Score', color='green')
    plt.xlabel('Epochs')
    plt.ylabel('BLEU Score')
    plt.title('Validation BLEU Score over Epochs')
    plt.legend()

    # Show the plots
    plt.tight_layout()
    plt.show()


In [28]:
import torch

# Define the path to save the model
MODEL_PATH = 'full_seq2seq_model.pth'

def save_full_model(model, optimizer, epoch, loss, model_path=MODEL_PATH):
    # Save the entire model, optimizer, epoch, and loss
    torch.save({
        'epoch': epoch,
        'model': model,  # Save the entire model including architecture and weights
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, model_path)
    print(f"Full model saved to {model_path}")


In [29]:
EPOCHS = 1
BATCH_SIZE = 64
LEARNING_RATE = 0.001
TEACHER_FORCING_RATIO = 0.5
MAX_LENGTH = 50
PAD_token = 0

train_loader = get_dataloader(train)
val_loader = get_dataloader(validation)
test_loader = get_dataloader(test)

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10, teacher_forcing_ratio=0.5, device='cuda'):
    train_losses = []
    val_losses = []
    bleu_scores = []

    for epoch in range(epochs):
        model.train()
        epoch_train_loss = 0

        # Train step
        for batch in tqdm(train_loader, desc="Batch processing"):
            source = batch['source_input_ids'].to(device)
            target = batch['target_input_ids'].to(device)

            optimizer.zero_grad()

            # Forward pass
            output  = model(source, target, teacher_forcing_ratio)
            # Compute loss
            loss = criterion(output.view(-1, output.size(-1)), target.view(-1))
            loss.backward()

            optimizer.step()

            epoch_train_loss += loss.item()

        # Average training loss for this epoch
        train_loss = epoch_train_loss / len(train_loader)
        train_losses.append(train_loss)

        # Validate step
        model.eval()
        epoch_val_loss = 0
        val_predictions = []
        val_targets = []

        with torch.no_grad():
            for batch in val_loader:
                source = batch['source_input_ids'].to(device)
                target = batch['target_input_ids'].to(device)

                # Forward pass (without teacher forcing)
                output = model(source, target, teacher_forcing_ratio=0)

                # Compute loss
                loss = criterion(output.view(-1, output.size(-1)), target.view(-1))

                epoch_val_loss += loss.item()

                # Store predictions and targets for BLEU calculation
                pred = output.argmax(dim=-1)
                val_predictions.extend(pred.cpu().numpy())
                val_targets.extend(target.cpu().numpy())

        # Average validation loss for this epoch
        val_loss = epoch_val_loss / len(val_loader)
        val_losses.append(val_loss)

        save_full_model(model, optimizer, epoch, train_loss)

        # Calculate BLEU score for the validation set
        val_bleu_score = compute_bleu(val_predictions, val_targets)
        bleu_scores.append(val_bleu_score)

        # Print results for this epoch
        print(f"Epoch {epoch + 1}/{epochs} | "
              f"Train Loss: {train_loss:.4f} | "
              f"Val Loss: {val_loss:.4f} | "
              f"Val BLEU: {val_bleu_score:.4f}")

        
        
    return train_losses, val_losses, bleu_scores

seq2seq = Seq2Seq(encoder, decoder, device).to(device)
optimizer = optim.Adam(seq2seq.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_token)


train_losses, val_losses, bleu_scores = train_model(seq2seq, train_loader, val_loader, optimizer, criterion, epochs=10, teacher_forcing_ratio=0.5, device=device)

plot_metrics(train_losses, val_losses, bleu_scores)

Batch processing: 100%|██████████| 2084/2084 [6:51:14<00:00, 11.84s/it]


Full model saved to full_seq2seq_model.pth


TypeError: unhashable type: 'list'

In [None]:
import torch

# Define the path to save your model
MODEL_PATH = "model.pth"  # You can choose any path

# Save the model's state_dict and optimizer's state_dict (if needed)
def save_model(model, optimizer, epoch, loss, model_path=MODEL_PATH):
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, model_path)
    print(f"Model saved to {model_path}")


In [None]:
def load_model(model, optimizer=None, model_path=MODEL_PATH):
    checkpoint = torch.load(model_path)

    # Load the model's state_dict
    model.load_state_dict(checkpoint['model_state_dict'])

    # If you're using an optimizer and want to resume training, load the optimizer state_dict
    if optimizer:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Optionally return the epoch and loss (if you want to resume training)
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    print(f"Model loaded from {model_path}")

    return model, optimizer, epoch, loss


In [None]:
# Evaluate the model on the test set after training
model.eval()
with torch.no_grad():
    test_predictions = []
    test_targets = []

    for batch in test_loader:
        source = batch['source_input_ids'].to(device)
        target = batch['target_input_ids'].to(device)

        # Forward pass
        output, _ = model(source, target, teacher_forcing_ratio=0)  # no teacher forcing for inference

        # Collect predictions and targets
        pred = output.argmax(dim=-1)
        test_predictions.extend(pred.cpu().numpy())
        test_targets.extend(target.cpu().numpy())

    # Calculate BLEU score on the test set
    test_bleu_score = compute_bleu(test_predictions, test_targets)
    print(f"Test BLEU: {test_bleu_score}")

### SentencePiece Tokenizer (Using MarianMT)

In [None]:
# from transformers import MarianTokenizer

# # Load the pre-trained SentencePiece tokenizer for English-Vietnamese
# tokenizer_sp = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-vi")

# # Example text to tokenize
# text = "Hello, how are you doing?"

# # Tokenizing the text
# tokens_sp = tokenizer_sp.tokenize(text)
# print("SentencePiece Tokenization:", tokens_sp)

# # Converting tokens to input IDs (numerical representation)
# input_ids_sp = tokenizer_sp.encode(text)
# print("Input IDs (SentencePiece):", input_ids_sp)

# # Decoding back to the original text
# decoded_text_sp = tokenizer_sp.decode(input_ids_sp)
# print("Decoded text (SentencePiece):", decoded_text_sp)

### Byte Pair Encoding (BPE) Tokenizer (Using GPT-2)

In [None]:
# from transformers import GPT2Tokenizer

# # Load the pre-trained BPE tokenizer (GPT-2)
# tokenizer_bpe = GPT2Tokenizer.from_pretrained("gpt2")

# # Example text to tokenize
# text = "Hello, how are you doing?"

# # Tokenizing the text
# tokens_bpe = tokenizer_bpe.tokenize(text)
# print("BPE Tokenization:", tokens_bpe)

# # Converting tokens to input IDs (numerical representation)
# input_ids_bpe = tokenizer_bpe.encode(text)
# print("Input IDs (BPE):", input_ids_bpe)

# # Decoding back to the original text
# decoded_text_bpe = tokenizer_bpe.decode(input_ids_bpe)
# print("Decoded text (BPE):", decoded_text_bpe)

### WordPiece Tokenizer (Using BERT)

In [None]:
# from transformers import BertTokenizer

# # Load the pre-trained WordPiece tokenizer (BERT)
# tokenizer_wp = BertTokenizer.from_pretrained("bert-base-uncased")

# # Example text to tokenize
# text = "Hello, how are you doing?"

# # Tokenizing the text
# tokens_wp = tokenizer_wp.tokenize(text)
# print("WordPiece Tokenization:", tokens_wp)

# # Converting tokens to input IDs (numerical representation)
# input_ids_wp = tokenizer_wp.encode(text)
# print("Input IDs (WordPiece):", input_ids_wp)

# # Decoding back to the original text
# decoded_text_wp = tokenizer_wp.decode(input_ids_wp)
# print("Decoded text (WordPiece):", decoded_text_wp)