In [3]:
# Install missing packages (run this once)
!pip install -q sacrebleu datasets evaluate
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import json
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
from sacrebleu.metrics import BLEU
import nltk
nltk.download('punkt')

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
from datasets import load_dataset

# ডেটাসেট লোড
dataset = load_dataset("Nan-Do/code-search-net-python")

print("Available splits:", list(dataset.keys()))          # দেখাবে: ['train']
print("Total examples:", len(dataset['train']))           # ~455k

# শুধু train split আছে → আমরা নিজেরাই split করব
full_train_dataset = dataset['train']

# Manual split (80% train, 10% valid, 10% test)
# shuffle + select করা ভালো
shuffled = full_train_dataset.shuffle(seed=42)

train_size = 8000
valid_size = 1000
test_size  = 1000

train_data   = shuffled.select(range(0, train_size))
valid_data   = shuffled.select(range(train_size, train_size + valid_size))
test_data    = shuffled.select(range(train_size + valid_size, train_size + valid_size + test_size))

print(f"Train: {len(train_data)} examples")
print(f"Valid: {len(valid_data)} examples")
print(f"Test:  {len(test_data)} examples")

Available splits: ['train']
Total examples: 455243
Train: 8000 examples
Valid: 1000 examples
Test:  1000 examples


In [10]:
def clean_text(text):
    if not text: return ""
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    return text

def is_valid_example(example):
    doc = clean_text(example['docstring'])
    code = clean_text(example['code'])
    if not doc or not code:
        return False
    doc_tokens = len(doc.split())
    code_tokens = len(code.split())
    return 5 <= doc_tokens <= 50 and 10 <= code_tokens <= 80

# Filtering
filtered_train = [ex for ex in tqdm(train_data) if is_valid_example(ex)]
filtered_valid = [ex for ex in tqdm(valid_data) if is_valid_example(ex)]
filtered_test  = [ex for ex in tqdm(test_data)  if is_valid_example(ex)]

print(f"After filtering → Train: {len(filtered_train)}")
print(f"After filtering → Valid: {len(filtered_valid)}")
print(f"After filtering → Test : {len(filtered_test)}")

100%|██████████| 8000/8000 [00:02<00:00, 2887.72it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2913.17it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2967.23it/s]

After filtering → Train: 4226
After filtering → Valid: 532
After filtering → Test : 526





In [11]:
from collections import Counter
import re

# সিম্পল টোকেনাইজার (whitespace + কিছু ক্লিনিং)
def simple_tokenize(text):
    text = text.lower().strip()
    # code এর জন্য কিছু basic cleaning (optional, পরে improve করতে পারো)
    text = re.sub(r'([.,;:!?()])', r' \1 ', text)   # punctuation আলাদা করা
    text = re.sub(r'\s+', ' ', text)
    return text.split()

# সব docstring ও code থেকে tokens কালেক্ট করা
all_source_tokens = []
all_target_tokens = []

for ex in tqdm(filtered_train + filtered_valid + filtered_test, desc="Collecting tokens"):
    doc_tokens = simple_tokenize(ex['docstring'])
    code_tokens = simple_tokenize(ex['code'])
    
    all_source_tokens.extend(doc_tokens)
    all_target_tokens.extend(code_tokens)

# Vocabulary size নির্ধারণ (সবচেয়ে common 10k-15k words/code tokens)
source_counter = Counter(all_source_tokens)
target_counter = Counter(all_target_tokens)

# Top words নেওয়া (যেগুলো <5 বার আসে সেগুলো <unk> হবে)
MIN_FREQ = 2   # তুমি 1 বা 3 করতে পারো

source_vocab = ['<pad>', '<sos>', '<eos>', '<unk>']
target_vocab = ['<pad>', '<sos>', '<eos>', '<unk>']

for word, freq in source_counter.most_common():
    if freq >= MIN_FREQ:
        source_vocab.append(word)

for word, freq in target_counter.most_common():
    if freq >= MIN_FREQ:
        target_vocab.append(word)

print(f"Source vocab size (docstring): {len(source_vocab)}")
print(f"Target vocab size (code):     {len(target_vocab)}")

# Mapping তৈরি
source_word2idx = {w: i for i, w in enumerate(source_vocab)}
source_idx2word = {i: w for w, i in source_word2idx.items()}

target_word2idx = {w: i for i, w in enumerate(target_vocab)}
target_idx2word = {i: w for w, i in target_word2idx.items()}

# প্যাডিং index
PAD_IDX = source_word2idx['<pad>']   # source আর target এ একই <pad> index
SOS_IDX = source_word2idx['<sos>']
EOS_IDX = source_word2idx['<eos>']
UNK_IDX = source_word2idx['<unk>']

print("Special token indices:", {"<pad>": PAD_IDX, "<sos>": SOS_IDX, "<eos>": EOS_IDX, "<unk>": UNK_IDX})

Collecting tokens: 100%|██████████| 5284/5284 [00:00<00:00, 14244.85it/s]


Source vocab size (docstring): 4878
Target vocab size (code):     16362
Special token indices: {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}


In [12]:
# Function to convert text to list of indices using our vocabulary
def text_to_indices(text, word2idx, max_len, is_source=True):
    """
    Convert a text string to a list of token indices.
    - Adds <sos> at start and <eos> at end
    - Pads or truncates to max_len
    - Unknown words become <unk>
    """
    tokens = simple_tokenize(text)  # Reuse the same tokenizer we used for vocab
    indices = [word2idx.get(token, UNK_IDX) for token in tokens]  # <unk> for OOV
    
    # Truncate if too long
    if len(indices) > max_len - 2:  # -2 for <sos> and <eos>
        indices = indices[:max_len - 2]
    
    # Add <sos> and <eos>
    indices = [SOS_IDX] + indices + [EOS_IDX]
    
    # Pad to max_len
    padded = indices + [PAD_IDX] * (max_len - len(indices))
    
    return padded

# Maximum lengths as per assignment
MAX_DOC_LEN = 50   # docstring
MAX_CODE_LEN = 80  # code

# Custom Dataset class for PyTorch
class CodeGenDataset(Dataset):
    """
    PyTorch Dataset class for our docstring → code pairs.
    Returns source (docstring) and target (code) as tensors.
    """
    def __init__(self, data_list):
        self.data = data_list
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        example = self.data[idx]
        
        # Convert docstring (source)
        src = text_to_indices(example['docstring'], source_word2idx, MAX_DOC_LEN, is_source=True)
        
        # Convert code (target)
        tgt = text_to_indices(example['code'], target_word2idx, MAX_CODE_LEN, is_source=False)
        
        return {
            'src': torch.tensor(src, dtype=torch.long),
            'tgt': torch.tensor(tgt, dtype=torch.long)
        }

# Create datasets
train_dataset = CodeGenDataset(filtered_train)
valid_dataset = CodeGenDataset(filtered_valid)
test_dataset  = CodeGenDataset(filtered_test)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Valid dataset size: {len(valid_dataset)}")
print(f"Test  dataset size: {len(test_dataset)}")

# Example: Check first sample
sample = train_dataset[0]
print("\nSample source (docstring indices):", sample['src'].tolist())
print("Sample target (code indices):    ", sample['tgt'].tolist())
print("Source length:", len(sample['src']))
print("Target length:", len(sample['tgt']))

Train dataset size: 4226
Valid dataset size: 532
Test  dataset size: 526

Sample source (docstring indices): [1, 681, 7, 444, 1961, 5, 4, 13, 34, 3, 4, 6, 431, 9, 6, 444, 1961, 5, 4, 13, 57, 205, 4, 6, 71, 205, 14, 6, 71, 5, 4, 15, 4, 6, 891, 444, 1961, 5, 4, 58, 4, 57, 2, 0, 0, 0, 0, 0, 0, 0]
Sample target (code indices):     [1, 15, 3, 6, 9, 8, 4591, 8, 3749, 5, 7, 3169, 17, 855, 4592, 4, 7, 28, 43, 4591, 7, 11, 500, 19, 11, 855, 4592, 4, 7, 28, 79, 256, 7, 11, 76, 256, 18, 11, 76, 4, 7, 13, 7, 11, 1013, 855, 4592, 4, 7, 132, 7, 79, 12, 14, 256, 21, 24, 7, 256, 10, 93, 31, 10, 313, 4, 6150, 6, 4591, 5, 9068, 10, 313, 4, 6151, 6, 4591, 5, 13, 9, 2]
Source length: 50
Target length: 80


In [13]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Custom collate function to pad sequences in a batch dynamically.
    Returns padded src, tgt, and their lengths.
    """
    src_batch = [item['src'] for item in batch]
    tgt_batch = [item['tgt'] for item in batch]
    
    # Pad sequences (already padded to max, but we keep it simple)
    # Since we already padded to fixed MAX in __getitem__, no extra pad needed
    # But we can compute lengths if needed later
    src_lengths = torch.tensor([torch.sum(item['src'] != PAD_IDX).item() for item in batch])
    tgt_lengths = torch.tensor([torch.sum(item['tgt'] != PAD_IDX).item() for item in batch])
    
    src = torch.stack(src_batch)   # [batch_size, MAX_DOC_LEN]
    tgt = torch.stack(tgt_batch)   # [batch_size, MAX_CODE_LEN]
    
    return {
        'src': src,
        'tgt': tgt,
        'src_len': src_lengths,
        'tgt_len': tgt_lengths
    }

# Create DataLoaders
BATCH_SIZE = 32   # GPU memory অনুযায়ী 16/32/64 করতে পারো

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=2   # Kaggle-এ 2-4 ভালো
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=2
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=2
)

print(f"Number of train batches: {len(train_loader)}")
print(f"Number of valid batches: {len(valid_loader)}")
print(f"Number of test batches : {len(test_loader)}")

# Check one batch shape
batch = next(iter(train_loader))
print("\nBatch shapes:")
print("src:", batch['src'].shape)      # [batch_size, 50]
print("tgt:", batch['tgt'].shape)      # [batch_size, 80]
print("src_len example:", batch['src_len'][:5])

Number of train batches: 133
Number of valid batches: 17
Number of test batches : 17

Batch shapes:
src: torch.Size([32, 50])
tgt: torch.Size([32, 80])
src_len example: tensor([32,  8, 50, 30, 14])


In [15]:
import torch.nn as nn

class EncoderRNN(nn.Module):
    """
    Vanilla RNN Encoder
    - Takes source sequence (docstring)
    - Returns final hidden state as context vector
    """
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=1, dropout=0.0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.RNN(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src):
        # src = [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))          # [batch_size, src_len, emb_dim]
        outputs, hidden = self.rnn(embedded)                   # hidden = [n_layers, batch_size, hid_dim]
        return hidden  # We only need the last hidden state


class DecoderRNN(nn.Module):
    """
    Vanilla RNN Decoder
    - Takes context vector + previous token
    - Predicts next token
    """
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=1, dropout=0.0):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.RNN(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden):
        # input = [batch_size]  → single token index
        # hidden = [n_layers, batch_size, hid_dim]
        input = input.unsqueeze(1)                                 # [batch_size, 1]
        embedded = self.dropout(self.embedding(input))            # [batch_size, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)                # output: [batch_size, 1, hid_dim]
        prediction = self.fc_out(output.squeeze(1))               # [batch_size, output_dim]
        return prediction, hidden


class Seq2SeqRNN(nn.Module):
    """
    Full Vanilla RNN Seq2Seq Model
    """
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [batch_size, src_len]
        # trg = [batch_size, trg_len]
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = len(target_vocab)
        
        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        # Encoder forward
        hidden = self.encoder(src)   # [1, batch_size, hid_dim]  (n_layers=1)
        
        # First input to decoder is <sos>
        input = trg[:, 0]   # [batch_size]
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t, :] = output
            
            # Decide if we use teacher forcing
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)   # [batch_size]
            
            input = trg[:, t] if teacher_force else top1
        
        return outputs

In [16]:
# Common hyperparameters for all models
EMB_DIM = 256
HID_DIM = 256
N_LAYERS = 1
DROPOUT = 0.1
DEVICE = device

# Initialize Vanilla RNN model
INPUT_DIM = len(source_vocab)
OUTPUT_DIM = len(target_vocab)

enc_rnn = EncoderRNN(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec_rnn = DecoderRNN(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)

model_rnn = Seq2SeqRNN(enc_rnn, dec_rnn, DEVICE).to(DEVICE)

print(model_rnn)
print(f"Total parameters: {sum(p.numel() for p in model_rnn.parameters() if p.requires_grad):,}")

Seq2SeqRNN(
  (encoder): EncoderRNN(
    (embedding): Embedding(4878, 256, padding_idx=0)
    (rnn): RNN(256, 256, batch_first=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(16362, 256, padding_idx=0)
    (rnn): RNN(256, 256, batch_first=True)
    (fc_out): Linear(in_features=256, out_features=16362, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
Total parameters: 9,905,642


In [17]:
import torch.optim as optim
from tqdm import tqdm
import time
import math

# Loss function: Cross-entropy, ignore padding tokens
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Optimizer: Adam (assignment-এ বলা আছে)
optimizer = optim.Adam(model_rnn.parameters(), lr=0.001)

# Function to calculate time taken
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Training function (one epoch)
def train(model, iterator, optimizer, criterion, clip=1):
    model.train()
    
    epoch_loss = 0
    
    for batch in tqdm(iterator, desc="Training"):
        src = batch['src'].to(DEVICE)
        trg = batch['tgt'].to(DEVICE)
        
        optimizer.zero_grad()
        
        output = model(src, trg)           # [batch_size, trg_len, output_dim]
        
        # Reshape for loss calculation
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)   # remove <sos>
        trg = trg[:, 1:].reshape(-1)                      # remove <sos>
        
        loss = criterion(output, trg)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)


# Evaluation function (validation / test)
def evaluate(model, iterator, criterion):
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(iterator, desc="Evaluating"):
            src = batch['src'].to(DEVICE)
            trg = batch['tgt'].to(DEVICE)
            
            output = model(src, trg, teacher_forcing_ratio=0.0)  # no teacher forcing
            
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)

In [19]:
N_EPOCHS = 8
CLIP = 1

best_valid_loss = float('inf')

print("Starting Vanilla RNN training...\n")

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train(model_rnn, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model_rnn, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_rnn.state_dict(), 'vanilla_rnn_best.pt')
        print("** Saved best model **")
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Starting Vanilla RNN training...



Training: 100%|██████████| 133/133 [00:46<00:00,  2.88it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.60it/s]


** Saved best model **
Epoch: 01 | Time: 0m 48s
	Train Loss: 4.612 | Train PPL: 100.667
	 Val. Loss: 5.835 |  Val. PPL: 342.025


Training: 100%|██████████| 133/133 [00:46<00:00,  2.87it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.47it/s]


Epoch: 02 | Time: 0m 49s
	Train Loss: 4.554 | Train PPL:  95.052
	 Val. Loss: 5.853 |  Val. PPL: 348.153


Training: 100%|██████████| 133/133 [00:46<00:00,  2.85it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.43it/s]


Epoch: 03 | Time: 0m 49s
	Train Loss: 4.526 | Train PPL:  92.404
	 Val. Loss: 5.877 |  Val. PPL: 356.717


Training: 100%|██████████| 133/133 [00:46<00:00,  2.88it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.37it/s]


** Saved best model **
Epoch: 04 | Time: 0m 48s
	Train Loss: 4.490 | Train PPL:  89.143
	 Val. Loss: 5.776 |  Val. PPL: 322.601


Training: 100%|██████████| 133/133 [00:46<00:00,  2.88it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.50it/s]


Epoch: 05 | Time: 0m 48s
	Train Loss: 4.443 | Train PPL:  85.036
	 Val. Loss: 5.854 |  Val. PPL: 348.713


Training: 100%|██████████| 133/133 [00:46<00:00,  2.84it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.54it/s]


Epoch: 06 | Time: 0m 49s
	Train Loss: 4.415 | Train PPL:  82.702
	 Val. Loss: 5.824 |  Val. PPL: 338.309


Training: 100%|██████████| 133/133 [00:46<00:00,  2.87it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.59it/s]


Epoch: 07 | Time: 0m 48s
	Train Loss: 4.395 | Train PPL:  81.006
	 Val. Loss: 5.929 |  Val. PPL: 375.932


Training: 100%|██████████| 133/133 [00:46<00:00,  2.87it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.54it/s]

Epoch: 08 | Time: 0m 48s
	Train Loss: 4.362 | Train PPL:  78.416
	 Val. Loss: 5.989 |  Val. PPL: 399.024





In [20]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)
        # hidden & cell: [n_layers, batch_size, hid_dim]
        return hidden, cell


class DecoderLSTM(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell


class Seq2SeqLSTM(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = len(target_vocab)
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)  # both [1, batch_size, hid_dim]
        
        input = trg[:, 0]  # <sos>
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t, :] = output
            
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            
            input = trg[:, t] if teacher_force else top1
        
        return outputs

In [21]:
# LSTM model
enc_lstm = EncoderLSTM(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec_lstm = DecoderLSTM(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)

model_lstm = Seq2SeqLSTM(enc_lstm, dec_lstm, DEVICE).to(DEVICE)

print(model_lstm)
print(f"LSTM Total parameters: {sum(p.numel() for p in model_lstm.parameters() if p.requires_grad):,}")

Seq2SeqLSTM(
  (encoder): EncoderLSTM(
    (embedding): Embedding(4878, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): DecoderLSTM(
    (embedding): Embedding(16362, 256, padding_idx=0)
    (lstm): LSTM(256, 256, batch_first=True)
    (fc_out): Linear(in_features=256, out_features=16362, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
LSTM Total parameters: 10,695,146


In [22]:
# LSTM model এর জন্য নতুন optimizer ও loss (পুরানোটা RNN-এর জন্য ছিল)
criterion_lstm = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=0.001)

print("Starting LSTM Seq2Seq training...\n")

N_EPOCHS_LSTM = 10          # LSTM-এর জন্য একটু বেশি epoch দিচ্ছি
best_valid_loss_lstm = float('inf')

for epoch in range(N_EPOCHS_LSTM):
    start_time = time.time()
    
    train_loss = train(model_lstm, train_loader, optimizer_lstm, criterion_lstm, CLIP)
    valid_loss = evaluate(model_lstm, valid_loader, criterion_lstm)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss_lstm:
        best_valid_loss_lstm = valid_loss
        torch.save(model_lstm.state_dict(), 'lstm_best.pt')
        print("** Saved best LSTM model **")
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}\n')

Starting LSTM Seq2Seq training...



Training: 100%|██████████| 133/133 [00:46<00:00,  2.83it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.40it/s]


** Saved best LSTM model **
Epoch: 01 | Time: 0m 49s
	Train Loss: 5.857 | Train PPL: 349.768
	 Val. Loss: 5.599 |  Val. PPL: 270.125



Training: 100%|██████████| 133/133 [00:46<00:00,  2.84it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.46it/s]


** Saved best LSTM model **
Epoch: 02 | Time: 0m 49s
	Train Loss: 5.059 | Train PPL: 157.507
	 Val. Loss: 5.563 |  Val. PPL: 260.538



Training: 100%|██████████| 133/133 [00:47<00:00,  2.83it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.29it/s]


Epoch: 03 | Time: 0m 49s
	Train Loss: 4.873 | Train PPL: 130.670
	 Val. Loss: 5.584 |  Val. PPL: 266.126



Training: 100%|██████████| 133/133 [00:46<00:00,  2.83it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.66it/s]


Epoch: 04 | Time: 0m 49s
	Train Loss: 4.761 | Train PPL: 116.824
	 Val. Loss: 5.604 |  Val. PPL: 271.439



Training: 100%|██████████| 133/133 [00:46<00:00,  2.86it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.56it/s]


Epoch: 05 | Time: 0m 49s
	Train Loss: 4.679 | Train PPL: 107.651
	 Val. Loss: 5.576 |  Val. PPL: 264.054



Training: 100%|██████████| 133/133 [00:46<00:00,  2.84it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.22it/s]


Epoch: 06 | Time: 0m 49s
	Train Loss: 4.603 | Train PPL:  99.800
	 Val. Loss: 5.656 |  Val. PPL: 285.959



Training: 100%|██████████| 133/133 [00:47<00:00,  2.82it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.27it/s]


Epoch: 07 | Time: 0m 49s
	Train Loss: 4.542 | Train PPL:  93.838
	 Val. Loss: 5.659 |  Val. PPL: 286.912



Training: 100%|██████████| 133/133 [00:47<00:00,  2.82it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.29it/s]


Epoch: 08 | Time: 0m 49s
	Train Loss: 4.507 | Train PPL:  90.628
	 Val. Loss: 5.813 |  Val. PPL: 334.513



Training: 100%|██████████| 133/133 [00:47<00:00,  2.81it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.25it/s]


Epoch: 09 | Time: 0m 49s
	Train Loss: 4.460 | Train PPL:  86.519
	 Val. Loss: 5.712 |  Val. PPL: 302.536



Training: 100%|██████████| 133/133 [00:47<00:00,  2.82it/s]
Evaluating: 100%|██████████| 17/17 [00:02<00:00,  6.38it/s]

Epoch: 10 | Time: 0m 49s
	Train Loss: 4.433 | Train PPL:  84.185
	 Val. Loss: 5.581 |  Val. PPL: 265.268






In [66]:
class BahdanauAttention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.Wa = nn.Linear(dec_hid_dim, enc_hid_dim)           # query → enc_hid_dim space
        self.Ua = nn.Linear(enc_hid_dim, enc_hid_dim)           # keys → enc_hid_dim space
        self.Va = nn.Linear(enc_hid_dim, 1)
    
    def forward(self, query, keys):
        # query:   [batch_size, dec_hid_dim]
        # keys:    [batch_size, src_len, enc_hid_dim]
        query_proj = self.Wa(query).unsqueeze(1)               # [batch, 1, enc_hid_dim]
        keys_proj  = self.Ua(keys)                             # [batch, src_len, enc_hid_dim]
        
        energy = torch.tanh(query_proj + keys_proj)            # [batch, src_len, enc_hid_dim]
        attention_scores = self.Va(energy).squeeze(2)          # [batch, src_len]
        
        return F.softmax(attention_scores, dim=1)              # [batch, src_len]

In [67]:
class EncoderBiLSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(
            emb_dim,
            hid_dim,
            num_layers=n_layers,
            bidirectional=True,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout)
        
        # Projection layers: bidirectional hidden → decoder hidden size
        self.hidden_projection = nn.Linear(hid_dim * 2, hid_dim)
        self.cell_projection   = nn.Linear(hid_dim * 2, hid_dim)
    
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)

        # hidden, cell: [2, batch, hid_dim]  (bidirectional)
        # Concat forward + backward
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)   # [batch, hid*2]
        cell   = torch.cat((cell[-2,:,:],   cell[-1,:,:]),   dim=1)

        # Project to decoder hidden size
        hidden = self.hidden_projection(hidden)   # [batch, hid_dim]
        cell   = self.cell_projection(cell)       # [batch, hid_dim]

        # MOST IMPORTANT: add num_layers dimension (1)
        hidden = hidden.unsqueeze(0)              # [1, batch, hid_dim]
        cell   = cell.unsqueeze(0)                # [1, batch, hid_dim]

        return outputs, hidden, cell

In [68]:
class DecoderAttentionLSTM(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, enc_hid_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX)
        self.attention = BahdanauAttention(enc_hid_dim, hid_dim)
        
        # LSTM input dim = embedding + context (enc_hid_dim)
        self.lstm = nn.LSTM(
            emb_dim + enc_hid_dim,
            hid_dim,
            num_layers=1,               # fixed to 1
            batch_first=True,
            dropout=0                   # n_layers=1 বলে dropout 0
        )
        
        self.fc_out = nn.Linear(emb_dim + hid_dim + enc_hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden, cell, encoder_outputs):
        # input: [batch_size]
        input = input.unsqueeze(1)                             # [batch, 1]
        embedded = self.dropout(self.embedding(input))         # [batch, 1, emb]
        
        # Attention
        attn_weights = self.attention(hidden, encoder_outputs)  # [batch, src_len]
        attn_weights = attn_weights.unsqueeze(1)                # [batch, 1, src_len]
        
        context = torch.bmm(attn_weights, encoder_outputs)      # [batch, 1, enc_hid_dim]
        
        # LSTM input
        lstm_input = torch.cat((embedded, context), dim=2)     # [batch, 1, emb + enc_hid]
        
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        
        # Prediction
        embedded = embedded.squeeze(1)
        output   = output.squeeze(1)
        context  = context.squeeze(1)
        
        pred_input = torch.cat((embedded, output, context), dim=1)
        prediction = self.fc_out(pred_input)
        
        return prediction, hidden, cell, attn_weights.squeeze(1)

In [75]:
class Seq2SeqAttention(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = len(target_vocab)
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        attentions = torch.zeros(batch_size, trg_len, src.shape[1]).to(self.device)
        
        enc_outputs, hidden, cell = self.encoder(src)
        
        input = trg[:, 0]  # <sos>
        
        for t in range(1, trg_len):
            output, hidden, cell, attn = self.decoder(input, hidden, cell, enc_outputs)
            outputs[:, t, :] = output
            attentions[:, t, :] = attn
            
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            
            input = trg[:, t] if teacher_force else top1
        
        return outputs, attentions

In [78]:
enc_attn = EncoderBiLSTM(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT)
dec_attn = DecoderAttentionLSTM(OUTPUT_DIM, EMB_DIM, HID_DIM, HID_DIM*2, DROPOUT)

model_attn = Seq2SeqAttention(enc_attn, dec_attn, DEVICE).to(DEVICE)

optimizer_attn = optim.Adam(model_attn.parameters(), lr=0.0005)
print(f"Attention Model parameters: {sum(p.numel() for p in model_attn.parameters() if p.requires_grad):,}")

Attention Model parameters: 24,969,195


In [79]:
# যদি memory error না আসে তাহলে BATCH_SIZE = 16 বা 24 রাখো
# যদি আসে তাহলে 12 বা 8 করে দাও

N_EPOCHS_ATTN = 10   # প্রথমে ১০ দিয়ে দেখো, পরে বাড়াতে পারো

best_valid_loss_attn = float('inf')

print("Starting Attention Model training...\n")

for epoch in range(N_EPOCHS_ATTN):
    start_time = time.time()
    
    train_loss = train_attention(model_attn, train_loader, optimizer_attn, criterion_attn, clip=1)
    valid_loss, sample_attns = evaluate_attention(model_attn, valid_loader, criterion_attn)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss_attn:
        best_valid_loss_attn = valid_loss
        torch.save(model_attn.state_dict(), 'attention_best.pt')
        print("** Saved best Attention model **")
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}\n')

Starting Attention Model training...



Training Attention:   0%|          | 0/133 [00:00<?, ?it/s]


RuntimeError: The size of tensor a (32) must match the size of tensor b (50) at non-singleton dimension 2