In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import os
from nltk.translate.bleu_score import sentence_bleu
from torchmetrics.text import BLEUScore
from lion_pytorch import Lion
from torch.cuda.amp import autocast, GradScaler

# Define device: prefer the GPU whenever PyTorch can see one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device: ", device)
# Release cached allocator blocks left over from earlier runs in this kernel.
# Only meaningful when a GPU is actually in use.
if device.type == "cuda":
    torch.cuda.empty_cache()

# Paths to data files.
# The corpus directory can be overridden with the CCMATRIX_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original location.
_DEFAULT_DATA_DIR = "D:/JapaneseToEnglishDataset/CCMatrix"
DATA_DIR = (os.environ.get("CCMATRIX_DIR") or _DEFAULT_DATA_DIR).rstrip("/\\")
EN_FILE = f"{DATA_DIR}/CCMatrix.en-ja.en"
JA_FILE = f"{DATA_DIR}/CCMatrix.en-ja.ja"

  from .autonotebook import tqdm as notebook_tqdm


Device:  cuda


In [2]:
# Number of mini-batches whose gradients are accumulated before each
# optimizer step.
GRADIENT_ACCUMULATION_STEPS = 4
# Loss scaler for mixed-precision training. It is switched off explicitly when
# no GPU is available so CPU runs do not emit a "CUDA is not available" warning.
scaler = GradScaler(enabled=torch.cuda.is_available())


# Step 1: Tokenization and Vocabulary
# Use Hugging Face tokenizer (the same checkpoint for both languages; two
# instances are kept so per-side settings can be changed independently).
TOKENIZER_NAME = "facebook/nllb-200-distilled-600M"
tokenizer_src = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
tokenizer_tgt = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Size the embedding tables with len(tokenizer) rather than .vocab_size:
# .vocab_size reports only the base vocabulary, so any tokens added on top of
# it may have ids at or beyond that value and would index out of range.
en_vocab_size = len(tokenizer_src)
ja_vocab_size = len(tokenizer_tgt)

# Special tokens
PAD_IDX = tokenizer_src.pad_token_id
# Fall back to CLS/SEP only when BOS/EOS are genuinely missing (None).
# A plain `or` would also discard a valid id of 0.
BOS_IDX = tokenizer_src.bos_token_id
if BOS_IDX is None:
    BOS_IDX = tokenizer_src.cls_token_id
EOS_IDX = tokenizer_src.eos_token_id
if EOS_IDX is None:
    EOS_IDX = tokenizer_src.sep_token_id

# Step 2: Dataset Definition
class TranslationDataset(Dataset):
    """Line-aligned parallel corpus that yields (source_ids, target_ids) pairs.

    Both files are read fully into memory, one sentence per line. Line ``i`` of
    the source file is paired with line ``i`` of the target file.

    Args:
        src_file: Path to the UTF-8 source-language text file.
        tgt_file: Path to the UTF-8 target-language text file.
        src_tokenizer: Callable tokenizer applied to source lines. It is called
            as ``tokenizer(text, add_special_tokens=False, truncation=True)``
            and must return a mapping with an ``"input_ids"`` list.
        tgt_tokenizer: Same contract, applied to target lines.
        bos_idx: Id prepended to every sequence. Defaults to the module-level
            ``BOS_IDX``.
        eos_idx: Id appended to every sequence. Defaults to the module-level
            ``EOS_IDX``.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """

    def __init__(self, src_file, tgt_file, src_tokenizer, tgt_tokenizer, bos_idx=None, eos_idx=None):
        with open(src_file, encoding="utf-8") as f:
            self.src_data = f.readlines()
        with open(tgt_file, encoding="utf-8") as f:
            self.tgt_data = f.readlines()
        # A misaligned corpus would silently pair unrelated sentences (or fail
        # with an IndexError deep inside a DataLoader), so reject it up front.
        if len(self.src_data) != len(self.tgt_data):
            raise ValueError(
                f"Parallel files differ in length: {len(self.src_data)} source lines "
                f"vs {len(self.tgt_data)} target lines"
            )
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.bos_idx = BOS_IDX if bos_idx is None else bos_idx
        self.eos_idx = EOS_IDX if eos_idx is None else eos_idx

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_data)

    def _encode(self, tokenizer, line):
        """Tokenize one line and wrap it as ``[BOS, *ids, EOS]`` (int64 tensor)."""
        # add_special_tokens=False: BOS/EOS are added exactly once below.
        # Letting the tokenizer add its own specials as well would duplicate
        # the end-of-sequence marker.
        ids = tokenizer(line.strip(), add_special_tokens=False, truncation=True)["input_ids"]
        return torch.tensor([self.bos_idx, *ids, self.eos_idx], dtype=torch.long)

    def __getitem__(self, idx):
        """Return the ``(source_ids, target_ids)`` tensors for pair ``idx``."""
        return (
            self._encode(self.src_tokenizer, self.src_data[idx]),
            self._encode(self.tgt_tokenizer, self.tgt_data[idx]),
        )

# Batching helper, followed by dataset and dataloader creation
def collate_fn(batch):
    """Pad a list of (src, tgt) tensor pairs into two batch-first tensors.

    Args:
        batch: Iterable of ``(src, tgt)`` pairs of 1-D tensors.

    Returns:
        Tuple ``(src_batch, tgt_batch)``; each is the corresponding sequences
        right-padded with ``PAD_IDX`` to the longest sequence in the batch.
    """
    sources = [src for src, _ in batch]
    targets = [tgt for _, tgt in batch]
    pad = nn.utils.rnn.pad_sequence
    return (
        pad(sources, batch_first=True, padding_value=PAD_IDX),
        pad(targets, batch_first=True, padding_value=PAD_IDX),
    )


# Load the parallel corpus and wrap it in a shuffling loader that pads each
# batch through collate_fn.
data = TranslationDataset(
    src_file=EN_FILE,
    tgt_file=JA_FILE,
    src_tokenizer=tokenizer_src,
    tgt_tokenizer=tokenizer_tgt,
)
dataloader = DataLoader(
    dataset=data,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)

# Step 3: Define Transformer Model
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Token embeddings are summed with fixed sinusoidal position encodings and
    fed to ``nn.Transformer``; decoder states are projected to target-vocabulary
    logits. The decoder runs under a causal mask, and padding positions are
    excluded from attention on both sides.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and size
            of the output logits.
        embed_size: Model width (``d_model``).
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        ffn_hidden: Hidden width of the feed-forward sublayers.
        pad_idx: Padding token id. Defaults to the module-level ``PAD_IDX``.
            If it resolves to ``None`` no padding masks are applied.
        max_len: Longest sequence (in tokens) the position table supports.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size=512, num_heads=8, num_layers=6,
                 ffn_hidden=2048, pad_idx=None, max_len=5000):
        super().__init__()
        self.pad_idx = PAD_IDX if pad_idx is None else pad_idx
        self.src_embed = nn.Embedding(src_vocab_size, embed_size, padding_idx=self.pad_idx)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, embed_size, padding_idx=self.pad_idx)
        # NOTE: despite its name this attribute holds the whole encoder-decoder
        # stack. The name is kept so existing checkpoints still load.
        self.pos_encoder = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=ffn_hidden,
            batch_first=True
        )
        self.fc_out = nn.Linear(embed_size, tgt_vocab_size)
        # Non-persistent: the table is deterministic, so it is rebuilt on
        # construction instead of being written into the state dict.
        self.register_buffer(
            "positional_table",
            self._build_positional_table(max_len, embed_size),
            persistent=False,
        )

    @staticmethod
    def _build_positional_table(max_len, embed_size):
        """Return a ``[max_len, embed_size]`` sinusoidal position-encoding table."""
        positions = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        even_dims = torch.arange(0, embed_size, 2, dtype=torch.float32)
        inv_freq = torch.exp(-even_dims * (torch.log(torch.tensor(10000.0)) / embed_size))
        angles = positions * inv_freq
        table = torch.zeros(max_len, embed_size)
        table[:, 0::2] = torch.sin(angles)
        # For an odd embed_size there is one fewer cosine column than sine.
        table[:, 1::2] = torch.cos(angles)[:, : embed_size // 2]
        return table

    def _embed(self, embedding, token_ids):
        """Embed ``token_ids`` ([batch, seq]) and add position encodings."""
        seq_len = token_ids.size(1)
        if seq_len > self.positional_table.size(0):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.positional_table.size(0)}"
            )
        return embedding(token_ids) + self.positional_table[:seq_len]

    def forward(self, src, tgt):
        """Compute target-vocabulary logits.

        Args:
            src: Source token ids, ``[batch_size, src_len]``.
            tgt: Decoder input token ids, ``[batch_size, tgt_len]``.

        Returns:
            Logits of shape ``[batch_size, tgt_len, tgt_vocab_size]``.
        """
        if self.pad_idx is None:
            src_padding_mask = tgt_padding_mask = None
        else:
            # True marks padding positions that attention must ignore.
            src_padding_mask = src.eq(self.pad_idx)
            tgt_padding_mask = tgt.eq(self.pad_idx)

        # Causal mask (True = blocked): position i may only attend to
        # positions <= i. Without it the decoder can read the very tokens it
        # is trained to predict.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
            diagonal=1,
        )

        output = self.pos_encoder(
            self._embed(self.src_embed, src),  # [batch_size, src_len, embed_size]
            self._embed(self.tgt_embed, tgt),  # [batch_size, tgt_len, embed_size]
            tgt_mask=causal_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
        )
        return self.fc_out(output)


# Step 5: Training Loop
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Uses teacher forcing (the decoder input is the target without its last
    token; the label is the target without its first token), mixed precision
    on CUDA, and gradient accumulation over ``GRADIENT_ACCUMULATION_STEPS``
    batches. Relies on the module-level ``scaler``.

    Args:
        model: Network called as ``model(src, tgt_input)`` returning logits
            whose last dimension is the vocabulary.
        dataloader: Sized iterable of ``(src, tgt)`` batches of token ids.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits[N, V], labels[N])``.
        device: ``torch.device`` (or device string) to run on.

    Returns:
        Mean un-normalised loss per batch.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("train_epoch received an empty dataloader")

    device = torch.device(device)
    use_amp = device.type == "cuda"

    # The last accumulation group may hold fewer than
    # GRADIENT_ACCUMULATION_STEPS batches; its losses are divided by its real
    # size so that step is not under-weighted.
    tail_size = num_batches % GRADIENT_ACCUMULATION_STEPS
    tail_start = num_batches - tail_size

    model.train()
    total_loss = 0.0
    optimizer.zero_grad()

    for batch_idx, (src, tgt) in enumerate(dataloader):
        src, tgt = src.to(device), tgt.to(device)
        tgt_input = tgt[:, :-1]  # Decoder input: everything but the last token
        tgt_output = tgt[:, 1:]  # Labels: everything but the first token

        group_size = tail_size if batch_idx >= tail_start else GRADIENT_ACCUMULATION_STEPS

        # Mixed precision only on CUDA; elsewhere the context is a no-op.
        with torch.autocast(device_type="cuda" if use_amp else "cpu", enabled=use_amp):
            output = model(src, tgt_input)  # Forward pass
            batch_loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
            loss = batch_loss / group_size  # Normalize loss for accumulation

        # Backward pass with scaling
        scaler.scale(loss).backward()

        # Gradient accumulation: step every N batches and on the final batch
        is_last_batch = (batch_idx + 1) == num_batches
        if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0 or is_last_batch:
            scaler.step(optimizer)  # Update weights
            scaler.update()         # Adjust scaling factor
            optimizer.zero_grad()   # Reset gradients

        total_loss += batch_loss.item()  # Log the un-normalised loss

    return total_loss / num_batches


  scaler = GradScaler()


In [3]:
# Step 4: Training Setup
# Build the network and move it to the selected device before creating the
# optimizer, so the optimizer is constructed over the on-device parameters.
model = Transformer(src_vocab_size=en_vocab_size, tgt_vocab_size=ja_vocab_size)
model = model.to(device)
optimizer = Lion(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-2,
)
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


In [None]:
# Total number of passes over the training data.
EPOCHS = 10

# Train epoch by epoch, reporting the value returned by train_epoch each time.
for epoch in range(1, EPOCHS + 1):
    train_loss = train_epoch(
        model=model,
        dataloader=dataloader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    print(f"Epoch {epoch}, Loss: {train_loss:.4f}")

# Save the model (weights only, as a state dict)
torch.save(model.state_dict(), "transformer_translation.pt")
print("Model saved!")


  with autocast():
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
