In [1]:
from datasets import load_dataset

# Load your dataset (replace with the actual name you are using)
dataset = load_dataset("ncduy/mt-en-vi")

# 1. Inspect the structure
print("Sample:", dataset["train"][0]) 
# Expected Output: {'translation': {'en': '...', 'vi': '...'}}
# If your columns are different (e.g., 'source', 'target'), you'll need to rename them.



README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/597M [00:00<?, ?B/s]

valid.csv:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2884451 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11316 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11225 [00:00<?, ? examples/s]

Sample: {'en': "- Sorry, that question's not on here.", 'vi': '- Xin lỗi, nhưng mà ở đây không có câu hỏi đấy.', 'source': 'OpenSubtitles v2018'}


In [2]:
# 2. Basic Filtering (Optional but recommended)
# Remove sentences that are too long or too short to save memory
def is_valid_length(example):
    en_len = len(example['en'].split())
    vi_len = len(example['vi'].split())
    return 1 < en_len < 50 and 1 < vi_len < 50

cleaned_dataset = {
    split: ds.filter(is_valid_length)
    for split, ds in dataset.items()
}
cleaned_dataset["train"] = cleaned_dataset["train"].shuffle(seed=42).select(range(400000))
print(f"Original: {len(dataset)}, Cleaned: {len(cleaned_dataset)}")

Filter:   0%|          | 0/2884451 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11316 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11225 [00:00<?, ? examples/s]

Original: 3, Cleaned: 3


In [3]:
# 1. Define the function to clean the text
def clean_and_format(example):
    # This modifies the dictionary in-place
    example['en'] = example['en'].lstrip('- ').strip()
    example['vi'] = example['vi'].lstrip('- ').strip()
    return example

# 2. Apply it using .map()
# We use 'remove_columns' to delete the 'source' column at the same time
cleaned_dataset = {
    split : ds.map(clean_and_format, remove_columns=['source'])
    for split, ds in cleaned_dataset.items()
}

# Check the result
print(cleaned_dataset["train"][0])

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10940 [00:00<?, ? examples/s]

Map:   0%|          | 0/10834 [00:00<?, ? examples/s]

{'en': "I'm not sure you're that much of a patriot.", 'vi': 'Tôi không nghĩ anh là 1 người yêu nước.'}


In [4]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, decoders

print("Starting tokenizer training...")
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
print("✓ Tokenizer initialized")

# Use NFC for Vietnamese - preserves diacritics correctly
tokenizer.normalizer = normalizers.NFC()
print("✓ Normalizer configured (NFC)")

# Whitespace + punctuation split (subword-friendly)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.decoder = decoders.BPEDecoder()
print("✓ Using Whitespace pre-tokenizer & BPE decoder")

trainer = trainers.BpeTrainer(
    vocab_size=32000,
    special_tokens=["<pad>", "<unk>", "<sos>", "<eos>"]
)
print("✓ Trainer configured with vocab_size=32000")

def text_iterator():
    for ex in cleaned_dataset["train"]:
        yield ex["en"]
        yield ex["vi"]

print("\nTraining tokenizer...")
tokenizer.train_from_iterator(text_iterator(), trainer)
print("✓ Training completed")

tokenizer.save("bilingual_tokenizer.json")
print("✓ Tokenizer saved to 'bilingual_tokenizer.json'")

print("\nExtracting special token IDs...")
special_tokens = {
    "pad": tokenizer.token_to_id("<pad>"),
    "unk": tokenizer.token_to_id("<unk>"),
    "sos": tokenizer.token_to_id("<sos>"),
    "eos": tokenizer.token_to_id("<eos>")
}
print(special_tokens)

Starting tokenizer training...
✓ Tokenizer initialized
✓ Normalizer configured (NFC)
✓ Using Whitespace pre-tokenizer & BPE decoder
✓ Trainer configured with vocab_size=32000

Training tokenizer...



✓ Training completed
✓ Tokenizer saved to 'bilingual_tokenizer.json'

Extracting special token IDs...
{'pad': 0, 'unk': 1, 'sos': 2, 'eos': 3}


In [32]:
#Encode dataset with tokenizer
from tokenizers import Tokenizer
import torch

tokenizer = Tokenizer.from_file("bilingual_tokenizer.json")

pad_id = tokenizer.token_to_id("<pad>")
sos_id = tokenizer.token_to_id("<sos>")
eos_id = tokenizer.token_to_id("<eos>")

def encode(text):
    ids = tokenizer.encode(text).ids
    return ids


encoded_dataset_train = []

for ex in cleaned_dataset["train"]:
    src = encode(ex["en"])
    tgt = encode(ex["vi"])
    encoded_dataset_train.append((src, tgt))



KeyError: 'valid'

In [33]:
encoded_dataset_valid = []

for ex in cleaned_dataset["validation"]:
    src = encode(ex["en"])
    tgt = encode(ex["vi"])
    encoded_dataset_valid.append((src, tgt))

encoded_dataset_test = []

for ex in cleaned_dataset["test"]:
    src = encode(ex["en"])
    tgt = encode(ex["vi"])
    encoded_dataset_test.append((src, tgt))

In [34]:
#Build PyTorch Dataset
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, pairs, sos_id, eos_id):
        self.pairs = pairs
        self.sos_id = sos_id
        self.eos_id = eos_id

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]

        # Add <sos> and <eos>
        src = src + [self.eos_id]
        tgt_input = [self.sos_id] + tgt
        tgt_output = tgt + [self.eos_id]

        return torch.tensor(src, dtype=torch.long), \
               torch.tensor(tgt_input, dtype=torch.long), \
               torch.tensor(tgt_output, dtype=torch.long)


In [35]:
#Build collate function (batching + padding)

def collate_fn(batch):
    src_batch, tgt_in_batch, tgt_out_batch = zip(*batch)

    #pad to the same length
    src_padded = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first = True, padding_value = pad_id)
    tgt_in_padded = torch.nn.utils.rnn.pad_sequence(tgt_in_batch, batch_first = True, padding_value = pad_id)
    tgt_out_padded = torch.nn.utils.rnn.pad_sequence(tgt_out_batch, batch_first = True, padding_value = pad_id)

    return src_padded, tgt_in_padded, tgt_out_padded

In [36]:
print(cleaned_dataset["validation"][0])

{'en': 'In August 1764, Bertin permitted the export of grain from twenty-seven French ports, later expanded to thirty-six.', 'vi': 'Tháng 8 năm 1764, Bertin lại cho phép xuất khẩu ngũ cốc từ 27 cảng của Pháp, sau đó mở rộng lên 36.'}


In [37]:
#Create dataloader
train_dataset = TranslationDataset(encoded_dataset_train, sos_id, eos_id)
valid_dataset = TranslationDataset(encoded_dataset_valid, sos_id, eos_id)
test_dataset = TranslationDataset(encoded_dataset_test, sos_id, eos_id)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn
)

In [38]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding"""
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                            -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        # x: (batch_size, seq_len, d_model)
        return x + self.pe[:, :x.size(1)]


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention mechanism"""
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Q, K, V: (batch_size, num_heads, seq_len, d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attn_weights = torch.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        output = torch.matmul(attn_weights, V)
        return output
    
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        # Linear projections
        Q = self.W_q(query)  # (batch_size, seq_len, d_model)
        K = self.W_k(key)
        V = self.W_v(value)
        
        # Split into multiple heads
        Q = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        # Apply attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        # Concatenate heads
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.d_model)
        
        # Final linear projection
        output = self.W_o(attn_output)
        return output


class FeedForward(nn.Module):
    """Position-wise feed-forward network"""
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


class EncoderLayer(nn.Module):
    """Single encoder layer"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        # Self-attention with residual connection
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Feed-forward with residual connection
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x


class DecoderLayer(nn.Module):
    """Single decoder layer"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # Masked self-attention with residual connection
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Cross-attention with residual connection
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        
        # Feed-forward with residual connection
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        
        return x


class Transformer(nn.Module):
    """Complete Transformer model for translation"""
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, 
                 num_heads=8, num_encoder_layers=6, num_decoder_layers=6,
                 d_ff=2048, dropout=0.1, max_len=5000, pad_idx=0):
        super().__init__()
        
        self.d_model = d_model
        self.pad_idx = pad_idx
        
        # Embeddings
        self.src_embedding = nn.Embedding(src_vocab_size, d_model, padding_idx=pad_idx)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model, padding_idx=pad_idx)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        
        # Encoder and Decoder stacks
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])
        
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])
        
        # Output projection
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        
        # Initialize parameters
        self._init_parameters()
    
    def _init_parameters(self):
        """Initialize parameters with Xavier uniform"""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def create_src_mask(self, src):
        """Create mask for source padding"""
        # src: (batch_size, src_len)
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        # (batch_size, 1, 1, src_len)
        return src_mask
    
    def create_tgt_mask(self, tgt):
        """Create mask for target (padding + look-ahead)"""
        batch_size, tgt_len = tgt.size()
        
        # Padding mask
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)
        # (batch_size, 1, 1, tgt_len)
        
        # Look-ahead mask (lower triangular)
        tgt_sub_mask = torch.tril(torch.ones(tgt_len, tgt_len, device=tgt.device)).bool()
        # (tgt_len, tgt_len)
        
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        # (batch_size, 1, tgt_len, tgt_len)
        return tgt_mask
    
    def encode(self, src, src_mask):
        """Encode source sequence"""
        # Embedding + positional encoding
        x = self.src_embedding(src) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        
        # Pass through encoder layers
        for layer in self.encoder_layers:
            x = layer(x, src_mask)
        
        return x
    
    def decode(self, tgt, enc_output, src_mask, tgt_mask):
        """Decode target sequence"""
        # Embedding + positional encoding
        x = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        
        # Pass through decoder layers
        for layer in self.decoder_layers:
            x = layer(x, enc_output, src_mask, tgt_mask)
        
        return x
    
    def forward(self, src, tgt):
        """Forward pass
        Args:
            src: (batch_size, src_len)
            tgt: (batch_size, tgt_len)
        Returns:
            output: (batch_size, tgt_len, tgt_vocab_size)
        """
        src_mask = self.create_src_mask(src)
        tgt_mask = self.create_tgt_mask(tgt)
        
        enc_output = self.encode(src, src_mask)
        dec_output = self.decode(tgt, enc_output, src_mask, tgt_mask)
        
        output = self.fc_out(dec_output)
        return output


# Assuming you have tokenizer loaded
vocab_size = tokenizer.get_vocab_size()

# Initialize model
model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=512,
    num_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    d_ff=2048,
    dropout=0.1,
    pad_idx=pad_id
)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(sum(p.numel() for p in model.parameters()))


93322496


In [43]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
import sys
import time

def print_progress(current, total, prefix=""):
    percent = 100 * (current / total)
    bar_len = 30
    filled = int(bar_len * current // total)
    bar = "#" * filled + "-" * (bar_len - filled)
    sys.stdout.write(f"\r{prefix} [{bar}] {percent:5.1f}%")
    sys.stdout.flush()

def train_epoch(model, train_loader, optimizer, criterion, device, epoch_idx, num_epochs):
    model.train()
    total_loss = 0
    total_batches = len(train_loader)

    for i, (src, tgt_in, tgt_out) in enumerate(train_loader, 1):        
        src = src.to(device)
        tgt_input = tgt_in.to(device)
        tgt_output = tgt_out.to(device)

        optimizer.zero_grad()

        # forward
        output = model(src, tgt_input)

        # loss
        output = output.reshape(-1, output.size(-1))
        tgt_output = tgt_output.reshape(-1)
        loss = criterion(output, tgt_output)

        # backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()

        print_progress(i, total_batches, prefix=f"Epoch {epoch_idx}/{num_epochs}")

    print()
    return total_loss / total_batches


def evaluate(model, valid_loader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for src, tgt_in, tgt_out in valid_loader:
            src = src.to(device)
            tgt_input = tgt_in.to(device)
            tgt_output = tgt_out.to(device)

            output = model(src, tgt_input)

            output = output.reshape(-1, output.size(-1))
            tgt_output = tgt_output.reshape(-1)
            loss = criterion(output, tgt_output)

            total_loss += loss.item()

    return total_loss / len(valid_loader)


# Training setup
criterion = nn.CrossEntropyLoss(ignore_index=special_tokens['pad'])
optimizer = Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

num_epochs = 15
best_val_loss = float('inf')

print("\nStarting training...\n")
for epoch in range(1, num_epochs + 1):
    start_time = time.time()
    
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device, epoch, num_epochs)
    val_loss = evaluate(model, valid_loader, criterion, device)
    
    scheduler.step(val_loss)
    epoch_time = time.time() - start_time
    
    print(f"Epoch {epoch}/{num_epochs} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Time: {epoch_time:.2f}s")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
        }, 'best_model.pt')
        print(f"✓ Saved best model (val_loss: {val_loss:.4f})\n")

print("\nTraining completed!")



Starting training...



KeyboardInterrupt: 

In [None]:
def translate(model, sentence, max_len=60):
    model.eval()

    src = torch.tensor(encode(sentence)).unsqueeze(0).to(device)
    tgt = torch.tensor([[sos_id]], dtype=torch.long).to(device)

    for _ in range(max_len):
        logits = model(src, tgt)
        next_id = logits[:, -1].argmax(dim=-1).unsqueeze(1)
        tgt = torch.cat([tgt, next_id], dim=1)
        if next_id.item() == eos_id:
            break
    
    tokens = tgt[0].tolist()[1:-1]   # remove sos & eos
    return tokenizer.decode(tokens)


In [None]:
print(translate(model, "I love machine learning."))
