In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/btl-nlp-cleandata/test_cleaned.jsonl
/kaggle/input/btl-nlp-cleandata/train_cleaned.jsonl
/kaggle/input/btl-nlp-cleandata/__results__.html
/kaggle/input/btl-nlp-cleandata/__notebook__.ipynb
/kaggle/input/btl-nlp-cleandata/__output__.json
/kaggle/input/btl-nlp-cleandata/custom.css
/kaggle/input/btl-nlp-cleandata/.virtual_documents/__notebook_source__.ipynb
/kaggle/input/fork-of-btl-nlp-cleandata/test_cleaned.jsonl
/kaggle/input/fork-of-btl-nlp-cleandata/train_cleaned.jsonl
/kaggle/input/fork-of-btl-nlp-cleandata/__results__.html
/kaggle/input/fork-of-btl-nlp-cleandata/__huggingface_repos__.json
/kaggle/input/fork-of-btl-nlp-cleandata/__notebook__.ipynb
/kaggle/input/fork-of-btl-nlp-cleandata/valid_cleaned.jsonl
/kaggle/input/fork-of-btl-nlp-cleandata/__output__.json
/kaggle/input/fork-of-btl-nlp-cleandata/custom.css


In [2]:
from datasets import load_dataset
from datasets import DatasetDict


# Load data
data_files = {
    "train": "/kaggle/input/fork-of-btl-nlp-cleandata/train_cleaned.jsonl",
    "val":"/kaggle/input/fork-of-btl-nlp-cleandata/valid_cleaned.jsonl",
    "test": "/kaggle/input/fork-of-btl-nlp-cleandata/test_cleaned.jsonl"
}
dataset = load_dataset("json", data_files=data_files)

# Reassemble into final dataset structure
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': dataset['val'],  # Note: this is validation, not test
    'test': dataset['test']
})

print("="*60)
print("Final dataset structure:")
print(dataset)
print(f"  Train: {len(dataset['train'])} pairs")
print(f"  Validation: {len(dataset['validation'])} pairs")
print(f"  Test: {len(dataset['test'])} pairs")
print("="*60)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Final dataset structure:
DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 959482
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 11004
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 10917
    })
})
  Train: 959482 pairs
  Validation: 11004 pairs
  Test: 10917 pairs


In [3]:
import random

def inspect_dataset(dataset, num_samples=5):
    """Inspect random samples from dataset"""
    print("\n" + "="*60)
    print("DATASET INSPECTION")
    print("="*60)
    
    # Check for None/empty values
    for split in dataset.keys():
        print(f"\n{split.upper()} split:")
        print(f"  Total examples: {len(dataset[split])}")
        
        # Sample random examples
        indices = random.sample(range(len(dataset[split])), min(num_samples, len(dataset[split])))
        
        for idx in indices:
            example = dataset[split][idx]
            en = example['en']
            vi = example['vi']
            
            print(f"\n  Example {idx}:")
            print(f"    EN ({len(en.split())} words): {en[:100]}...")
            print(f"    VI ({len(vi.split())} words): {vi[:100]}...")
            
            # Sanity checks
            if not en or not vi:
                print("    ⚠️  WARNING: Empty field detected!")
            if len(en.split()) < 3 or len(vi.split()) < 3:
                print("    ⚠️  WARNING: Very short sentence!")

inspect_dataset(dataset)


DATASET INSPECTION

TRAIN split:
  Total examples: 959482

  Example 304084:
    EN (15 words): In 1569, Pope Pius V elevated him to the rank of Grand Duke of Tuscany....
    VI (17 words): Năm 1569, Qiáo Hoàng Pius V đã nâng ông lên vị trí Đại Công tước xứ Toscana....

  Example 101611:
    EN (11 words): No body's been found in the harbor or anywhere near it....
    VI (17 words): Không có cái xác nào được tìm thấy trong cảng cũng như khu vực quanh đó cả....

  Example 784670:
    EN (10 words): It appears you've been beaten by the ace of clubs....
    VI (11 words): Có vẻ cô đã bị đánh bại... bởi con Xì Chuồn....

  Example 468624:
    EN (6 words): Since when did you use formalities?...
    VI (9 words): Ngươi bắt đầu khách sáo từ khi nào vậy?...

  Example 162589:
    EN (17 words): You heard him say it. He's killed my father and now you will let him kill me....
    VI (19 words): Ông nghe hắn nói rồi đấy, hắn đã giết cha tôi, và giờ ông để cho hắn giết tôi....

VALIDATION split:

In [4]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, processors
from transformers import PreTrainedTokenizerFast
from datasets import DatasetDict
from tokenizers import decoders

def train_medical_tokenizer(
    dataset: DatasetDict,
    vocab_size: int = 40000,
    save_path: str = "tokenizer_medical_envi"
):
    """
    Train a robust byte-level BPE tokenizer optimized for EN-VI medical translation.
    """

    print("=" * 70)
    print("        TRAINING EN–VI MEDICAL BYTE-LEVEL TOKENIZER")
    print("=" * 70)

    # --------------------------------------------------------
    # 1. Build corpus generator
    # --------------------------------------------------------
    print("\n1. Preparing training corpus...")

    train_iter = dataset["train"].to_iterable_dataset()

    def corpus_generator():
        for ex in train_iter:
            if "en" in ex and ex["en"]:
                yield ex["en"]
            if "vi" in ex and ex["vi"]:
                yield ex["vi"]

    # --------------------------------------------------------
    # 2. Create tokenizer
    # --------------------------------------------------------
    print("2. Initializing byte-level BPE tokenizer...")

    tokenizer = Tokenizer(
    models.BPE(unk_token="<unk>", byte_fallback=True)
    )
    # --- normalization pipeline ---
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),       # fully decompose (Vietnamese safe)
        normalizers.Lowercase(), # lowercase everything
        normalizers.NFC(),       # recompose
        normalizers.Strip(),     # remove leading/trailing whitespace
    ])

    # --- byte-level pretokenizer ---
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    # Add this line to your tokenizer setup
    tokenizer.decoder = decoders.ByteLevel()

    # --------------------------------------------------------
    # 3. Trainer configuration
    # --------------------------------------------------------
    print("3. Configuring BPE trainer...")

    special_tokens = [
        "<pad>", "<unk>", "<s>", "</s>",
        "<en>", "<vi>"   # language tags
    ]

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=special_tokens,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        show_progress=True,
    )
    # --------------------------------------------------------
    # 4. Training
    # --------------------------------------------------------
    print(f"4. Training tokenizer on corpus (vocab={vocab_size})...")
    tokenizer.train_from_iterator(corpus_generator(), trainer=trainer)

    # --------------------------------------------------------
    # 5. Post-processing (add <s> and </s> automatically)
    # --------------------------------------------------------
    tokenizer.post_processor = processors.TemplateProcessing(
        single="<s> $A </s>",
        pair="<s> $A </s> <s> $B </s>",
        special_tokens=[
            ("<s>", tokenizer.token_to_id("<s>")),
            ("</s>", tokenizer.token_to_id("</s>")),
        ],
    )

    # --------------------------------------------------------
    # 6. Wrap as HF tokenizer
    # --------------------------------------------------------
    wrapped = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        additional_special_tokens=["<en>", "<vi>"],
        model_max_length=512,
    )

    # --------------------------------------------------------
    # 7. Save
    # --------------------------------------------------------
    print(f"\n5. Saving tokenizer to: {save_path}/")
    wrapped.save_pretrained(save_path)

    # --------------------------------------------------------
    # 8. Test samples
    # --------------------------------------------------------
    print("\n6. Testing tokenizer with sample medical text:")

    tests = [
        "Với cả hai phương pháp, không có hai bản in nào giống bản nào, nhưng cả hai đều cho ra hình ảnh ấn tượng."
    ]

    for t in tests:
        print("\nText:", t)
        print("Tokens:", wrapped.tokenize(t))
        print("IDs:", wrapped.encode(t))

    print("\n" + "=" * 70)
    print("✅ Tokenizer trained successfully!")
    print(f"   Final vocab size: {wrapped.vocab_size}")
    print(f"   Saved to: {save_path}/")
    print("=" * 70)

    return wrapped
tokenizer = train_medical_tokenizer( dataset, vocab_size=40000, save_path="medical_envi_tokenizer" )

        TRAINING EN–VI MEDICAL BYTE-LEVEL TOKENIZER

1. Preparing training corpus...
2. Initializing byte-level BPE tokenizer...
3. Configuring BPE trainer...
4. Training tokenizer on corpus (vocab=40000)...




5. Saving tokenizer to: medical_envi_tokenizer/

6. Testing tokenizer with sample medical text:

Text: Với cả hai phương pháp, không có hai bản in nào giống bản nào, nhưng cả hai đều cho ra hình ảnh ấn tượng.
Tokens: ['vá»Ľi', 'Ġcáº£', 'Ġhai', 'ĠphÆ°Æ¡ng', 'ĠphÃ¡p', ',', 'ĠkhÃ´ng', 'ĠcÃ³', 'Ġhai', 'Ġbáº£n', 'Ġin', 'ĠnÃło', 'Ġgiá»ĳng', 'Ġbáº£n', 'ĠnÃło', ',', 'ĠnhÆ°ng', 'Ġcáº£', 'Ġhai', 'ĠÄĳá»ģu', 'Ġcho', 'Ġra', 'ĠhÃ¬nh', 'Ġáº£nh', 'Ġáº¥n', 'ĠtÆ°á»£ng', '.']
IDs: [2, 10373, 550, 803, 1395, 1132, 17, 385, 362, 803, 802, 316, 784, 1356, 802, 784, 17, 629, 550, 803, 1421, 433, 487, 907, 1291, 2178, 1811, 19, 3]

✅ Tokenizer trained successfully!
   Final vocab size: 40000
   Saved to: medical_envi_tokenizer/


In [5]:
text = dataset["train"][0]["vi"]
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

print("Text:", text)
print("Tokens:", tokens)
print("IDs:", ids)


text = dataset["train"][0]["en"]
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

print("Text:", text)
print("Tokens:", tokens)
print("IDs:", ids)


Text: "nơi giáo dục và dìu dắt tôi để tôi có thể phụng sự nền Cộng Hòa"
Tokens: ['"', 'nÆ¡i', 'ĠgiÃ¡o', 'Ġdá»¥c', 'ĠvÃł', 'ĠdÃ¬', 'u', 'Ġdáº¯t', 'ĠtÃ´i', 'ĠÄĳá»ĥ', 'ĠtÃ´i', 'ĠcÃ³', 'Ġthá»ĥ', 'Ġphá»¥ng', 'Ġsá»±', 'Ġná»ģn', 'Ġcá»Ļng', 'ĠhÃ²a', '"']
IDs: [7, 17277, 1249, 2514, 321, 8591, 90, 8809, 381, 527, 381, 362, 488, 14180, 555, 2480, 1519, 2058, 7]
Text: "education and leading me to providing service to the Republic."
Tokens: ['"', 'educ', 'ation', 'Ġand', 'Ġleading', 'Ġme', 'Ġto', 'Ġproviding', 'Ġservice', 'Ġto', 'Ġthe', 'Ġrepublic', '."']
IDs: [7, 24241, 473, 332, 5302, 519, 315, 8660, 3251, 315, 281, 3541, 1097]


In [6]:
# Encode then decode
text = "Chiếc cúp trong lòng toi là chiếc cúp quý giá nhất"
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
print(f"Original: {text}")
print(f"encoded: {encoded}")
print(f"Decoded:  {decoded}")
print(f"Match: {text == decoded}")

Original: Chiếc cúp trong lòng toi là chiếc cúp quý giá nhất
encoded: [2, 30438, 6753, 403, 2580, 26510, 330, 1513, 6753, 3020, 1166, 702, 3]
Decoded:  <s>chiếc cúp trong lòng toi là chiếc cúp quý giá nhất</s>
Match: False


In [7]:
from torch.utils.data import Dataset
import torch  # You also need this for torch.tensor inside __getitem__
# ============================================================================
# DATASET CLASS
# ============================================================================
class TranslationDataset(Dataset):
    """Dataset for EN-VI medical translation"""
    def __init__(self, dataset, tokenizer, max_len=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.pad_id = tokenizer.pad_token_id
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Tokenize
        src_ids = self.tokenizer.encode(item['en'], add_special_tokens=True, max_length=self.max_len, truncation=True)
        tgt_ids = self.tokenizer.encode(item['vi'], add_special_tokens=True, max_length=self.max_len, truncation=True)
        
        return {
            'src': torch.tensor(src_ids, dtype=torch.long),
            'tgt': torch.tensor(tgt_ids, dtype=torch.long)
        }

In [8]:
def collate_fn(batch, pad_id=0):
    """Collate function with dynamic padding"""
    src_batch = [item['src'] for item in batch]
    tgt_batch = [item['tgt'] for item in batch]
    
    # Pad sequences
    src_padded = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=pad_id)
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=pad_id)
    
    return {
        'src': src_padded,
        'tgt': tgt_padded
    }



In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
import numpy as np

# ============================================================================
# POSITIONAL ENCODING
# ============================================================================
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for transformer"""
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                            (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        """
        Args:
            x: (batch_size, seq_len, d_model)
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [10]:
# ============================================================================
# MULTI-HEAD ATTENTION
# ============================================================================
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention mechanism"""
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
    
    def split_heads(self, x):
        """Split into multiple heads: (batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_k)"""
        batch_size, seq_len, d_model = x.size()
        return x.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
    
    def combine_heads(self, x):
        """Combine heads: (batch, num_heads, seq_len, d_k) -> (batch, seq_len, d_model)"""
        batch_size, num_heads, seq_len, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
    
    def forward(self, query, key, value, mask=None):
        """
        Args:
            query, key, value: (batch_size, seq_len, d_model)
            mask: (batch_size, 1, seq_len, seq_len) or (batch_size, 1, 1, seq_len)
        """
        batch_size = query.size(0)
        
        # Linear projections and split heads
        Q = self.split_heads(self.W_q(query))  # (batch, num_heads, seq_len, d_k)
        K = self.split_heads(self.W_k(key))
        V = self.split_heads(self.W_v(value))
        
        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        attn_output = torch.matmul(attn_weights, V)  # (batch, num_heads, seq_len, d_k)
        
        # Combine heads and final linear
        attn_output = self.combine_heads(attn_output)  # (batch, seq_len, d_model)
        output = self.W_o(attn_output)
        
        return output



In [11]:
# ============================================================================
# FEED FORWARD NETWORK
# ============================================================================
class FeedForward(nn.Module):
    """Position-wise feed-forward network"""
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))



In [12]:
# ============================================================================
# ENCODER LAYER
# ============================================================================
class EncoderLayer(nn.Module):
    """Single encoder layer with self-attention and feed-forward"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask):
        # Self-attention with residual connection
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout1(attn_output))
        
        # Feed-forward with residual connection
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout2(ff_output))
        
        return x



In [13]:
# ============================================================================
# DECODER LAYER
# ============================================================================
class DecoderLayer(nn.Module):
    """Single decoder layer with self-attention, cross-attention, and feed-forward"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
    
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        # Self-attention on target
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout1(attn_output))
        
        # Cross-attention on encoder output
        cross_attn_output = self.cross_attn(x, encoder_output, encoder_output, src_mask)
        x = self.norm2(x + self.dropout2(cross_attn_output))
        
        # Feed-forward
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout3(ff_output))
        
        return x



In [14]:
# ============================================================================
# TRANSFORMER MODEL
# ============================================================================
class TransformerTranslator(nn.Module):
    """Complete Transformer model for EN-VI medical translation"""
    def __init__(
        self,
        vocab_size,
        d_model=512,
        num_heads=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        d_ff=2048,
        max_len=512,
        dropout=0.1,
        pad_idx=0
    ):
        super().__init__()
        
        self.d_model = d_model
        self.pad_idx = pad_idx
        
        # Embeddings
        self.encoder_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.decoder_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        
        # Positional encoding
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout)
        
        # Encoder and Decoder stacks
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])
        
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])
        
        # Output projection
        self.output_projection = nn.Linear(d_model, vocab_size)
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        """Initialize weights using Xavier uniform"""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def make_src_mask(self, src):
        """Create padding mask for source: (batch, 1, 1, src_len)"""
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask
    
    def make_tgt_mask(self, tgt):
        """Create causal mask for target: (batch, 1, tgt_len, tgt_len)"""
        batch_size, tgt_len = tgt.size()
        
        # Padding mask
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)  # (batch, 1, 1, tgt_len)
        
        # Causal mask (lower triangular)
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_sub_mask = tgt_sub_mask.unsqueeze(0).unsqueeze(0)  # (1, 1, tgt_len, tgt_len)
        
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return tgt_mask
    
    def encode(self, src, src_mask):
        """Encode source sequence"""
        # Embedding + positional encoding
        x = self.encoder_embedding(src) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        # Pass through encoder layers
        for layer in self.encoder_layers:
            x = layer(x, src_mask)
        
        return x
    
    def decode(self, tgt, encoder_output, src_mask, tgt_mask):
        """Decode target sequence"""
        # Embedding + positional encoding
        x = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        # Pass through decoder layers
        for layer in self.decoder_layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        
        return x
    
    def forward(self, src, tgt):
        """
        Args:
            src: (batch_size, src_len)
            tgt: (batch_size, tgt_len)
        Returns:
            output: (batch_size, tgt_len, vocab_size)
        """
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        
        encoder_output = self.encode(src, src_mask)
        decoder_output = self.decode(tgt, encoder_output, src_mask, tgt_mask)
        
        output = self.output_projection(decoder_output)
        return output



In [15]:
# ============================================================================
# TRAINING FUNCTION
# ============================================================================
def train_epoch(model, dataloader, optimizer, criterion, device, grad_clip=1.0):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    
    pbar = tqdm(dataloader, desc="Training")
    for batch in pbar:
        src = batch['src'].to(device)
        tgt = batch['tgt'].to(device)
        
        # Teacher forcing: use tgt[:-1] as input, predict tgt[1:]
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(src, tgt_input)  # (batch, tgt_len-1, vocab_size)
        
        # Compute loss
        output = output.reshape(-1, output.size(-1))
        tgt_output = tgt_output.reshape(-1)
        loss = criterion(output, tgt_output)
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Evaluate the model"""
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            src = batch['src'].to(device)
            tgt = batch['tgt'].to(device)
            
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            
            output = model(src, tgt_input)
            
            output = output.reshape(-1, output.size(-1))
            tgt_output = tgt_output.reshape(-1)
            loss = criterion(output, tgt_output)
            
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [16]:
!pip install sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [17]:
def translate_sentence(model, tokenizer, sentence, device, max_len=100):
    model.eval()

    # Tokenize input
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        padding=False,
        truncation=True
    )
    src = encoded["input_ids"].to(device)

    # Decode using greedy search
    pred_ids = greedy_decode(model, src, tokenizer, max_len=max_len)[0].tolist()

    # Trim at EOS
    if tokenizer.eos_token_id in pred_ids:
        pred_ids = pred_ids[:pred_ids.index(tokenizer.eos_token_id)]

    # Convert to text
    translation = tokenizer.decode(pred_ids, skip_special_tokens=True)
    return translation


In [18]:
import random

def get_random_test_examples(dataset, n=5):
    indices = random.sample(range(len(dataset['test'])), n)
    return [dataset['test'][i]['en'] for i in indices]

# Example: Sample 5 random English sentences
medical_examples = get_random_test_examples(dataset, n=5)

print(medical_examples)


['- No, I thought that was you.', 'In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.', "So why should I trust you, Because you've already betrayed me.", 'And we asked other people, how fast were the cars going when they smashed into each other?', 'Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.']


In [19]:
# ============================================================================
# FULL TRAINING SCRIPT (CLEAN + FIXED BLEU + FIXED DECODE)
# ============================================================================

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import sacrebleu
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import PreTrainedTokenizerFast


# ============================================================================
# Greedy Decode
# ============================================================================
def greedy_decode(model, src, tokenizer, max_len=100):
    model.eval()
    device = src.device

    sos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id

    # Create source mask
    src_mask = model.make_src_mask(src)

    with torch.no_grad():

        # Encode source sequence
        memory = model.encode(src, src_mask)

        # Start decoder input with <sos>
        ys = torch.full(
            (src.size(0), 1),
            fill_value=sos_id,
            dtype=torch.long,
            device=device
        )

        for _ in range(max_len):

            # Create target/causal mask
            tgt_mask = model.make_tgt_mask(ys)

            # Decode
            out = model.decode(ys, memory, src_mask, tgt_mask)

            # Project to vocab & pick top token
            logits = model.output_projection(out[:, -1])  # last step
            next_word = torch.argmax(logits, dim=-1).unsqueeze(1)

            # Append
            ys = torch.cat([ys, next_word], dim=1)

            # Stop if all sentences predicted EOS
            if (next_word == eos_id).all():
                break

    return ys


# ============================================================================
# Proper BLEU Computation (Correct sacrebleu Format)
# ============================================================================

def compute_bleu(model, dataloader, tokenizer, device):
    model.eval()

    hypotheses = []
    reference_stream = []

    for batch in dataloader:

        # match collate_fn keys
        src = batch["src"].to(device)
        tgt = batch["tgt"]

        # Greedy decode predictions
        pred_ids = greedy_decode(model, src, tokenizer, max_len=100)

        for i in range(src.size(0)):

            # ----- Decode Prediction -----
            pred = pred_ids[i].tolist()
            if tokenizer.eos_token_id in pred:
                pred = pred[:pred.index(tokenizer.eos_token_id)]
            pred_text = tokenizer.decode(pred, skip_special_tokens=True)

            # ----- Decode Reference -----
            ref = tgt[i].tolist()
            if tokenizer.eos_token_id in ref:
                ref = ref[:ref.index(tokenizer.eos_token_id)]
            ref_text = tokenizer.decode(ref, skip_special_tokens=True)

            hypotheses.append(pred_text)
            reference_stream.append(ref_text)

    bleu = sacrebleu.corpus_bleu(hypotheses, [reference_stream])
    return bleu.score


# ============================================================================
# MAIN TRAINING LOOP
# ============================================================================

def train():
    import torch.optim as optim

    # Hyperparameters
    BATCH_SIZE = 64
    NUM_EPOCHS = 20
    LEARNING_RATE = 3e-4
    D_MODEL = 256
    NUM_HEADS = 8
    NUM_LAYERS = 4
    D_FF = 1024
    DROPOUT = 0.1
    MAX_LEN = 128
    WARMUP_STEPS = 4000

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load tokenizer
    tokenizer = PreTrainedTokenizerFast.from_pretrained("medical_envi_tokenizer")

    # Datasets
    train_dataset = TranslationDataset(dataset["train"], tokenizer, max_len=MAX_LEN)
    val_dataset = TranslationDataset(dataset["validation"], tokenizer, max_len=MAX_LEN)
    test_dataset = TranslationDataset(dataset["test"], tokenizer, max_len=MAX_LEN)
    
    # ✅ ADD THIS: Track metrics
    history = {
        'train_loss': [],
        'val_loss': [],
        'learning_rate': []
    }
    # Dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id),
        num_workers=2
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id),
        num_workers=2
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id),
        num_workers=2
    )

    # Model
    model = TransformerTranslator(
        vocab_size=tokenizer.vocab_size,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        num_encoder_layers=NUM_LAYERS,
        num_decoder_layers=NUM_LAYERS,
        d_ff=D_FF,
        max_len=MAX_LEN,
        dropout=DROPOUT,
        pad_idx=tokenizer.pad_token_id
    ).to(device)

    print(f"\nModel Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Loss + Optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    optimizer = optim.Adam(
        model.parameters(),
        lr=LEARNING_RATE,
        betas=(0.9, 0.98),
        eps=1e-9
    )

    scheduler = ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=0.5, 
    patience=2,
    verbose=True
)

    # Training
    best_val_loss = float("inf")

    for epoch in range(NUM_EPOCHS):
        print("\n" + "="*60)
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
        print("="*60)

        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss = evaluate(model, val_loader, criterion, device)

        print(f"\nTrain Loss: {train_loss:.4f}")
        print(f"Val Loss  : {val_loss:.4f}")

        scheduler.step(val_loss)

        current_lr = optimizer.param_groups[0]['lr']
        # ✅ Store metrics
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['learning_rate'].append(current_lr)
        
        print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss  : {val_loss:.4f}")
        print(f"Learning Rate: {current_lr:.6f}")
        print("\nSample Translations:")
        for s in medical_examples:
            translation = translate_sentence(model, tokenizer, s, device)
            print(f"EN: {s}")
            print(f"VI: {translation}")
            print("---")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "val_loss": val_loss,
            }, "best_medical_translator.pt")

            print(f"✓ Saved best model (val_loss={val_loss:.4f})")

    print("\nTraining complete!")

    # ============================================================================
    # BLEU Evaluation
    # ============================================================================

    print("\nCalculating BLEU on test set...")
    bleu_score = compute_bleu(model, test_loader, tokenizer, device)
    print(f"\nBLEU Score: {bleu_score:.2f}")
    print("="*60)


# Run training
if __name__ == "__main__":
    train()


Using device: cuda

Model Parameters: 38,132,800





Epoch 1/20


Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:03<00:00,  7.34it/s, loss=3.0453]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 4.2224
Val Loss  : 2.6909

Epoch 1/20
Train Loss: 4.2224
Val Loss  : 2.6909
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ anh là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp ứng, ba quyền lực tây ban nha phóng ra không khí để cung cấp công dân của berlin.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi nên tin tưởng anh, bởi vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, làm sao nhanh lên khi họ bị bắt vào mỗi người?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: loài chim và một số loài sát nhân có thể xác định, nhưng thay vì sự liên hệ của cha mẹ, sự liên hệ của họ đã được

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:00<00:00,  7.35it/s, loss=2.7137]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.7582
Val Loss  : 2.1844

Epoch 2/20
Train Loss: 2.7582
Val Loss  : 2.1844
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp ứng, ba cường quốc phương tây khởi động không quân berlin để cung cấp cho công dân berlin bằng không quân berlin.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy nên tại sao tôi tin anh, vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, nhanh như thế nào khi họ bị đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: loài chim và một số con bò sát có sự quyết định về di truyền của họ, nhưng thay vì giới tính được xác đ

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:02<00:00,  7.34it/s, loss=2.2418]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.4565
Val Loss  : 2.0291

Epoch 3/20
Train Loss: 2.4565
Val Loss  : 2.0291
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây đã phóng lên không khí berlin để cung cấp công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vì vậy, tại sao tôi tin anh, vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, nhanh đến mức nào khi họ bị phá vỡ thành nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: loài chim và một số loài bò sát có sự quan hệ tình dục xác định, nhưng thay vì quan hệ tình dục được xác địn

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:01<00:00,  7.34it/s, loss=2.2281]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.3300
Val Loss  : 1.9582

Epoch 4/20
Train Loss: 2.3300
Val Loss  : 1.9582
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: đáp lại, ba cường quốc phương tây ra mắt không khí berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi lại tin anh, bởi vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, xe nhanh đến mức nào khi họ đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: các loài chim và một số loài bò sát có tính hệ sinh dục của chúng xác định, nhưng thay vì tình dục được xác 

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:13<00:00,  7.30it/s, loss=2.5383]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.2560
Val Loss  : 1.9122

Epoch 5/20
Train Loss: 2.2560
Val Loss  : 1.9122
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là cô.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp ứng, ba cường quốc phương tây đã khởi động chuyến không quân berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi phải tin tưởng anh, bởi vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, xe hơi nhanh thế nào khi họ đập vỡ nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: loài chim và một số bò sát có tính giới tính của họ xác định, nhưng thay vì giới tính 

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:18<00:00,  7.28it/s, loss=2.0654]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.2033
Val Loss  : 1.8796

Epoch 6/20
Train Loss: 2.2033
Val Loss  : 1.8796
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là cô.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp ứng, ba cường quốc phía tây phóng lên không khí berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vì thế tại sao tôi tin cô, bởi vì cô đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, những chiếc xe này đi nhanh thế nào khi chúng bị đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số bò sát có giới tính về mặt gen xác định, nhưng thay vì quan hệ tình dục 

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:20<00:00,  7.28it/s, loss=2.0615]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.1617
Val Loss  : 1.8522

Epoch 7/20
Train Loss: 2.1617
Val Loss  : 1.8522
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp ứng, ba cường quốc phương tây đã tung ra việc nâng không khí berlin cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy sao tôi nên tin anh, bởi vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, xe sẽ nhanh cỡ nào khi họ đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số bò sát có tính giới tính quyết định về gen, nhưng thay vì giới tính được bố quyết tâm

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:19<00:00,  7.28it/s, loss=2.4572]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.1285
Val Loss  : 1.8356

Epoch 8/20
Train Loss: 2.1285
Val Loss  : 1.8356
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi tưởng đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: đáp lại, ba cường quốc phương tây khởi động hàng không berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vì thế tại sao tôi nên tin anh, vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, xe nhanh đến mức nào khi họ đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: những con chim và một số loài bò sát có tính giới tính của chúng, nhưng thay vì giới tính được xác định 

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:19<00:00,  7.28it/s, loss=2.1770]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.0998
Val Loss  : 1.8201

Epoch 9/20
Train Loss: 2.0998
Val Loss  : 1.8201
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi tưởng đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: đáp lại, ba cường quốc phương tây phóng chiếc máy bay áp berlin lên cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vì thế tại sao tôi phải tin cô, vì cô đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, những chiếc xe đó chạy nhanh thế nào khi họ đập lẫn nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số loài bò sát có tính dục của chúng xác định về mặt di truyền, nhưng thay

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:18<00:00,  7.28it/s, loss=2.0748]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.0745
Val Loss  : 1.8087

Epoch 10/20
Train Loss: 2.0745
Val Loss  : 1.8087
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây đã phóng không quân berlin lên để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vì vậy, tại sao tôi phải tin anh, bởi vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, xe tăng sẽ nhanh như thế nào khi họ đập nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số bò sát có tính di truyền tình dục của chúng được xác định, nhưng thay vì tình d

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:22<00:00,  7.27it/s, loss=2.2020]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.0507
Val Loss  : 1.7876

Epoch 11/20
Train Loss: 2.0507
Val Loss  : 1.7876
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp ứng, ba cường quốc phương tây khởi động chuyến bay berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi phải tin tưởng anh, vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, xe điện sẽ nhanh như thế nào khi họ đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số loài bò sát có tính chất về mặt di truyền, nhưng thay vì giới tính được xác định

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:19<00:00,  7.28it/s, loss=2.0929]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.0324
Val Loss  : 1.7785

Epoch 12/20
Train Loss: 2.0324
Val Loss  : 1.7785
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để phản ứng, ba cường quốc phương tây khởi động hàng không berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi nên tin anh, vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, xe sẽ nhanh như thế nào khi họ đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: nhóm chim và một số loài bò sát đã xác định về mặt di truyền, nhưng thay vì giới tính được xác định 

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:21<00:00,  7.27it/s, loss=1.9837]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.0223
Val Loss  : 1.7770

Epoch 13/20
Train Loss: 2.0223
Val Loss  : 1.7770
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi tưởng đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp trả, ba cường quốc phương tây khởi động cuộc không kích berlin để cung cấp cho công dân berlin bằng cách không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy sao tôi nên tin anh, vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, xe chạy nhanh đến mức nào khi họ đổ nát nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số loài bò sát có tính xác định về mặt di truyền, nhưng thay vì tình dục được xác

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:22<00:00,  7.27it/s, loss=2.0381]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 2.0085
Val Loss  : 1.7677

Epoch 14/20
Train Loss: 2.0085
Val Loss  : 1.7677
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây phóng lên berlin air chưa được cung cấp cho công dân berlin.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi lại tin tưởng anh, vì anh đã phản bội tôi rồi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, những chiếc xe chạy nhanh đến mức nào khi họ đập nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số loài bò sát đã xác định về mặt di truyền, nhưng thay vì tình dục được cha xác địn

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:19<00:00,  7.28it/s, loss=1.9893]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 1.9948
Val Loss  : 1.7556

Epoch 15/20
Train Loss: 1.9948
Val Loss  : 1.7556
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây đã phóng lên không khí berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vì vậy tại sao tôi phải tin anh, vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, những chiếc xe sẽ nhanh đến mức nào khi họ đập vỡ nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số loài bò sát đã xác định về mặt gen, nhưng thay vì giới tính được xác định b

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:18<00:00,  7.28it/s, loss=1.8900]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 1.9778
Val Loss  : 1.7529

Epoch 16/20
Train Loss: 1.9778
Val Loss  : 1.7529
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi tưởng đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp trả, ba cường quốc phương tây phóng không khí berlin lên để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi phải tin anh, vì anh đã phản bội tôi rồi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, xe hơi sẽ nhanh như thế nào khi họ đập nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số bò sát có tính chất tình dục của chúng xác định về mặt di truyền, nhưng thay vì tình dụ

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:19<00:00,  7.28it/s, loss=2.1811]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 1.9586
Val Loss  : 1.7302

Epoch 17/20
Train Loss: 1.9586
Val Loss  : 1.7302
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi tưởng đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây phóng sự gia tăng không quân berlin để cung cấp cho công dân berlin bằng không.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi phải tin anh, bởi vì anh đã phản bội tôi rồi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, xe hơi nhanh đến mức nào khi họ đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: đàn chim và một số bò sát đã xác định về mặt di truyền, nhưng thay vì tình dục được bố

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:19<00:00,  7.28it/s, loss=2.4310]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 1.9503
Val Loss  : 1.7377

Epoch 18/20
Train Loss: 1.9503
Val Loss  : 1.7377
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi tưởng đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây phóng máy bay berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi nên tin anh, bởi vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, xe hơi nhanh đến mức nào khi họ đập vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số bò sát có tình dục quyết định về mặt di truyền, nhưng thay vì tình dục được quyết định bởi bố

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:18<00:00,  7.28it/s, loss=2.1041]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 1.9417
Val Loss  : 1.7264

Epoch 19/20
Train Loss: 1.9417
Val Loss  : 1.7264
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi tưởng đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây đã phóng lên không khí berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi nên tin anh, bởi vì anh đã phản bội tôi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi người khác, xe đi nhanh đến đâu khi họ đập nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số bò sát có tính tình dục quyết tâm di truyền, nhưng thay vì giới tính được xác định bởi bố, tì

Training:   0%|          | 0/14992 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training: 100%|██████████| 14992/14992 [34:19<00:00,  7.28it/s, loss=1.9962]
Evaluating:   0%|          | 0/172 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS


Train Loss: 1.9317
Val Loss  : 1.7248

Epoch 20/20
Train Loss: 1.9317
Val Loss  : 1.7248
Learning Rate: 0.000300

Sample Translations:
EN: - No, I thought that was you.
VI: - không, tôi nghĩ đó là anh.
---
EN: In response, the three Western powers launch the Berlin Airlift to supply the citizens of Berlin by air.
VI: để đáp lại, ba cường quốc phương tây đã tung ra cuộc không kích berlin để cung cấp cho công dân berlin bằng không khí.
---
EN: So why should I trust you, Because you've already betrayed me.
VI: vậy tại sao tôi nên tin anh, bởi vì anh đã phản bội tôi rồi.
---
EN: And we asked other people, how fast were the cars going when they smashed into each other?
VI: và chúng tôi hỏi những người khác, những chiếc xe này đi nhanh đến mức nào khi bị phá vỡ vào nhau?
---
EN: Birds and some reptiles have their sex genetically determined, but instead of the sex being determined by dad, their sex is determined by mom.
VI: chim và một số loài bò sát có tính tình dục được xác định di truyền,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



BLEU Score: 38.05
