<a href="https://www.kaggle.com/code/nguynvnln22028281/btl-nlp-medical-fine-tune?scriptVersionId=286612767" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
from datasets import load_dataset
from datasets import DatasetDict


# Load data
data_files = {
    "train": "/kaggle/input/btl-nlp-cleandata/train_cleaned.jsonl",
    "test": "/kaggle/input/btl-nlp-cleandata/test_cleaned.jsonl"
}
dataset = load_dataset("json", data_files=data_files)

# Create train/validation split (90/10 or 95/5)
train_val_split = dataset['train'].train_test_split(
    test_size=0.05,  # 5% for validation
    seed=42,
    shuffle=True  # IMPORTANT: Shuffle the data
)

# Reassemble into final dataset structure
dataset = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],  # Note: this is validation, not test
    'test': dataset['test']
})

print("="*60)
print("Final dataset structure:")
print(dataset)
print(f"  Train: {len(dataset['train'])} pairs")
print(f"  Validation: {len(dataset['validation'])} pairs")
print(f"  Test: {len(dataset['test'])} pairs")
print("="*60)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Final dataset structure:
DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 305942
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 16103
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 2793
    })
})
  Train: 305942 pairs
  Validation: 16103 pairs
  Test: 2793 pairs


In [2]:
def remove_train_overlaps(dataset):
    print("\n" + "="*60)
    print("REMOVING TRAIN ITEMS THAT ALSO APPEAR IN TEST (HF DATASET)")
    print("="*60)

    # Extract English sentences from test split
    test_en = set(dataset['test']['en'])   # fast and HF compatible

    old_size = len(dataset['train'])

    # Use .filter() to keep only examples not in test
    dataset['train'] = dataset['train'].filter(
        lambda ex: ex['en'] not in test_en
    )

    new_size = len(dataset['train'])
    removed = old_size - new_size

    print(f"Original train size: {old_size}")
    print(f"New train size     : {new_size}")
    print(f"Removed from train : {removed}")

    if removed > 0:
        print("‚úÖ Train cleaned and test set unchanged.")
    else:
        print("No overlaps found.")

    return dataset
dataset = remove_train_overlaps(dataset)

# Reassemble final dataset
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': train_val_split['test'],  # your val set
    'test': dataset['test']
})



REMOVING TRAIN ITEMS THAT ALSO APPEAR IN TEST (HF DATASET)


Filter:   0%|          | 0/305942 [00:00<?, ? examples/s]

Original train size: 305942
New train size     : 304286
Removed from train : 1656
‚úÖ Train cleaned and test set unchanged.


In [3]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("/kaggle/input/general-envi-tokenizer")


In [4]:
from torch.utils.data import Dataset
import torch  # You also need this for torch.tensor inside __getitem__
# ============================================================================
# DATASET CLASS
# ============================================================================
class TranslationDataset(Dataset):
    """Dataset for EN-VI medical translation"""
    def __init__(self, dataset, tokenizer, max_len=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.pad_id = tokenizer.pad_token_id
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Tokenize
        src_ids = self.tokenizer.encode(item['en'], add_special_tokens=True, max_length=self.max_len, truncation=True)
        tgt_ids = self.tokenizer.encode(item['vi'], add_special_tokens=True, max_length=self.max_len, truncation=True)
        
        return {
            'src': torch.tensor(src_ids, dtype=torch.long),
            'tgt': torch.tensor(tgt_ids, dtype=torch.long)
        }

In [5]:
def collate_fn(batch, pad_id=0):
    """Collate function with dynamic padding"""
    src_batch = [item['src'] for item in batch]
    tgt_batch = [item['tgt'] for item in batch]
    
    # Pad sequences
    src_padded = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=pad_id)
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=pad_id)
    
    return {
        'src': src_padded,
        'tgt': tgt_padded
    }



In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
import numpy as np

# ============================================================================
# POSITIONAL ENCODING
# ============================================================================
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for transformer"""
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                            (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        """
        Args:
            x: (batch_size, seq_len, d_model)
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [7]:
# ============================================================================
# MULTI-HEAD ATTENTION
# ============================================================================
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention mechanism"""
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
    
    def split_heads(self, x):
        """Split into multiple heads: (batch, seq_len, d_model) -> (batch, num_heads, seq_len, d_k)"""
        batch_size, seq_len, d_model = x.size()
        return x.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
    
    def combine_heads(self, x):
        """Combine heads: (batch, num_heads, seq_len, d_k) -> (batch, seq_len, d_model)"""
        batch_size, num_heads, seq_len, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
    
    def forward(self, query, key, value, mask=None):
        """
        Args:
            query, key, value: (batch_size, seq_len, d_model)
            mask: (batch_size, 1, seq_len, seq_len) or (batch_size, 1, 1, seq_len)
        """
        batch_size = query.size(0)
        
        # Linear projections and split heads
        Q = self.split_heads(self.W_q(query))  # (batch, num_heads, seq_len, d_k)
        K = self.split_heads(self.W_k(key))
        V = self.split_heads(self.W_v(value))
        
        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        attn_output = torch.matmul(attn_weights, V)  # (batch, num_heads, seq_len, d_k)
        
        # Combine heads and final linear
        attn_output = self.combine_heads(attn_output)  # (batch, seq_len, d_model)
        output = self.W_o(attn_output)
        
        return output



In [8]:
# ============================================================================
# FEED FORWARD NETWORK
# ============================================================================
class FeedForward(nn.Module):
    """Position-wise feed-forward network"""
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))



In [9]:
# ============================================================================
# ENCODER LAYER
# ============================================================================
class EncoderLayer(nn.Module):
    """Single encoder layer with self-attention and feed-forward"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask):
        # Self-attention with residual connection
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout1(attn_output))
        
        # Feed-forward with residual connection
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout2(ff_output))
        
        return x



In [10]:
# ============================================================================
# DECODER LAYER
# ============================================================================
class DecoderLayer(nn.Module):
    """Single decoder layer with self-attention, cross-attention, and feed-forward"""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
    
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        # Self-attention on target
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout1(attn_output))
        
        # Cross-attention on encoder output
        cross_attn_output = self.cross_attn(x, encoder_output, encoder_output, src_mask)
        x = self.norm2(x + self.dropout2(cross_attn_output))
        
        # Feed-forward
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout3(ff_output))
        
        return x



In [11]:
# ============================================================================
# TRANSFORMER MODEL
# ============================================================================
class TransformerTranslator(nn.Module):
    """Complete Transformer model for EN-VI medical translation"""
    def __init__(
        self,
        vocab_size,
        d_model=512,
        num_heads=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        d_ff=2048,
        max_len=512,
        dropout=0.1,
        pad_idx=0
    ):
        super().__init__()
        
        self.d_model = d_model
        self.pad_idx = pad_idx
        
        # Embeddings
        self.encoder_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        self.decoder_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        
        # Positional encoding
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout)
        
        # Encoder and Decoder stacks
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_encoder_layers)
        ])
        
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_decoder_layers)
        ])
        
        # Output projection
        self.output_projection = nn.Linear(d_model, vocab_size)
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        """Initialize weights using Xavier uniform"""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    
    def make_src_mask(self, src):
        """Create padding mask for source: (batch, 1, 1, src_len)"""
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask
    
    def make_tgt_mask(self, tgt):
        """Create causal mask for target: (batch, 1, tgt_len, tgt_len)"""
        batch_size, tgt_len = tgt.size()
        
        # Padding mask
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(2)  # (batch, 1, 1, tgt_len)
        
        # Causal mask (lower triangular)
        tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
        tgt_sub_mask = tgt_sub_mask.unsqueeze(0).unsqueeze(0)  # (1, 1, tgt_len, tgt_len)
        
        tgt_mask = tgt_pad_mask & tgt_sub_mask
        return tgt_mask
    
    def encode(self, src, src_mask):
        """Encode source sequence"""
        # Embedding + positional encoding
        x = self.encoder_embedding(src) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        # Pass through encoder layers
        for layer in self.encoder_layers:
            x = layer(x, src_mask)
        
        return x
    
    def decode(self, tgt, encoder_output, src_mask, tgt_mask):
        """Decode target sequence"""
        # Embedding + positional encoding
        x = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        # Pass through decoder layers
        for layer in self.decoder_layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        
        return x
    
    def forward(self, src, tgt):
        """
        Args:
            src: (batch_size, src_len)
            tgt: (batch_size, tgt_len)
        Returns:
            output: (batch_size, tgt_len, vocab_size)
        """
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        
        encoder_output = self.encode(src, src_mask)
        decoder_output = self.decode(tgt, encoder_output, src_mask, tgt_mask)
        
        output = self.output_projection(decoder_output)
        return output



In [12]:
# ============================================================================
# TRAINING FUNCTION
# ============================================================================
def train_epoch(model, dataloader, optimizer, criterion, device, grad_clip=1.0):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    
    pbar = tqdm(dataloader, desc="Training")
    for batch in pbar:
        src = batch['src'].to(device)
        tgt = batch['tgt'].to(device)
        
        # Teacher forcing: use tgt[:-1] as input, predict tgt[1:]
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(src, tgt_input)  # (batch, tgt_len-1, vocab_size)
        
        # Compute loss
        output = output.reshape(-1, output.size(-1))
        tgt_output = tgt_output.reshape(-1)
        loss = criterion(output, tgt_output)
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Evaluate the model"""
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            src = batch['src'].to(device)
            tgt = batch['tgt'].to(device)
            
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            
            output = model(src, tgt_input)
            
            output = output.reshape(-1, output.size(-1))
            tgt_output = tgt_output.reshape(-1)
            loss = criterion(output, tgt_output)
            
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [13]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [14]:
def translate_sentence(model, tokenizer, sentence, device, max_len=100):
    model.eval()

    # Tokenize input
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        padding=False,
        truncation=True
    )
    src = encoded["input_ids"].to(device)

    # Decode using greedy search
    pred_ids = greedy_decode(model, src, tokenizer, max_len=max_len)[0].tolist()

    # Trim at EOS
    if tokenizer.eos_token_id in pred_ids:
        pred_ids = pred_ids[:pred_ids.index(tokenizer.eos_token_id)]

    # Convert to text
    translation = tokenizer.decode(pred_ids, skip_special_tokens=True)
    return translation


In [15]:
import random

def get_random_test_examples(dataset, n=5):
    indices = random.sample(range(len(dataset['test'])), n)
    return [dataset['test'][i]['en'] for i in indices]

# Example: Sample 5 random English sentences
medical_examples = get_random_test_examples(dataset, n=5)

print(medical_examples)


['A cross-sectional descriptive study was performed to assess the hearing status of armored tank soldiers.', 'Pherochromocytomas is a rare disease in children with the estimated incidence is about 1 per 50.000 to 100.000 children.', 'The most common lesions are: frosted glass (91.5%), solidified (22.6%), interstitial thickening (14.2%).', 'Health related quality of life score and standard deviation of all 324 study subjects were 0.874 ¬± 0.216, respectively.', 'Surgical outcomes of cerebellopontine angle tumors']


In [16]:
import sacrebleu

# ============================================================================
# Greedy Decode
# ============================================================================
def greedy_decode(model, src, tokenizer, max_len=100):
    model.eval()
    device = src.device

    sos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id

    # Create source mask
    src_mask = model.make_src_mask(src)

    with torch.no_grad():

        # Encode source sequence
        memory = model.encode(src, src_mask)

        # Start decoder input with <sos>
        ys = torch.full(
            (src.size(0), 1),
            fill_value=sos_id,
            dtype=torch.long,
            device=device
        )

        for _ in range(max_len):

            # Create target/causal mask
            tgt_mask = model.make_tgt_mask(ys)

            # Decode
            out = model.decode(ys, memory, src_mask, tgt_mask)

            # Project to vocab & pick top token
            logits = model.output_projection(out[:, -1])  # last step
            next_word = torch.argmax(logits, dim=-1).unsqueeze(1)

            # Append
            ys = torch.cat([ys, next_word], dim=1)

            # Stop if all sentences predicted EOS
            if (next_word == eos_id).all():
                break

    return ys




In [17]:
# ============================================================================
# Proper BLEU Computation (Correct sacrebleu Format)
# ============================================================================

def compute_bleu(model, dataloader, tokenizer, device):
    model.eval()

    hypotheses = []
    reference_stream = []

    for batch in dataloader:

        # match collate_fn keys
        src = batch["src"].to(device)
        tgt = batch["tgt"]

        # Greedy decode predictions
        pred_ids = greedy_decode(model, src, tokenizer, max_len=100)

        for i in range(src.size(0)):

            # ----- Decode Prediction -----
            pred = pred_ids[i].tolist()
            if tokenizer.eos_token_id in pred:
                pred = pred[:pred.index(tokenizer.eos_token_id)]
            pred_text = tokenizer.decode(pred, skip_special_tokens=True)

            # ----- Decode Reference -----
            ref = tgt[i].tolist()
            if tokenizer.eos_token_id in ref:
                ref = ref[:ref.index(tokenizer.eos_token_id)]
            ref_text = tokenizer.decode(ref, skip_special_tokens=True)

            hypotheses.append(pred_text)
            reference_stream.append(ref_text)

    bleu = sacrebleu.corpus_bleu(hypotheses, [reference_stream])
    return bleu.score



In [18]:

import torch.optim as optim
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

# Fine-tuning Hyperparameters (ADJUSTED FOR 9.4 POINT GAP)
BATCH_SIZE = 32
NUM_EPOCHS = 4              # More epochs due to significant gap
LEARNING_RATE = 5e-5        # Standard fine-tuning LR
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
MAX_GRAD_NORM = 1.0

# Model architecture (unchanged)
D_MODEL = 256
NUM_HEADS = 8
NUM_LAYERS = 4
D_FF = 1024
DROPOUT = 0.15
MAX_LEN = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Datasets
train_dataset = TranslationDataset(dataset["train"], tokenizer, max_len=MAX_LEN)
val_dataset = TranslationDataset(dataset["validation"], tokenizer, max_len=MAX_LEN)
test_dataset = TranslationDataset(dataset["test"], tokenizer, max_len=MAX_LEN)

# Dataloaders
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id),
    num_workers=2
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id),
    num_workers=2
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=lambda b: collate_fn(b, tokenizer.pad_token_id),
    num_workers=2
)

# Model
model = TransformerTranslator(
    vocab_size=tokenizer.vocab_size,
    d_model=D_MODEL, num_heads=NUM_HEADS,
    num_encoder_layers=NUM_LAYERS,
    num_decoder_layers=NUM_LAYERS,
    d_ff=D_FF, max_len=MAX_LEN,
    dropout=DROPOUT,
    pad_idx=tokenizer.pad_token_id
).to(device)

# Load pre-trained checkpoint
checkpoint = torch.load(
    "/kaggle/input/btl-nlp-1-general/best_medical_translator.pt",
    map_location=device
)

if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"‚úì Loaded pre-trained checkpoint")
    print(f"  General training val loss: {checkpoint.get('val_loss', 'N/A')}")
else:
    model.load_state_dict(checkpoint)

model.to(device)
print(f"Model Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Loss with label smoothing
criterion = nn.CrossEntropyLoss(
    ignore_index=tokenizer.pad_token_id,
    label_smoothing=0.1
)

# Optimizer
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    betas=(0.9, 0.999),
    eps=1e-9,
    weight_decay=WEIGHT_DECAY
)

# Scheduler with warmup
num_training_samples = len(train_dataset)
steps_per_epoch = num_training_samples // BATCH_SIZE
total_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

print(f"\n{'='*60}")
print("TRAINING CONFIGURATION")
print(f"{'='*60}")
print(f"Training samples:  {num_training_samples:,}")
print(f"Steps per epoch:   {steps_per_epoch:,}")
print(f"Total steps:       {total_steps:,}")
print(f"Warmup steps:      {warmup_steps:,}")
print(f"Learning rate:     {LEARNING_RATE:.0e}")
print(f"Batch size:        {BATCH_SIZE}")
print(f"Epochs:            {NUM_EPOCHS}")
print(f"{'='*60}\n")



Using device: cuda
‚úì Loaded pre-trained checkpoint
  General training val loss: 1.7248256816420444
Model Parameters: 38,132,800

TRAINING CONFIGURATION
Training samples:  304,286
Steps per epoch:   9,508
Total steps:       38,032
Warmup steps:      3,803
Learning rate:     5e-05
Batch size:        32
Epochs:            4



In [20]:
def lr_lambda(current_step):
    if current_step < warmup_steps:
        return float(current_step) / float(max(1, warmup_steps))
    return max(0.0, float(total_steps - current_step) / float(max(1, total_steps - warmup_steps)))

scheduler = LambdaLR(optimizer, lr_lambda)

# Baseline: Test pre-trained model on medical data
print("="*60)
print("BASELINE: TESTING PRE-TRAINED MODEL ON MEDICAL DATA")
print("="*60)
baseline_bleu = compute_bleu(model, test_loader, tokenizer, device)
print(f"Pre-trained Medical BLEU: {baseline_bleu:.2f}")
print(f"Target after fine-tuning: {baseline_bleu + 8:.2f} - {baseline_bleu + 12:.2f}")
print(f"{'='*60}\n")

# Training loop
best_val_loss = float("inf")
best_medical_bleu = baseline_bleu  # Start from baseline
patience_counter = 0
patience = 3  # Increased patience since we expect continuous improvement
global_step = 0

for epoch in range(NUM_EPOCHS):
    print("\n" + "="*60)
    print(f"EPOCH {epoch + 1}/{NUM_EPOCHS}")
    print("="*60)
    
    # Training
    model.train()
    total_loss = 0
    
    for batch_idx, batch in enumerate(train_loader):
        src = batch["src"].to(device)
        tgt = batch["tgt"].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(src, tgt[:, :-1])
        
        # Calculate loss
        output = output.contiguous().view(-1, output.size(-1))
        tgt_out = tgt[:, 1:].contiguous().view(-1)
        loss = criterion(output, tgt_out)
        
        # Backward pass
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_GRAD_NORM)
        
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        global_step += 1
        
        # Log progress every 500 steps
        if (batch_idx + 1) % 500 == 0:
            current_lr = scheduler.get_last_lr()[0]
            avg_loss = total_loss / (batch_idx + 1)
            print(f"  Step {global_step:,}/{total_steps:,} | "
                  f"Loss: {loss.item():.4f} | "
                  f"Avg: {avg_loss:.4f} | "
                  f"LR: {current_lr:.2e}")
    
    train_loss = total_loss / len(train_loader)
    
    # Validation
    val_loss = evaluate(model, val_loader, criterion, device)
    current_lr = scheduler.get_last_lr()[0]
    
    # Calculate BLEU on medical test set
    print("\nEvaluating BLEU score...")
    bleu_medical = compute_bleu(model, test_loader, tokenizer, device)
    
    # Monitor performance
    print(f"\n{'='*60}")
    print(f"EPOCH {epoch + 1} RESULTS")
    print(f"{'='*60}")
    print(f"Train Loss:     {train_loss:.4f}")
    print(f"Val Loss:       {val_loss:.4f}")
    print(f"Medical BLEU:   {bleu_medical:.2f}")
    print(f"Improvement:    {bleu_medical - baseline_bleu:+.2f} points from baseline")
    print(f"Learning Rate:  {current_lr:.2e}")
    
    # Progress indicator
    target_bleu = 38  # Match general performance
    progress = min(100, ((bleu_medical - baseline_bleu) / (target_bleu - baseline_bleu)) * 100)
    if progress > 0:
        print(f"Progress:       {progress:.1f}% toward target BLEU {target_bleu}")
    
    if bleu_medical < 32:
        print("Status:         ‚ö†Ô∏è  Needs more training")
    elif bleu_medical < 36:
        print("Status:         üìà Good progress")
    elif bleu_medical < 38:
        print("Status:         ‚úì  Near target")
    else:
        print("Status:         üéâ Excellent!")
    print(f"{'='*60}\n")
    
    # Sample translations
    print("Sample Medical Translations:")
    for i, s in enumerate(medical_examples[:3], 1):
        translation = translate_sentence(model, tokenizer, s, device)
        print(f"{i}. EN: {s}")
        print(f"   VI: {translation}\n")
    
    # Save best model based on BLEU
    if bleu_medical > best_medical_bleu:
        improvement = bleu_medical - best_medical_bleu
        best_medical_bleu = bleu_medical
        patience_counter = 0
        
        torch.save({
            "epoch": epoch + 1,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "val_loss": val_loss,
            "train_loss": train_loss,
            "medical_bleu": bleu_medical,
            "baseline_bleu": baseline_bleu,
            "improvement": bleu_medical - baseline_bleu,
        }, "best_medical_finetuned.pt")
        
        print(f"‚úì Saved best model (BLEU: {bleu_medical:.2f}, +{improvement:.2f})")
    else:
        patience_counter += 1
        print(f"No BLEU improvement (patience: {patience_counter}/{patience})")
        
        if patience_counter >= patience:
            print(f"\n‚ö†Ô∏è  Early stopping at epoch {epoch + 1}")
            print(f"Best Medical BLEU: {best_medical_bleu:.2f}")
            break

# Final evaluation
print("\n" + "="*60)
print("FINE-TUNING COMPLETE!")
print("="*60)

# Load best model
checkpoint = torch.load("best_medical_finetuned.pt")
model.load_state_dict(checkpoint['model_state_dict'])

print(f"\nBest model from epoch {checkpoint['epoch']}")
print(f"  Medical BLEU:      {checkpoint['medical_bleu']:.2f}")
print(f"  Baseline BLEU:     {checkpoint['baseline_bleu']:.2f}")
print(f"  Total improvement: {checkpoint['improvement']:+.2f} points")
print(f"  Val Loss:          {checkpoint['val_loss']:.4f}")

# Final test set evaluation
print("\n" + "="*60)
print("FINAL EVALUATION")
print("="*60)
final_bleu = compute_bleu(model, test_loader, tokenizer, device)
print(f"Final Medical BLEU:    {final_bleu:.2f}")
print(f"Pre-trained BLEU:      {baseline_bleu:.2f}")
print(f"Improvement:           {final_bleu - baseline_bleu:+.2f} points")
print(f"General BLEU target:   38.0")

if final_bleu >= 38:
    print(f"\nüéâ SUCCESS! Matched/exceeded general performance!")
elif final_bleu >= 35:
    print(f"\n‚úì Good result! Close to general performance")
elif final_bleu >= 32:
    print(f"\nüìà Solid improvement, but room to grow")

print("="*60)

BASELINE: TESTING PRE-TRAINED MODEL ON MEDICAL DATA
Pre-trained Medical BLEU: 28.60
Target after fine-tuning: 36.60 - 40.60


EPOCH 1/4
  Step 500/38,032 | Loss: 4.5298 | Avg: 4.9805 | LR: 6.57e-06
  Step 1,000/38,032 | Loss: 4.4007 | Avg: 4.7309 | LR: 1.31e-05
  Step 1,500/38,032 | Loss: 4.0215 | Avg: 4.5045 | LR: 1.97e-05
  Step 2,000/38,032 | Loss: 3.6439 | Avg: 4.3464 | LR: 2.63e-05
  Step 2,500/38,032 | Loss: 4.0119 | Avg: 4.2235 | LR: 3.29e-05
  Step 3,000/38,032 | Loss: 3.2410 | Avg: 4.1200 | LR: 3.94e-05
  Step 3,500/38,032 | Loss: 3.6721 | Avg: 4.0337 | LR: 4.60e-05
  Step 4,000/38,032 | Loss: 3.3030 | Avg: 3.9587 | LR: 4.97e-05
  Step 4,500/38,032 | Loss: 3.3949 | Avg: 3.8945 | LR: 4.90e-05
  Step 5,000/38,032 | Loss: 3.0688 | Avg: 3.8388 | LR: 4.83e-05
  Step 5,500/38,032 | Loss: 3.1878 | Avg: 3.7884 | LR: 4.75e-05
  Step 6,000/38,032 | Loss: 3.0958 | Avg: 3.7447 | LR: 4.68e-05
  Step 6,500/38,032 | Loss: 3.1641 | Avg: 3.7049 | LR: 4.61e-05
  Step 7,000/38,032 | Loss: 3.1738

Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 504/504 [00:17<00:00, 29.15it/s]


Evaluating BLEU score...






EPOCH 1 RESULTS
Train Loss:     3.5304
Val Loss:       2.9319
Medical BLEU:   40.51
Improvement:    +11.92 points from baseline
Learning Rate:  4.17e-05
Progress:       100.0% toward target BLEU 38
Status:         üéâ Excellent!

Sample Medical Translations:
1. EN: A cross-sectional descriptive study was performed to assess the hearing status of armored tank soldiers.
   VI: nghi√™n c·ª©u m√¥ t·∫£ c·∫Øt ngang ƒë∆∞·ª£c th·ª±c hi·ªán nh·∫±m ƒë√°nh gi√° t√¨nh tr·∫°ng nghe c·ªßa c√°c binh sƒ© xe b·ªçc th√©p.

2. EN: Pherochromocytomas is a rare disease in children with the estimated incidence is about 1 per 50.000 to 100.000 children.
   VI: pherochromocytomas l√† m·ªôt b·ªánh hi·∫øm g·∫∑p ·ªü tr·∫ª em c√≥ t·ª∑ l·ªá ∆∞·ªõc t√≠nh kho·∫£ng 1 tr√™n 50,000 ƒë·∫øn 100,000 tr·∫ª em.

3. EN: The most common lesions are: frosted glass (91.5%), solidified (22.6%), interstitial thickening (14.2%).
   VI: c√°c t·ªïn th∆∞∆°ng ph·ªï bi·∫øn nh·∫•t l√†: h·ªôi ch·ª©ng gi·∫£m gi√° (91,5%), ƒë·∫∑c hi·ªáu 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Step 10,009/38,032 | Loss: 2.9372 | Avg: 3.0690 | LR: 4.09e-05
  Step 10,509/38,032 | Loss: 3.1597 | Avg: 3.0713 | LR: 4.02e-05
  Step 11,009/38,032 | Loss: 3.0062 | Avg: 3.0643 | LR: 3.95e-05
  Step 11,509/38,032 | Loss: 3.0953 | Avg: 3.0605 | LR: 3.87e-05
  Step 12,009/38,032 | Loss: 3.1527 | Avg: 3.0553 | LR: 3.80e-05
  Step 12,509/38,032 | Loss: 2.9949 | Avg: 3.0510 | LR: 3.73e-05
  Step 13,009/38,032 | Loss: 2.9143 | Avg: 3.0474 | LR: 3.66e-05
  Step 13,509/38,032 | Loss: 2.8249 | Avg: 3.0440 | LR: 3.58e-05
  Step 14,009/38,032 | Loss: 3.2429 | Avg: 3.0409 | LR: 3.51e-05
  Step 14,509/38,032 | Loss: 2.8653 | Avg: 3.0377 | LR: 3.44e-05
  Step 15,009/38,032 | Loss: 3.0110 | Avg: 3.0354 | LR: 3.36e-05
  Step 15,509/38,032 | Loss: 2.8685 | Avg: 3.0311 | LR: 3.29e-05
  Step 16,009/38,032 | Loss: 2.8930 | Avg: 3.0281 | LR: 3.22e-05
  Step 16,509/38,032 | Loss: 3.0310 | Avg: 3.0244 | LR: 3.14e-05
  Step 17,009/38,032 | Loss: 3.0538 | Avg: 3.0202 | LR: 3.07e-05
  Step 17,509/38,032 | Lo

Evaluating:   0%|          | 0/504 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 504/504 [00:17<00:00, 29.42it/s]


Evaluating BLEU score...



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



EPOCH 2 RESULTS
Train Loss:     3.0080
Val Loss:       2.8105
Medical BLEU:   42.48
Improvement:    +13.89 points from baseline
Learning Rate:  2.78e-05
Progress:       100.0% toward target BLEU 38
Status:         üéâ Excellent!

Sample Medical Translations:
1. EN: A cross-sectional descriptive study was performed to assess the hearing status of armored tank soldiers.
   VI: nghi√™n c·ª©u m√¥ t·∫£ c·∫Øt ngang ƒë∆∞·ª£c th·ª±c hi·ªán ƒë·ªÉ ƒë√°nh gi√° t√¨nh tr·∫°ng nghe c·ªßa c√°c binh sƒ© xe tƒÉng b·ªçc th√©p.

2. EN: Pherochromocytomas is a rare disease in children with the estimated incidence is about 1 per 50.000 to 100.000 children.
   VI: pherochromocytomas l√† m·ªôt b·ªánh hi·∫øm g·∫∑p ·ªü tr·∫ª em c√≥ t·ª∑ l·ªá m·∫Øc ∆∞·ªõc t√≠nh kho·∫£ng 1/50.000 ƒë·∫øn 100.000 tr·∫ª em.

3. EN: The most common lesions are: frosted glass (91.5%), solidified (22.6%), interstitial thickening (14.2%).
   VI: c√°c t·ªïn th∆∞∆°ng ph·ªï bi·∫øn nh·∫•t l√†: k√≠nh m·ªù (91,5%), ƒë·∫∑c (22,6%), d√†y k·∫

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Step 19,518/38,032 | Loss: 2.7864 | Avg: 2.9262 | LR: 2.70e-05
  Step 20,018/38,032 | Loss: 2.8738 | Avg: 2.9292 | LR: 2.63e-05
  Step 20,518/38,032 | Loss: 3.0195 | Avg: 2.9274 | LR: 2.56e-05
  Step 21,018/38,032 | Loss: 3.0516 | Avg: 2.9250 | LR: 2.49e-05
  Step 21,518/38,032 | Loss: 3.3192 | Avg: 2.9229 | LR: 2.41e-05
  Step 22,018/38,032 | Loss: 2.9940 | Avg: 2.9218 | LR: 2.34e-05
  Step 22,518/38,032 | Loss: 3.0014 | Avg: 2.9213 | LR: 2.27e-05
  Step 23,018/38,032 | Loss: 2.7092 | Avg: 2.9202 | LR: 2.19e-05
  Step 23,518/38,032 | Loss: 3.1108 | Avg: 2.9188 | LR: 2.12e-05
  Step 24,018/38,032 | Loss: 3.0306 | Avg: 2.9166 | LR: 2.05e-05
  Step 24,518/38,032 | Loss: 3.2113 | Avg: 2.9140 | LR: 1.97e-05
  Step 25,018/38,032 | Loss: 2.9824 | Avg: 2.9126 | LR: 1.90e-05
  Step 25,518/38,032 | Loss: 2.9286 | Avg: 2.9116 | LR: 1.83e-05
  Step 26,018/38,032 | Loss: 2.9063 | Avg: 2.9099 | LR: 1.75e-05
  Step 26,518/38,032 | Loss: 2.8366 | Avg: 2.9092 | LR: 1.68e-05
  Step 27,018/38,032 | Lo

Evaluating:   0%|          | 0/504 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 504/504 [00:17<00:00, 29.10it/s]


Evaluating BLEU score...



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



EPOCH 3 RESULTS
Train Loss:     2.9048
Val Loss:       2.7586
Medical BLEU:   43.20
Improvement:    +14.60 points from baseline
Learning Rate:  1.39e-05
Progress:       100.0% toward target BLEU 38
Status:         üéâ Excellent!

Sample Medical Translations:
1. EN: A cross-sectional descriptive study was performed to assess the hearing status of armored tank soldiers.
   VI: nghi√™n c·ª©u m√¥ t·∫£ c·∫Øt ngang ƒë∆∞·ª£c th·ª±c hi·ªán nh·∫±m ƒë√°nh gi√° t√¨nh tr·∫°ng nghe c·ªßa c√°c binh sƒ© xe tƒÉng.

2. EN: Pherochromocytomas is a rare disease in children with the estimated incidence is about 1 per 50.000 to 100.000 children.
   VI: b·ªánh nh√¢n c√≥ t·ª∑ l·ªá m·∫Øc b·ªánh h·ªìng c·∫ßu l√† 1/50.000 ƒë·∫øn 100.000 tr·∫ª em.

3. EN: The most common lesions are: frosted glass (91.5%), solidified (22.6%), interstitial thickening (14.2%).
   VI: c√°c t·ªïn th∆∞∆°ng ph·ªï bi·∫øn nh·∫•t l√†: k√≠nh m·ªù (91,5%), ƒë·∫∑c (22,6%), d√†y k·∫Ω (14,2%).

‚úì Saved best model (BLEU: 43.20, +0.72)

EPO

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Step 29,027/38,032 | Loss: 2.9173 | Avg: 2.8605 | LR: 1.32e-05
  Step 29,527/38,032 | Loss: 2.9808 | Avg: 2.8596 | LR: 1.24e-05
  Step 30,027/38,032 | Loss: 2.7277 | Avg: 2.8621 | LR: 1.17e-05
  Step 30,527/38,032 | Loss: 3.0465 | Avg: 2.8627 | LR: 1.10e-05
  Step 31,027/38,032 | Loss: 2.5584 | Avg: 2.8626 | LR: 1.02e-05
  Step 31,527/38,032 | Loss: 2.8380 | Avg: 2.8636 | LR: 9.50e-06
  Step 32,027/38,032 | Loss: 2.6524 | Avg: 2.8630 | LR: 8.77e-06
  Step 32,527/38,032 | Loss: 2.5356 | Avg: 2.8638 | LR: 8.04e-06
  Step 33,027/38,032 | Loss: 2.9771 | Avg: 2.8636 | LR: 7.31e-06
  Step 33,527/38,032 | Loss: 3.0038 | Avg: 2.8632 | LR: 6.58e-06
  Step 34,027/38,032 | Loss: 2.8095 | Avg: 2.8631 | LR: 5.85e-06
  Step 34,527/38,032 | Loss: 2.8809 | Avg: 2.8622 | LR: 5.12e-06
  Step 35,027/38,032 | Loss: 2.9554 | Avg: 2.8618 | LR: 4.39e-06
  Step 35,527/38,032 | Loss: 3.0201 | Avg: 2.8620 | LR: 3.66e-06
  Step 36,027/38,032 | Loss: 2.8054 | Avg: 2.8614 | LR: 2.93e-06
  Step 36,527/38,032 | Lo

Evaluating:   0%|          | 0/504 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 504/504 [00:17<00:00, 28.96it/s]


Evaluating BLEU score...



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



EPOCH 4 RESULTS
Train Loss:     2.8593
Val Loss:       2.7439
Medical BLEU:   43.44
Improvement:    +14.84 points from baseline
Learning Rate:  0.00e+00
Progress:       100.0% toward target BLEU 38
Status:         üéâ Excellent!

Sample Medical Translations:
1. EN: A cross-sectional descriptive study was performed to assess the hearing status of armored tank soldiers.
   VI: nghi√™n c·ª©u m√¥ t·∫£ c·∫Øt ngang ƒë∆∞·ª£c th·ª±c hi·ªán nh·∫±m ƒë√°nh gi√° t√¨nh tr·∫°ng nghe c·ªßa c√°c binh sƒ© xe tƒÉng.

2. EN: Pherochromocytomas is a rare disease in children with the estimated incidence is about 1 per 50.000 to 100.000 children.
   VI: pherochromocytomas l√† m·ªôt b·ªánh hi·∫øm g·∫∑p ·ªü tr·∫ª em c√≥ t·ª∑ l·ªá m·∫Øc ∆∞·ªõc t√≠nh kho·∫£ng 1/50.000 ƒë·∫øn 100.000 tr·∫ª em.

3. EN: The most common lesions are: frosted glass (91.5%), solidified (22.6%), interstitial thickening (14.2%).
   VI: c√°c t·ªïn th∆∞∆°ng ph·ªï bi·∫øn nh·∫•t l√†: k√≠nh m·ªù (91,5%), ƒë·∫∑c (22,6%), d√†y k·∫Ω (14,2%).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Final Medical BLEU:    43.44
Pre-trained BLEU:      28.60
Improvement:           +14.84 points
General BLEU target:   38.0

üéâ SUCCESS! Matched/exceeded general performance!


In [25]:
import random

def get_random_test_examples(dataset, n=50):
    indices = random.sample(range(len(dataset['test'])), n)
    return [dataset['test'][i]['en'] for i in indices]

# Example: Sample 5 random English sentences
medical_examples = get_random_test_examples(dataset, n=50)


# Sample translations
print("Sample Medical Translations:")
for i, s in enumerate(medical_examples, 1):
    translation = translate_sentence(model, tokenizer, s, device)
    print(f"{i}. EN: {s}")
    print(f"   VI: {translation}\n")


Sample Medical Translations:
1. EN: Treatment of foot drop after spinal anesthesia with intravenous lipid 20% emulsion: A case report
   VI: ƒëi·ªÅu tr·ªã gi·∫£m b√†n ch√¢n sau g√¢y t√™ tu·ª∑ s·ªëng b·∫±ng thu·ªëc m·ª° tƒ©nh m·∫°ch 20% em nh≈© t∆∞∆°ng: b√°o c√°o ca b·ªánh

2. EN: Setting up the suitable solutions in the reality made the active changes, hygiene and safe food conditions changed following to 10 standards of food sold on the street sides.
   VI: x√¢y d·ª±ng c√°c gi·∫£i ph√°p th√≠ch h·ª£p trong th·ª±c t·∫ø ƒë√£ l√†m thay ƒë·ªïi ho·∫°t ƒë·ªông, v·ªá sinh v√† ƒëi·ªÅu ki·ªán th·ª±c ph·∫©m an to√†n thay ƒë·ªïi theo 10 ti√™u chu·∫©n th·ª±c ph·∫©m ƒë∆∞·ª£c b√°n ·ªü c√°c b√™n ƒë∆∞·ªùng ph·ªë.

3. EN: Lorcaserin (not available in the US) suppresses appetite via selective agonism of serotonin 2C (5-HT2C) brain receptors.
   VI: lorcaserin (kh√¥ng c√≥ ·ªü m·ªπ) ·ª©c ch·∫ø s·ª± th√®m ƒÉn th√¥ng qua s·ª± ch·ªß v·∫≠n ch·ªçn l·ªçc c·ªßa th·ª• th·ªÉ n√£o serotonin 2c (5-ht 2c).

4. EN: Th

In [29]:
translation = translate_sentence(model, tokenizer, "Even with Dellacqua's retirement in early 2018, Barty has also continued to improve at doubles, having since won the five biggest doubles titles of her career, including her first Grand Slam title at the 2018 US Open.", device)
print(translation)

ngay c·∫£ khi ngh·ªâ h∆∞u v√†o ƒë·∫ßu nƒÉm 2018, barty c≈©ng ƒë√£ ti·∫øp t·ª•c c·∫£i thi·ªán ·ªü hai nh√≥m, t·ª´ ƒë√≥ ƒë√£ c√≥ 5 l·∫ßn ƒë·∫ßu ti√™n ƒë√°nh gi√° v·ªÅ s·ª± nghi·ªáp c·ªßa m√¨nh, bao g·ªìm c·∫£ ti√™u chu·∫©n grand) ƒë·∫ßu ti√™n c·ªßa c√¥ t·∫°i m·ªπ m·ªü.
