In [1]:
pip install transformers torch sentencepiece


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install -U transformers


Note: you may need to restart the kernel to use updated packages.


In [48]:
import pandas as pd
from datasets import Dataset

dataset = Dataset.from_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv') 

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [49]:
from datasets import DatasetDict

# Assume dataset is already a DatasetDict with at least 'train'
split = dataset.train_test_split(test_size=0.2)

dataset = DatasetDict({
    'train': split['train'],
    'validation': split['test']
})


dataset['train'] = dataset['train'].select(range(min(200000, len(dataset['train']))))


In [50]:
dataset['validation'] = dataset['validation'].select(range(min(100000, len(dataset['validation']))))

In [51]:
from transformers import MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = MarianTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    # Ensure inputs are strings and handle any None values
    inputs = [str(text) if text is not None else "" for text in examples['en']]
    targets = [str(text) if text is not None else "" for text in examples['fr']]
    
    # Tokenize inputs
    model_inputs = tokenizer(
        inputs, 
        max_length=64, 
        truncation=True, 
        padding='max_length',
        return_tensors=None  # Don't return tensors yet
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=64, 
            truncation=True, 
            padding='max_length',
            return_tensors=None  # Don't return tensors yet
        )
    
    # Replace padding tokens in labels with -100
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] 
        for label in labels["input_ids"]
    ]
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Re-apply preprocessing
print("Re-processing dataset...")
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

# Verify the data format
print("Sample processed data:")
print(tokenized_datasets['train'][0])
print("Keys:", tokenized_datasets['train'].column_names)


Re-processing dataset...


Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Sample processed data:
{'input_ids': [97, 669, 10192, 1034, 545, 176, 33, 58, 48892, 9, 12, 57, 64, 397, 4, 227, 272, 228, 45, 264, 30, 4, 819, 7, 15926, 10, 280, 45, 881, 64, 860, 3038, 766, 0, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [277, 14, 1, 426, 307, 2, 19, 1070, 3825, 221, 29, 153, 16, 14767, 89, 5162, 59, 9271, 89, 17, 8, 952, 1083, 4618, 70, 9141, 36, 19, 7773, 11, 70, 15469, 5, 454, 3038, 422, 1958, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
Keys: ['input_ids', 'attention_ma

In [52]:
from transformers import MarianMTModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = MarianMTModel.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Optimized training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,
    num_train_epochs=1,
    max_steps=30,  # Short but visible training
    logging_steps=5,  # Log every 5 steps
    save_strategy='no',  # Don't save to speed up
    eval_strategy='no',  # Skip evaluation for speed
    disable_tqdm=False,  # Keep progress bars
    fp16=True,
    dataloader_num_workers=0,
)


In [53]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Manual training with guaranteed progress display
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

# Create small dataloader for testing
small_dataset = tokenized_datasets['train'].select(range(50))
train_dataloader = DataLoader(small_dataset, batch_size=8, collate_fn=data_collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

print("=== Manual Training Started ===")
total_steps = 10  # Limit steps for quick testing

with tqdm(total=total_steps, desc="Training") as pbar:
    step_count = 0
    for epoch in range(1):
        epoch_loss = 0
        for batch in train_dataloader:
            if step_count >= total_steps:
                break
                
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            
            # Backward pass
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            # Update progress
            step_count += 1
            epoch_loss += loss.item()
            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
            pbar.update(1)
            
            # Print every few steps
            if step_count % 3 == 0:
                print(f"Step {step_count}/{total_steps} - Loss: {loss.item():.4f}")

print(f"=== Training Completed - Average Loss: {epoch_loss/step_count:.4f} ===")


=== Manual Training Started ===


Training:  40%|████      | 4/10 [00:00<00:00,  8.78it/s, loss=1.1448]

Step 3/10 - Loss: 2.0706


Training:  70%|███████   | 7/10 [00:00<00:00,  9.00it/s, loss=0.9985]

Step 6/10 - Loss: 0.7560
=== Training Completed - Average Loss: 1.6201 ===





In [61]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

chencherry = SmoothingFunction()

def compute_bleu(reference, candidate):
    return sentence_bleu([reference], candidate, smoothing_function=chencherry.method4)

# Example: Generate and evaluate
inputs = tokenizer("My father's name is Jahid Ali  ", return_tensors="pt", padding=True).to(model.device)
translated = model.generate(**inputs)
output = tokenizer.decode(translated[0], skip_special_tokens=True)
reference = "Mon père s'appelle Jahid Ali.".split()
candidate = output.split()
bleu = compute_bleu(reference, candidate)
print("BLEU Score:", bleu)
print(output)

BLEU Score: 1.0
Mon père s'appelle Jahid Ali.
