In [1]:
pip install transformers torch sentencepiece


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [2]:
pip install -U transformers


Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed transformers-4.52.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from datasets import Dataset

dataset = Dataset.from_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv') 

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [4]:
from datasets import DatasetDict

##splitting dataset in test and train sets
split = dataset.train_test_split(test_size=0.2)

dataset = DatasetDict({
    'train': split['train'],
    'validation': split['test']
})


dataset['train'] = dataset['train'].select(range(min(200000, len(dataset['train']))))
dataset['validation'] = dataset['validation'].select(range(min(100000, len(dataset['validation']))))

In [5]:
from transformers import MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = MarianTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    ##Ensuring inputs are string and handling the non values
    inputs = [str(text) if text is not None else "" for text in examples['en']]
    targets = [str(text) if text is not None else "" for text in examples['fr']]
    
    ##Tokenizing the inputs
    model_inputs = tokenizer(
        inputs, 
        max_length=64, 
        truncation=True, 
        padding='max_length',
        return_tensors=None
    )
    
    ## Tokenizing the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=64, 
            truncation=True, 
            padding='max_length',
            return_tensors=None  
        )
    
    ## Replacing padding token with -100 in labels
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] 
        for label in labels["input_ids"]
    ]
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

## Preprocessing 2
print("Re-processing dataset -.-.-.")
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

## Verifing data format
print("Sample processed data:")
print(tokenized_datasets['train'][0])
print("Keys:", tokenized_datasets['train'].column_names)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



Re-processing dataset -.-.-.


Map:   0%|          | 0/200000 [00:00<?, ? examples/s]



Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Sample processed data:
{'input_ids': [1671, 3, 60, 6274, 226, 23740, 6, 9, 748, 32, 5523, 30, 35508, 33342, 442, 18, 12415, 10, 1057, 29879, 3509, 48, 276, 48, 18, 31433, 7068, 10, 19, 4096, 27039, 3, 0, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [6929, 3670, 95, 37079, 36, 14, 6, 1078, 13, 26219, 30050, 2651, 31, 19, 252, 22, 3509, 22, 1250, 11, 5, 8, 33107, 2, 11, 36, 16, 14, 7131, 14270, 3894, 11, 8, 19, 5229, 15631, 3, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
Keys: ['input_id

In [6]:
from transformers import MarianMTModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = MarianMTModel.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,
    num_train_epochs=1,
    max_steps=30,  
    logging_steps=5,  
    save_strategy='no',
    eval_strategy='no',  
    disable_tqdm=False,  
    fp16=True,
    dataloader_num_workers=0,
)


2025-06-21 10:08:22.238986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750500502.809452      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750500502.944167      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

## Manual training 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

## Small dataloader for testing
small_dataset = tokenized_datasets['train'].select(range(50))
train_dataloader = DataLoader(small_dataset, batch_size=8, collate_fn=data_collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

print("=== Manual Training Started ===")
total_steps = 10

with tqdm(total=total_steps, desc="Training") as pbar:
    step_count = 0
    for epoch in range(1):
        epoch_loss = 0
        for batch in train_dataloader:
            if step_count >= total_steps:
                break
                
            ## Moves batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            ## Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            
            ## Backward pass
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            ## Update progress
            step_count += 1
            epoch_loss += loss.item()
            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
            pbar.update(1)
            
            ## Print every few steps
            if step_count % 3 == 0:
                print(f"Step {step_count}/{total_steps} - Loss: {loss.item():.4f}")

print(f"=== Training Completed - Average Loss: {epoch_loss/step_count:.4f} ===")


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

=== Manual Training Started ===



Training:   0%|          | 0/10 [00:00<?, ?it/s][A
Training:   0%|          | 0/10 [00:01<?, ?it/s, loss=1.6320][A
Training:  10%|█         | 1/10 [00:01<00:11,  1.27s/it, loss=1.6320][A
Training:  10%|█         | 1/10 [00:01<00:11,  1.27s/it, loss=1.8307][A
Training:  20%|██        | 2/10 [00:01<00:04,  1.63it/s, loss=1.8307][A
Training:  20%|██        | 2/10 [00:01<00:04,  1.63it/s, loss=2.1278][A
Training:  30%|███       | 3/10 [00:01<00:02,  2.52it/s, loss=2.1278][A
Training:  30%|███       | 3/10 [00:01<00:02,  2.52it/s, loss=2.6585][A
Training:  40%|████      | 4/10 [00:01<00:01,  3.56it/s, loss=2.6585][A

Step 3/10 - Loss: 2.1278



Training:  40%|████      | 4/10 [00:01<00:01,  3.56it/s, loss=1.7777][A
Training:  50%|█████     | 5/10 [00:01<00:01,  4.56it/s, loss=1.7777][A
Training:  50%|█████     | 5/10 [00:01<00:01,  4.56it/s, loss=2.1026][A
Training:  60%|██████    | 6/10 [00:01<00:00,  5.56it/s, loss=2.1026][A
Training:  60%|██████    | 6/10 [00:01<00:00,  5.56it/s, loss=0.9897][A
Training:  70%|███████   | 7/10 [00:01<00:00,  3.50it/s, loss=0.9897][A

Step 6/10 - Loss: 2.1026
=== Training Completed - Average Loss: 1.8741 ===





In [17]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

chencherry = SmoothingFunction()

def compute_bleu(reference, candidate):
    return sentence_bleu([reference], candidate, smoothing_function=chencherry.method4)


inputs = tokenizer('''I didn't expect the big departure. Close the door without coming back. I know, you don't understand. My life, my choices, and the desire to touch the future.
No, I haven't forgotten you, I won't say it.
But when I think about it, my chest dances.
So, forgive me for all those nights when.
I prayed to be left alone.''', return_tensors="pt", padding=True).to(model.device)
translated = model.generate(**inputs)
output = tokenizer.decode(translated[0], skip_special_tokens=True)
reference = '''Je ne m' attendais pas à un grand départ.
Fermez la porte sans revenir.
Je sais, vous ne comprenez pas.
Ma vie, mes choix, et le désir de toucher l'avenir.
Non, je ne vous ai pas oublié, je ne le dirai pas.
Mais quand j'y pense, ma poitrine danse.
Alors, pardonne-moi pour tous ces soirs où
J'ai prié d'être laissé seul.'''.split()
candidate = output.split()
bleu = compute_bleu(reference, candidate)
print("BLEU Score:", bleu)
print(output)

BLEU Score: 0.6811146617502504
Je ne m'attendais pas au grand départ. Fermez la porte sans revenir. Je sais, vous ne comprenez pas. Ma vie, mes choix, et le désir de toucher l'avenir. Non, je ne vous ai pas oublié, je ne le dirai pas. Mais quand j'y pense, mes danses de poitrine. Alors, pardonnez-moi pour toutes ces nuits où. J'ai prié d'être seule.
