# PATH CONFIG

In [1]:
import os

print(os.getcwd())
if not os.getcwd().endswith("app"):
    os.chdir("../app")
    print(os.getcwd())

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

%load_ext autoreload
%autoreload 2
# %matplotlib inline

/home/turbotowerlnx/Documents/Master/TA/TA-Spanish-Esperanto-Translator/notebooks
/home/turbotowerlnx/Documents/Master/TA/TA-Spanish-Esperanto-Translator/app


In [2]:
from src.config import Configuration

CONFIG = Configuration(
    model_name="facebook/nllb-200-distilled-600M",
    src_code = "spa_Latn",
    tgt_code = "epo_Latn"
)

# Dataset

In [3]:
df_corpus_clean = pd.read_csv(CONFIG.corpus_path)
df_corpus_clean.rename(columns={
    CONFIG.src_name: CONFIG.src_code, 
    CONFIG.tgt_name: CONFIG.tgt_code
}, inplace=True)

# Shuffle the dataframe first
df_shuffled = df_corpus_clean.sample(frac=1, random_state=42).reset_index(drop=True)

# Calculate split indices
n_total = len(df_shuffled)
n_test = int(n_total * CONFIG.test_split)
n_val = int(n_total * CONFIG.val_split)

# Split the data
df_test = df_shuffled[:n_test].reset_index(drop=True)
df_val = df_shuffled[n_test:n_test + n_val].reset_index(drop=True)
df_train = df_shuffled[n_test + n_val:].reset_index(drop=True)

print(f"Dataset sizes:")
print(f"  Train: {len(df_train)} ({len(df_train)/n_total*100:.1f}%)")
print(f"  Val:   {len(df_val)} ({len(df_val)/n_total*100:.1f}%)")
print(f"  Test:  {len(df_test)} ({len(df_test)/n_total*100:.1f}%)")
print(f"  Total: {n_total}")

Dataset sizes:
  Train: 4019270 (70.0%)
  Val:   861272 (15.0%)
  Test:  861272 (15.0%)
  Total: 5741814


In [4]:
from transformers import AutoTokenizer

from src.data import TranslationDataset


tokenizer = AutoTokenizer.from_pretrained(
    CONFIG.model_name, 
    padding=True, 
    pad_to_multiple_of=8, 
    src_lang=CONFIG.src_code, 
    tgt_lang=CONFIG.tgt_code, 
    truncation=True, 
    max_length=CONFIG.max_tok_length,
)

dataloader_train = TranslationDataset(
    df_train,
    tokenizer,
    CONFIG.src_code,
    CONFIG.tgt_code,
    CONFIG.max_tok_length,
)
dataloader_val = TranslationDataset(
    df_val,
    tokenizer,
    CONFIG.src_code,
    CONFIG.tgt_code,
    CONFIG.max_tok_length,
)
dataloader_test = TranslationDataset(
    df_test,
    tokenizer,
    CONFIG.src_code,
    CONFIG.tgt_code,
    CONFIG.max_tok_length,
)

  from .autonotebook import tqdm as notebook_tqdm


# Load transformer

In [5]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForSeq2SeqLM

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


model = AutoModelForSeq2SeqLM.from_pretrained(
    CONFIG.model_name,
    quantization_config=quantization_config
)


# Evaluation

In [6]:
from evaluate import load

metric_bleu = load("sacrebleu")
metric_comet = load("comet")

  from pkg_resources import DistributionNotFound, get_distribution
Fetching 5 files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:00<00:00, 96642.95it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
Encoder model frozen.
/home/turbotowerlnx/Documents/Master/TA/TA-Spanish-Esperanto-Translator/venv/lib/python3.12/site-packages/pytorch_lightni

In [7]:
import numpy as np
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels, sources = eval_preds

    # Convert to lists if coming from a datasets.Column
    if not isinstance(labels, list):
        labels = list(labels)
        
    if isinstance(preds, tuple):
        preds = preds[0]
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace negative ids in the labels as we can't decode them.
    labels = [
        [tokenizer.pad_token_id if j < 0 else j for j in label]
        for label in labels
    ]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Decode sources
    decoded_sources = tokenizer.batch_decode(sources, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result_blue = metric_bleu.compute(
        predictions=decoded_preds, 
        references=decoded_labels
    )
    result_comet = metric_comet.compute(
        sources=decoded_sources,
        predictions=decoded_preds, 
        references=[label[0] for label in decoded_labels]  # COMET expects flat list, not nested
    )
    result = {
        "bleu": result_blue["score"],
        "comet": result_comet["mean_score"]
    }

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Inference

In [8]:
from transformers import GenerationConfig

generation_config = GenerationConfig.from_pretrained(
    CONFIG.model_name,
)

print(generation_config)

GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "eos_token_id": 2,
  "max_length": 200,
  "pad_token_id": 1
}



In [9]:
from torch.utils.data import DataLoader

test_batch_size = 32
test_loader = DataLoader(dataloader_test, batch_size=test_batch_size, shuffle=False)

In [10]:
output_sequences = []
all_labels = []
all_sources = []

for i, batch in enumerate(test_loader):
    # Store source input_ids for later decoding
    all_sources.extend(batch['input_ids'].cpu())
    
    # Generate translations
    with torch.no_grad():    
        output_batch = model.generate(
            generation_config=generation_config, 
            input_ids=batch['input_ids'].cuda(), 
            attention_mask=batch['attention_mask'].cuda(), 
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(CONFIG.tgt_code), 
            max_length=CONFIG.max_tok_length, 
            num_beams=1, 
            do_sample=False,
        )
    output_sequences.extend(output_batch.cpu())
    all_labels.extend(batch['labels'].cpu())
    
    if (i + 1) % 10 == 0:
        print(f"Processed {i + 1}/{len(test_loader)} batches")
    if i >= CONFIG.max_batches:
        break



Processed 10/26915 batches
Processed 20/26915 batches
Processed 20/26915 batches
Processed 30/26915 batches
Processed 30/26915 batches
Processed 40/26915 batches
Processed 40/26915 batches
Processed 50/26915 batches
Processed 50/26915 batches
Processed 60/26915 batches
Processed 60/26915 batches
Processed 70/26915 batches
Processed 70/26915 batches
Processed 80/26915 batches
Processed 80/26915 batches
Processed 90/26915 batches
Processed 90/26915 batches
Processed 100/26915 batches
Processed 100/26915 batches
Processed 110/26915 batches
Processed 110/26915 batches
Processed 120/26915 batches
Processed 120/26915 batches
Processed 130/26915 batches
Processed 130/26915 batches
Processed 140/26915 batches
Processed 140/26915 batches
Processed 150/26915 batches
Processed 150/26915 batches
Processed 160/26915 batches
Processed 160/26915 batches
Processed 170/26915 batches
Processed 170/26915 batches
Processed 180/26915 batches
Processed 180/26915 batches
Processed 190/26915 batches
Processed

In [11]:
# Compute metrics
result = compute_metrics((output_sequences, all_labels, all_sources))
print(f'BLEU score: {result["bleu"]}')
print(f'COMET score: {result["comet"]}')

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
  return _C._get_float32_matmul_precision()
You are using a CUDA device ('NVIDIA GeForce RTX 5070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
  return _C._get_float32_matmul_precision()
You are using a CUDA device ('NVIDIA GeForce RTX 5070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32

BLEU score: 19.4891
COMET score: 0.7691
