# Imports

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
import torch
import sacrebleu
from tqdm import tqdm
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import evaluate
import sacrebleu
from transformers.tokenization_utils import AddedToken

  from .autonotebook import tqdm as notebook_tqdm


[2025-05-19 15:31:24,603] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


# Constants

In [2]:
# Validation corpus: Spanish source file and Wayuu reference file.
# TextDataset pairs the two files by (non-blank) line index.
SPANISH_VAL_FILE = 'datasets/dev.es.txt'
WAYUU_VAL_FILE = 'datasets/dev.guc.txt'
# Checkpoint directory handed to AutoModelForSeq2SeqLM.from_pretrained.
MODEL_PATH = 'models/nllb_wayuu_esp_completo_1_3B-V2'
TOKENIZER_PATH = 'models/nllb_wayuu_esp_completo_1_3B-V2' # Same directory as MODEL_PATH; original note says it was not saved (presumably the tokenizer - TODO confirm)
# Language codes passed to get_policy_model (source for the tokenizer, target added as a token).
SRC_LANG = "spa_Latn"
TGT_LANG = "way_Latn"

In [3]:
# Directory that receives the CSV results written at the end of the notebook.
# NOTE(review): the name says "original" while MODEL_PATH points at a
# fine-tuned checkpoint - confirm this is the intended output directory.
EVALUATION_PATH = "nllb_original_evaluation"

In [4]:
# Number of sentence pairs per DataLoader batch (and per generate() call).
BATCH_SIZE = 64

In [5]:
# Generation settings forwarded to evaluate_model and from there to model.generate.
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.8
TOP_P = 0.95

# Utils

In [6]:
class TextDataset(Dataset):
    """Parallel Spanish/Wayuu corpus backed by two UTF-8 text files.

    Every line of each file is stripped and blank lines are dropped; item
    ``idx`` is the pair ``(spanish_line, wayuu_line)`` at that position.

    Args:
        spa_path: path to the Spanish text file (one sentence per line).
        wayuu_path: path to the Wayuu text file (one sentence per line).

    Raises:
        ValueError: if the two files do not contain the same number of
            non-blank lines. Pairs are matched purely by position, so a
            count mismatch would otherwise be silently truncated or fail
            later with an IndexError.
    """

    def __init__(self, spa_path, wayuu_path):
        self.spa_lines = self._read_lines(spa_path)
        self.wayuu_lines = self._read_lines(wayuu_path)

        if len(self.spa_lines) != len(self.wayuu_lines):
            raise ValueError(
                f"Parallel files are not aligned: {spa_path} has "
                f"{len(self.spa_lines)} non-blank lines but {wayuu_path} has "
                f"{len(self.wayuu_lines)}"
            )

    @staticmethod
    def _read_lines(path):
        """Return the stripped, non-blank lines of a UTF-8 text file."""
        with open(path, 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [line for line in stripped if line]

    def __len__(self):
        return len(self.spa_lines)

    def __getitem__(self, idx):
        return self.spa_lines[idx], self.wayuu_lines[idx]

In [7]:
def get_policy_model(model_name, tokenizer_name, src_lang, tgt_lang):
    """Load the seq2seq model and its tokenizer, registering the target language token.

    Args:
        model_name: identifier or path given to ``AutoModelForSeq2SeqLM.from_pretrained``.
        tokenizer_name: identifier or path given to ``AutoTokenizer.from_pretrained``.
        src_lang: source language code the tokenizer is created with.
        tgt_lang: target language code, added to the tokenizer as a special,
            non-normalized token.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    load_options = {"torch_dtype": "auto", "device_map": "auto"}
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **load_options)

    # Only the source language is given to the tokenizer constructor; the
    # target language is registered afterwards as an added token.
    seq2seq_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, src_lang=src_lang)
    target_language_token = AddedToken(tgt_lang, normalized=False, special=True)
    seq2seq_tokenizer.add_tokens(target_language_token)

    return seq2seq_model, seq2seq_tokenizer

In [8]:
def get_rewards_translation(generations, correct_translations):
    """Score each generated translation against its reference with sentence-level BLEU.

    Args:
        generations: hypothesis strings produced by the model.
        correct_translations: reference strings, aligned with ``generations``
            by position.

    Returns:
        list of float: one sacreBLEU sentence score per pair, in the same
        order as the inputs. Scores are on sacreBLEU's 0-100 scale (they are
        NOT normalized to [0, 1]).

    Raises:
        ValueError: if the two inputs do not have the same number of items;
            a plain ``zip`` would silently drop the unmatched tail.
    """
    generations = list(generations)
    correct_translations = list(correct_translations)
    if len(generations) != len(correct_translations):
        raise ValueError(
            f"Got {len(generations)} generations but "
            f"{len(correct_translations)} reference translations"
        )

    # effective_order=True is sacreBLEU's recommended setting for
    # sentence-level scoring of short segments.
    bleu = sacrebleu.BLEU(effective_order=True)

    return [
        bleu.sentence_score(hypothesis, [reference]).score
        for hypothesis, reference in zip(generations, correct_translations)
    ]

In [9]:
def generate_batch_completion(model, tokenizer, prompts: list, return_ids=False, *,
                              tgt_lang="way_Latn", **kwargs):
    """Generate one batch of translations with ``model.generate`` (no vLLM).

    Args:
        model: seq2seq model exposing ``.device`` and ``.generate``.
        tokenizer: tokenizer matching ``model``.
        prompts: source sentences to translate.
        return_ids: when True, return token ids instead of decoded text.
        tgt_lang: language token forced as the first generated token.
            Defaults to ``"way_Latn"``, the value that used to be hard-coded.
        **kwargs: overrides / additions for the default generation arguments
            (``do_sample``, ``max_new_tokens``, ``temperature``, ``top_p``).

    Returns:
        If ``return_ids`` is False: list of decoded strings (special tokens
        skipped), one per generated sequence.
        If ``return_ids`` is True: ``(generation_ids, prompt_length)`` where
        ``generation_ids`` is a tensor whose rows are each prompt's padded
        input ids followed by the corresponding generated ids, and
        ``prompt_length`` is the padded prompt length.
    """
    sampling_args = {
        'do_sample': True, # FIXME not enough memory in local
        'max_new_tokens': 512,
        'temperature': 0.8,
        'top_p': 0.95, # FIXME not enough memory in local
    }
    sampling_args.update(kwargs)

    # NOTE(review): prompts are not truncated; the evaluation run logged a
    # tokenizer warning for a 1566-token input against a 1024-token maximum.
    # Consider truncation=True if over-length inputs should be cut.
    model_inputs = tokenizer(
        prompts,
        padding='longest',
        padding_side='left',
        return_tensors="pt",
    ).to(model.device)
    input_ids = model_inputs.input_ids

    # Forward the tokenizer's padding mask explicitly rather than relying on
    # generate() to infer it, unless the caller already supplied one.
    attention_mask = getattr(model_inputs, "attention_mask", None)
    if attention_mask is not None:
        sampling_args.setdefault("attention_mask", attention_mask)

    outputs = model.generate(
        inputs=input_ids,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        **sampling_args
    )

    if return_ids:
        prompt_rows = input_ids.tolist()
        output_rows = outputs.tolist()
        # Each output row must be paired with ITS OWN prompt (previously every
        # row was prefixed with the first prompt). When several sequences are
        # returned per prompt, consecutive output rows share a prompt.
        per_prompt = max(1, len(output_rows) // len(prompt_rows))
        generation_ids = [
            prompt_rows[row_idx // per_prompt] + row
            for row_idx, row in enumerate(output_rows)
        ]
        # Pad every row to the longest one so they stack into a tensor.
        max_length = max(len(ids) for ids in generation_ids)
        generation_ids = [
            ids + [tokenizer.pad_token_id] * (max_length - len(ids))
            for ids in generation_ids
        ]
        return torch.tensor(generation_ids), len(input_ids[0])

    # generate() returns token ids only, so decode them back to text.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [10]:
def evaluate_model(model, tokenizer, dataloader, temperature, top_p, max_new_tokens):
    """Translate every batch of ``dataloader`` and score it with sentence-level BLEU.

    Args:
        model: seq2seq model passed through to ``generate_batch_completion``.
        tokenizer: tokenizer passed through to ``generate_batch_completion``.
        dataloader: iterable yielding ``(inputs, targets)`` batches of strings.
        temperature: sampling temperature for generation.
        top_p: nucleus-sampling threshold for generation.
        max_new_tokens: maximum number of tokens generated per sample.

    Returns:
        tuple of two DataFrames:
            * per-sample results with columns ``input``, ``target``,
              ``generation`` and ``scores``;
            * a single-row frame with column ``avg_bleu_score``: the
              arithmetic mean of the per-sample sentence scores (this is not
              a corpus-level BLEU). It is NaN when the dataloader yields no
              samples.
    """
    prompts = []
    golds = []
    generations = []
    bleu_scores = []

    for inputs, targets in tqdm(dataloader):
        batch_generations = generate_batch_completion(
            model,
            tokenizer,
            inputs,
            return_ids=False,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens
        )
        batch_scores = get_rewards_translation(batch_generations, targets)

        prompts.extend(inputs)
        golds.extend(targets)
        generations.extend(batch_generations)
        bleu_scores.extend(batch_scores)

    # An empty dataloader used to raise ZeroDivisionError here.
    if bleu_scores:
        avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    else:
        avg_bleu_score = float("nan")

    df_results = pd.DataFrame({
        "input": prompts,
        "target": golds,
        "generation": generations,
        "scores": bleu_scores
    })
    df_avg_bleu_score = pd.DataFrame({"avg_bleu_score": [avg_bleu_score]})
    return df_results, df_avg_bleu_score

# Evaluation

## Load dataset

In [11]:
# Read the Spanish/Wayuu validation pairs into memory.
dataset = TextDataset(SPANISH_VAL_FILE, WAYUU_VAL_FILE)

In [12]:
# shuffle=False keeps results in file order, so rows of the output CSV follow the input files.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

## Load model

In [13]:
# Load the checkpoint and its tokenizer (the target language token is added inside get_policy_model).
model, tokenizer = get_policy_model(MODEL_PATH, TOKENIZER_PATH, SRC_LANG, TGT_LANG)

In [14]:
# Number of batches and the resulting upper bound on samples; the literal 104
# is the batch count shown in this cell's output, hard-coded by hand.
len(dataloader), 104*BATCH_SIZE

(104, 6656)

## Evaluate model

In [15]:
# Generate and score the whole validation set (the recorded run below took about 18 minutes).
df_evaluation, df_avg_bleu_score = evaluate_model(model, tokenizer, dataloader, TEMPERATURE, TOP_P, MAX_NEW_TOKENS)

 18%|███████████████████████████████                                                                                                                                           | 19/104 [02:14<08:46,  6.20s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1566 > 1024). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [18:42<00:00, 10.79s/it]


In [16]:
# Mean of the per-sentence BLEU scores over the validation set.
df_avg_bleu_score

Unnamed: 0,avg_bleu_score
0,8.031024


In [17]:
# Preview the first rows: source, reference, model output and sentence score.
df_evaluation.head(10)

Unnamed: 0,input,target,generation,scores
0,se me empezaron a quitar las ganas de fumar,nnojoluitpa suchuntaain taa'in akamüjaa,müsüjese'e nnojoluinjatüin kapüleein akumajaa ...,1.379446
1,como deberiamos ver la ley de jehova sobre la ...,jamüsü kojutüinjatka wamüin tü ishakat ma'aka ...,süpüla watüjaain saa'u jamüinjatüin sukumajia ...,3.300809
2,"Con él va Onésimo, paisano de ustedes, no meno...","Chi o'uneechikai nümaa Tíquico, nia wane juwal...","Otta Tíquico, aishije'echi ma'i pia nüpüla. Ot...",5.352302
3,pero noemi estaba decidida a llegar a israel,ayatapaja'a noemi o'unüin israelmüin,o'unüsü shia israelmüin süka süntüinjatüin sün...,1.241494
4,me sentia culpable por no poderles dar a mis h...,anuujese'e sünüiki janet shapaasü ma'in taa'in...,talatirüin toulia tü tamakat namüin na tepichi...,1.727224
5,una cualidad cristiana mas valiosa que los dia...,tü akuwa'ipaa kojutüleekat suuliale'eya wanee ...,tü palajatkat shiyaawase eejatüin sukuwa'ipa j...,4.196115
6,con este tratado denunciaron sin temor a la cr...,jee aküjünüsü tü shiimainkat nachiki na anouja...,otta tü karalouktakat la historia historica de...,4.266332
7,los enemigos de dios llevan dos mil a os hacie...,so'u yaajachin jesuu yaa mmapa'a nachajaain ma...,kakaliashaatasü ma'in naa'in na nü'ünüükana ma...,13.973537
8,con razon jesucristo pregunto quien de ustedes...,shiimainya ma'in tü nümakat jesuu wanaa sümaa ...,wanaa sümaa naapüin je'waa tü kataakalü o'u sü...,12.049515
9,mi huerta,ta apainse,müsia tü eekai anain süpülajatü tü eekai anain...,0.0


In [18]:
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs(EVALUATION_PATH, exist_ok=True)
df_evaluation.to_csv(f"{EVALUATION_PATH}/bleu_scores.csv", index=False)

In [19]:
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs(EVALUATION_PATH, exist_ok=True)
df_avg_bleu_score.to_csv(f"{EVALUATION_PATH}/avg_bleu_score.csv", index=False)