In [1]:
pip install peft

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
# 1 Setup
import sys
import os

# Make modules in the notebook's working directory importable
# (engfra_bad_gpt_model is imported from there below).
sys.path.append(os.path.abspath("."))

import comet
from tqdm import tqdm
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch
import torch.nn.functional as F
import math
# Bind pyplot, not the top-level package: `import matplotlib as plt` exposes no
# plotting functions such as plt.plot / plt.figure / plt.subplots.
import matplotlib.pyplot as plt

# NOTE(review): the recorded cell output shows dataset loading and a training
# run at this point, which suggests importing this module (or calling
# load_finetuned_model) triggers fine-tuning — confirm before re-running.
from engfra_bad_gpt_model import generate_translation, load_finetuned_model

# Fine-tuned model plus the tokenizer saved alongside the LoRA checkpoint.
model = load_finetuned_model()
tokenizer = AutoTokenizer.from_pretrained("./finetuned_lora_model_engfra_low")

PyTorch version 2.6.0+cu126 available.


Loading dataset...
LoRA applied modules and trainable parameters:
trainable params: 3,145,728 || all params: 562,360,320 || trainable%: 0.5594

LoRA layers applied:
base_model.model.transformer.h.0.self_attention.query_key_value.lora_dropout -> ModuleDict
base_model.model.transformer.h.0.self_attention.query_key_value.lora_dropout.default -> Dropout
base_model.model.transformer.h.0.self_attention.query_key_value.lora_A -> ModuleDict
base_model.model.transformer.h.0.self_attention.query_key_value.lora_A.default -> Linear
base_model.model.transformer.h.0.self_attention.query_key_value.lora_B -> ModuleDict
base_model.model.transformer.h.0.self_attention.query_key_value.lora_B.default -> Linear
base_model.model.transformer.h.0.self_attention.query_key_value.lora_embedding_A -> ParameterDict
base_model.model.transformer.h.0.self_attention.query_key_value.lora_embedding_B -> ParameterDict
base_model.model.transformer.h.0.self_attention.query_key_value.lora_magnitude_vector -> ModuleDict
base

Map:   0%|          | 0/2009 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss,Validation Loss
20,1.5997,1.558584
40,1.5539,1.460701
60,1.3689,1.45308
80,1.5086,1.434866
100,1.397,1.430604
120,1.3595,1.421253
140,1.4444,1.416487
160,1.4767,1.413476
180,1.4549,1.409141
200,1.335,1.407048


Model saved!
Training log updated.


Based on the current allocation process, no modules could be assigned to the following devices due to insufficient memory:
  - 0: 1077989376 bytes required
These minimum requirements are specific to this allocation attempt and may vary. Consider increasing the available memory for these devices to at least the specified minimum, or adjusting the model config.
Based on the current allocation process, no modules could be assigned to the following devices due to insufficient memory:
  - 0: 1078513664 bytes required
These minimum requirements are specific to this allocation attempt and may vary. Consider increasing the available memory for these devices to at least the specified minimum, or adjusting the model config.


In [None]:
# 2 Data
# FLORES-200, "all" configuration, loaded from the refs/pr/7 revision.
# NOTE(review): trust_remote_code=True runs the dataset's own loading script;
# confirm the referenced revision is the one you intend to execute.
dataset = load_dataset("Muennighoff/flores200", "all", revision="refs/pr/7", trust_remote_code=True)
dev_set = dataset["dev"]

# Both directions need exactly the same rows: examples whose French AND English
# sentences are non-empty. The predicate is symmetric, so filter the dev split
# once and share the result instead of scanning it twice.
parallel_dev_set = dev_set.filter(lambda x: x["sentence_fra_Latn"] and x["sentence_eng_Latn"])
lang_pairs = {
    "fra-eng": parallel_dev_set,
    "eng-fra": parallel_dev_set,
}

In [None]:
# 3 BLEU and METEOR
import sacrebleu

def compute_bleu(predictions, references):
    """Compute one sentence-level BLEU score per prediction with sacreBLEU.

    Args:
        predictions: A hypothesis string, or a sequence of hypothesis strings.
        references: A reference string, a sequence of reference strings (one
            per prediction), or a sequence of reference lists (several
            references per prediction).

    Returns:
        list: ``sacrebleu.sentence_bleu(pred, refs).score`` for every
        (prediction, references) pair, in input order. Pairs are formed with
        ``zip``, so surplus items on the longer side are ignored.
    """
    if isinstance(predictions, str):
        predictions = [predictions]
    # A bare reference string must be wrapped first; otherwise it is iterated
    # character by character and the prediction is scored against one letter.
    if isinstance(references, str):
        references = [references]

    # sentence_bleu takes a list of references per hypothesis, so wrap flat
    # strings. Inspecting each item (rather than references[0]) also means an
    # empty input yields [] instead of raising IndexError.
    references = [[ref] if isinstance(ref, str) else ref for ref in references]

    return [
        sacrebleu.sentence_bleu(pred, refs).score
        for pred, refs in zip(predictions, references)
    ]

from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer models used by word_tokenize.
nltk.download("punkt")
nltk.download("punkt_tab")
# meteor_score consults WordNet for its synonym-matching stage; without the
# corpus a fresh environment raises LookupError once that stage is reached.
nltk.download("wordnet")
# NOTE(review): some NLTK releases also require omw-1.4 to load WordNet —
# confirm whether it is still needed for the installed version.
nltk.download("omw-1.4")

def compute_meteor(predictions, references, language="english"):
    """Compute one METEOR score per (prediction, reference) pair with NLTK.

    Args:
        predictions: A hypothesis string, or a sequence of hypothesis strings.
        references: A reference string, or a sequence of reference strings
            (one per prediction).
        language: Language name forwarded to ``word_tokenize`` for both the
            reference and the prediction. Defaults to ``"english"``, which is
            what ``word_tokenize`` uses when no language is given, so existing
            two-argument calls behave as before. Pass e.g. ``"french"`` when
            scoring French output.

    Returns:
        list: One ``meteor_score`` value per pair, in input order. Pairs are
        formed with ``zip``, so surplus items on the longer side are ignored.
    """
    if isinstance(predictions, str):
        predictions = [predictions]
    if isinstance(references, str):
        references = [references]

    return [
        # meteor_score takes a list of tokenized references and one tokenized
        # hypothesis; each prediction has exactly one reference here.
        meteor_score(
            [word_tokenize(ref, language=language)],
            word_tokenize(pred, language=language),
        )
        for pred, ref in zip(predictions, references)
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gerri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gerri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# 4 COMET
from comet import download_model, load_from_checkpoint

# Load each COMET checkpoint at most once per kernel session: the guards make
# re-running this cell reuse the models already in memory. Loading
# unconditionally and checking globals() afterwards would make the check
# unreachable, because the names would always exist by then.

# Reference-based COMET
if "comet_ref_model" not in globals():
    comet_ref_model_path = download_model("Unbabel/wmt22-comet-da")
    comet_ref_model = load_from_checkpoint(comet_ref_model_path)

# Reference-free COMET
if "cometkiwi_model" not in globals():
    cometkiwi_model_path = download_model("Unbabel/wmt22-cometkiwi-da")
    cometkiwi_model = load_from_checkpoint(cometkiwi_model_path)

# Compute COMET scores
def compute_comet_ref(srcs, mts, refs):
    """Score translations with the reference-based COMET model.

    Args:
        srcs: Source sentences.
        mts: Machine translations, aligned with ``srcs``.
        refs: Reference translations, aligned with ``srcs``.

    Returns:
        The ``scores`` attribute of ``comet_ref_model.predict(...)``. If
        anything raises, the error is printed and a list of NaN with
        ``len(srcs)`` entries is returned instead.
    """
    try:
        samples = []
        for src_text, mt_text, ref_text in zip(srcs, mts, refs):
            samples.append({"src": src_text, "mt": mt_text, "ref": ref_text})

        predict = comet_ref_model.predict
        # Request one GPU when CUDA is available, otherwise run on CPU.
        n_gpus = 1 if torch.cuda.is_available() else 0
        prediction = predict(samples, gpus=n_gpus)
        return prediction.scores
    except Exception as e:
        print(f"[COMET-REF ERROR] {e}")
        return [float("nan")] * len(srcs)

def compute_cometkiwi(srcs, mts):
    """Score translations with the reference-free COMET model.

    Args:
        srcs: Source sentences.
        mts: Machine translations, aligned with ``srcs``.

    Returns:
        The ``scores`` attribute of ``cometkiwi_model.predict(...)``. If
        anything raises, the error is printed and a list of NaN with
        ``len(srcs)`` entries is returned instead.
    """
    try:
        samples = []
        for src_text, mt_text in zip(srcs, mts):
            samples.append({"src": src_text, "mt": mt_text})

        predict = cometkiwi_model.predict
        # Request one GPU when CUDA is available, otherwise run on CPU.
        n_gpus = 1 if torch.cuda.is_available() else 0
        prediction = predict(samples, gpus=n_gpus)
        return prediction.scores
    except Exception as e:
        print(f"[COMET-KIWI ERROR] {e}")
        return [float("nan")] * len(srcs)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\gerri\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
Encoder model frozen.
C:\Users\gerri\AppData\Roaming\Python\Python312\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\gerri\.cache\huggingface\hub\models--Unbabel--wmt20-comet-qe-da\snapshots\2e7ffc84fb67d99cf92506611766463bb9230cfb\checkpoints\model.ckpt`
Encoder model frozen.


In [None]:
# 5 Results + getting translations
def get_results_batched(examples, source_field, target_field, prompt_template, direction, results_list):
    """Translate ``examples`` and append one scored record per example.

    For each decoding strategy (currently only ``"greedy"``) this builds one
    prompt per example, calls ``generate_translation`` once per prompt, scores
    the outputs with BLEU, METEOR and both COMET models, and appends a dict per
    example to ``results_list``.

    Args:
        examples: Iterable of mappings holding the source and reference text.
        source_field: Key of the source sentence in each example.
        target_field: Key of the reference translation in each example.
        prompt_template: Format string with a ``{source}`` placeholder.
        direction: Language-pair label; accepted but not read by this function.
        results_list: List mutated in place; receives the result dicts.

    Returns:
        None. Results are delivered through ``results_list``.
    """
    nan = float("nan")

    for strategy in ("greedy",):
        print(f"\n[Strategy: {strategy}]")

        # One pass over the examples: source text, reference and filled prompt.
        sources, references, prompts = [], [], []
        for example in examples:
            src_text = example[source_field]
            ref_text = example[target_field]
            sources.append(src_text)
            references.append(ref_text)
            prompts.append(prompt_template.format(source=src_text))

        # Translate prompt by prompt. A failure is logged and recorded as an
        # empty translation with NaN statistics so all lists stay aligned.
        translations, log_probs, perplexities = [], [], []
        for prompt in tqdm(prompts, desc=f"Translating ({strategy})"):
            try:
                text, log_prob, ppl = generate_translation(prompt, strategy)
            except Exception as e:
                print(f"[ERROR] Strategy {strategy}: {e}")
                text, log_prob, ppl = "", nan, nan
            translations.append(text)
            log_probs.append(log_prob)
            perplexities.append(ppl)

        # Lexical metrics over every row, including failed (empty) translations.
        bleu_scores = compute_bleu(translations, references)
        meteor_scores = compute_meteor(translations, references)

        # COMET only sees rows with a non-blank translation; the remaining rows
        # keep NaN. Scores are written back by original index.
        n_rows = len(translations)
        scorable = [pos for pos, text in enumerate(translations) if text.strip()]
        comet_refs = [nan] * n_rows
        comet_wmt = [nan] * n_rows

        try:
            kept_sources = [sources[pos] for pos in scorable]
            kept_refs = [references[pos] for pos in scorable]
            kept_trans = [translations[pos] for pos in scorable]

            ref_based = compute_comet_ref(kept_sources, kept_trans, kept_refs)
            ref_free = compute_cometkiwi(kept_sources, kept_trans)

            for rank, pos in enumerate(scorable):
                comet_refs[pos] = ref_based[rank]
                comet_wmt[pos] = ref_free[rank]
        except Exception as e:
            print(f"[COMET ERROR] {e}")

        # One record per example, in input order.
        for pos, text in enumerate(translations):
            record = {
                "source": sources[pos],
                "reference": references[pos],
                "strategy": strategy,
                "translation": text,
                "total_log_probs": log_probs[pos],
                "perplexity": perplexities[pos],
                "bleu": bleu_scores[pos],
                "meteor": meteor_scores[pos],
                "comet_ref": comet_refs[pos],
                "comet_wmt22": comet_wmt[pos],
            }
            results_list.append(record)

In [None]:
# Destination results
# Folder (relative to the working directory) that receives the result CSVs;
# creating it is a no-op when it already exists.
output_dir = "csv_results"
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# eng to fra
results_to_fra = []
source_field = "sentence_eng_Latn"
target_field = "sentence_fra_Latn"
# English -> French prompt; {source} is filled in per example by
# get_results_batched.
prompt_en2fr = """Task: Translate the following English text to French.

English text: {source}

French translation:""".strip()
# Backward-compatible alias: the template was previously named "en2zh" although
# it asks for a French translation.
prompt_en2zh = prompt_en2fr

# get_results_batched has no `model` / `tokenizer` parameters (it calls
# generate_translation with only the prompt and the strategy), so passing them
# raises TypeError on a fresh kernel. Only the first 250 examples are scored.
get_results_batched(
    examples=lang_pairs["eng-fra"].select(range(250)),
    source_field=source_field,
    target_field=target_field,
    prompt_template=prompt_en2fr,
    direction="eng-fra",
    results_list=results_to_fra,
)


[Strategy: greedy]


Translating (greedy): 100%|██████████| 250/250 [23:58<00:00,  5.76s/it]
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 16/16 [00:03<00:00,  4.37it/s]
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: Fa

In [None]:
# Store in CSV
# Turn the collected eng->fra records into a table, write it (without the index
# column) into output_dir, then show the first rows.
results_to_fra_df = pd.DataFrame(results_to_fra)
eng_fra_csv_path = os.path.join(output_dir, "bad_dist_eng-fra_bloomz-560_flores200_results.csv")
results_to_fra_df.to_csv(eng_fra_csv_path, index=False)
print("---------------", results_to_fra_df.head(), sep="\n")

---------------
                                              source  \
0  On Monday, scientists from the Stanford Univer...   
1  Lead researchers say this may bring early dete...   
2  The JAS 39C Gripen crashed onto a runway at ar...   
3  The pilot was identified as Squadron Leader Di...   
4  Local media reports an airport fire vehicle ro...   

                                           reference strategy  \
0  Des scientifiques de l’école de médecine de l’...   greedy   
1  Selon les chercheurs principaux, cela pourrait...   greedy   
2  Le JAS 39C Gripen s’est écrasé sur une piste a...   greedy   
3  Le pilote a été identifié comme étant le chef ...   greedy   
4  La presse locale a rapporté qu'un véhicule de ...   greedy   

                                         translation  total_log_probs  \
0  Le lundi, des scientifiques de l'école de méde...       -25.285719   
1  Les chercheurs disent que cela pourrait apport...       -19.560266   
2  Le JAS 39C Gripen a heurté un runw