<a href="https://colab.research.google.com/github/GiacomoT23/Euro-MachineTranslation-SFT-and-RL/blob/main/de_en_qwen_SFT_updated_gh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [None]:
%%capture
# Install the fine-tuning stack. pip itself is upgraded first; --no-input stops pip
# from prompting, and the %%capture magic on the first line silences the output.
# NOTE(review): no versions are pinned, so a re-run may resolve different releases.
!pip install --no-input --upgrade pip
!pip install --no-input unsloth bitsandbytes accelerate peft trl sentencepiece protobuf hf_transfer
!pip install --no-input transformers

# Model download

In [None]:
from unsloth import FastLanguageModel
import torch, os

# --- Run configuration -------------------------------------------------------
MODEL_ID = "unsloth/Qwen2.5-3B"   # base checkpoint to fine-tune
MAX_SEQ  = 256                    # forwarded as max_seq_length
USE_4BIT = True                   # forwarded as load_in_4bit
USE_GC   = "unsloth"              # forwarded as use_gradient_checkpointing

# In 4-bit mode the dtype is left as None; otherwise float16 is requested.
dtype = torch.float16 if not USE_4BIT else None
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_ID,
    max_seq_length=MAX_SEQ,
    dtype=dtype,
    load_in_4bit=USE_4BIT,
)

# LoRA: wrap the base model with rank-16 adapters on the listed projection modules.
# NOTE: the Unsloth log printed by this cell warns that a non-zero lora_dropout
# disables its fast patching of the LoRA matrices.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing=USE_GC,
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

# Count only the parameters that require gradients.
n_params = sum(
    map(lambda p: p.numel(), filter(lambda p: p.requires_grad, model.parameters()))
)
print(f"Model: {MODEL_ID} | 4bit={USE_4BIT} | GC={USE_GC} | seq={MAX_SEQ}")
print(f"Trainable params (LoRA+heads): ~{n_params:,}")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.5: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.9.5 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model: unsloth/Qwen2.5-3B | 4bit=True | GC=unsloth | seq=256
Trainable params (LoRA+heads): ~29,933,568


# Loading splits
Load the preprocessed dataset that was saved on Google Drive.

In [None]:
# === Load the filtered dataset from Google Drive ===
from google.colab import drive
drive.mount("/content/drive")

from datasets import load_from_disk, DatasetDict

SAVE_DIR = "/content/drive/MyDrive/mt_datasets/wmt14_de-en_sample150000__filtered__len169_lid95_r2.25_ck60_trainVAL_filtered_testRAW"
ds_proc = load_from_disk(SAVE_DIR)

# Look each split up with .get(); later cells check validation/test against None.
train_filt, val_filt, test_filt = (
    ds_proc.get(split_name) for split_name in ("train", "validation", "test")
)

# Row count per split, as a quick sanity check.
print({split_name: len(rows) for split_name, rows in ds_proc.items()})

Mounted at /content/drive
{'train': 111966, 'validation': 2780, 'test': 3003}


# Formatting for SFT in unsloth
Each sentence pair is formatted with a system/user/assistant chat template. The model is trained only on the assistant responses.

In [None]:
from datasets import DatasetDict
import unicodedata
from unsloth.chat_templates import get_chat_template

# Re-bind the tokenizer to the version returned by get_chat_template for "qwen-2.5".
tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

# Translation direction: source and target language codes.
SRC = "de"
TGT = "en"

def nfkc(s: str) -> str:
    """Return ``s`` in Unicode NFKC form; a falsy input (``None`` or ``""``) yields ``""``."""
    text = s if s else ""
    return unicodedata.normalize("NFKC", text)

def ensure_src_tgt_cols(dset):
    """Make sure a split exposes flat ``src_txt`` / ``tgt_txt`` columns.

    A split that already has both columns is returned unchanged. A split with a
    ``translation`` column gets the two columns added via ``dset.map``: each
    entry is read with ``.get(SRC, "")`` / ``.get(TGT, "")`` and passed through
    ``nfkc``.

    Raises:
        KeyError: if the split has neither layout.
    """
    available = set(dset.column_names)

    # Already in the flat layout: nothing to do.
    if "src_txt" in available and "tgt_txt" in available:
        return dset

    if "translation" not in available:
        raise KeyError("Split has not src_txt/tgt_txt or translation.")

    def _split_pairs(batch):
        pairs = batch["translation"]
        return {
            "src_txt": [nfkc(pair.get(SRC, "")) for pair in pairs],
            "tgt_txt": [nfkc(pair.get(TGT, "")) for pair in pairs],
        }

    return dset.map(
        _split_pairs,
        batched=True,
        batch_size=4096,
        desc="[normalize] add src_txt/tgt_txt from translation",
    )

# Normalise every available split; validation/test stay None when they are absent.
train_norm = ensure_src_tgt_cols(train_filt)
val_norm   = None if val_filt is None else ensure_src_tgt_cols(val_filt)
test_norm  = None if test_filt is None else ensure_src_tgt_cols(test_filt)

# Display names for the language codes used in the system prompt.
LANG_NAME = {"de": "German", "en": "English"}

def lang_name(code: str) -> str:
    """Map a language code to its display name; unknown codes fall back to the upper-cased code."""
    key = code.lower()
    if key in LANG_NAME:
        return LANG_NAME[key]
    return code.upper()

SYSTEM_TMPL = "You are a translation engine. Translate from {src_name} ({src_code}) to {tgt_name} ({tgt_code})."

def to_text_chatml(batch, src_code=SRC, tgt_code=TGT):
    """Render a batch of sentence pairs into chat-template training strings.

    Args:
        batch: mapping with parallel ``src_txt`` and ``tgt_txt`` lists.
        src_code: source language code inserted into the system prompt.
        tgt_code: target language code inserted into the system prompt.

    Returns:
        ``{"text": [...]}`` with one string per pair. A pair whose source or
        target is empty after stripping yields ``""`` so the caller can filter
        it out afterwards.
    """
    src_list = batch["src_txt"]
    tgt_list = batch["tgt_txt"]
    system_prompt = SYSTEM_TMPL.format(
        src_name=lang_name(src_code), tgt_name=lang_name(tgt_code),
        src_code=src_code, tgt_code=tgt_code,
    )

    def _render(src_raw, tgt_raw):
        src_clean = (src_raw or "").strip()
        tgt_clean = (tgt_raw or "").strip()
        if not (src_clean and tgt_clean):
            return ""
        # add_generation_prompt=False: the assistant turn is already part of the text.
        return tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": src_clean},
                {"role": "assistant", "content": tgt_clean},
            ],
            tokenize=False,
            add_generation_prompt=False,
        )

    return {"text": [_render(s, t) for s, t in zip(src_list, tgt_list)]}

# Render the training split to chat text, then drop rows whose text came back empty.
train_ds = train_norm.map(
    to_text_chatml,
    batched=True,
    remove_columns=train_norm.column_names,
    desc="[chatml] make text (train)",
).filter(lambda ex: len(ex["text"]) > 0)

# Same treatment for validation when that split exists; otherwise keep None.
val_ds = (
    val_norm.map(
        to_text_chatml,
        batched=True,
        remove_columns=val_norm.column_names,
        desc="[chatml] make text (val)",
    ).filter(lambda ex: len(ex["text"]) > 0)
    if val_norm is not None
    else None
)

print("Train/Val sizes:", len(train_ds), (len(val_ds) if val_ds is not None else 0))
# Preview the first two rendered examples.
print("Esempio text:\n", train_ds[0]["text"])
print("Esempio text:\n", train_ds[1]["text"])



[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/111966 [00:00<?, ? examples/s]

[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/2780 [00:00<?, ? examples/s]

[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/3003 [00:00<?, ? examples/s]

[chatml] make text (train):   0%|          | 0/111966 [00:00<?, ? examples/s]

Filter:   0%|          | 0/111966 [00:00<?, ? examples/s]

[chatml] make text (val):   0%|          | 0/2780 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2780 [00:00<?, ? examples/s]

Train/Val sizes: 111966 2780
Esempio text:
 <|im_start|>system
You are a translation engine. Translate from German (de) to English (en).<|im_end|>
<|im_start|>user
Dabei handelt es sich um das Berliner Übereinkommen von 1937 und das Straßburger Übereinkommen von 1973, die beide recht alt, der heutigen Sachlage in Europa nicht mehr angemessen und demnach weitgehend überholt sind.<|im_end|>
<|im_start|>assistant
These are the 1937 Berlin agreement and the 1973 Strasbourg agreement which, because they are quite old, are not relevant to the situation we have in Europe today and have therefore become largely obsolete.<|im_end|>

Esempio text:
 <|im_start|>system
You are a translation engine. Translate from German (de) to English (en).<|im_end|>
<|im_start|>user
Dieser Mix führte wieder zu lebhaften Gesprächen unter den Teilnehmern in den Pausen und während der Abendveranstaltung, die von allen Beteiligten an dieser Veranstaltung in den Gesamtbeurteilungen als sehr informativ empfunden wurde

# SFT
The SFT hyperparameters are set in the cell below.

In [None]:
# === SFT with TRL + W&B (Qwen2.5-3B, QLoRA) — ChatML + train_on_responses_only ===
import os, time, wandb, torch
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# Pad token: fall back to EOS when the tokenizer defines none; pad on the right for training.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Batch geometry. MAX_SEQ, MODEL_ID and USE_4BIT are reused from the model-loading
# cell so the trainer and the W&B config cannot drift from what was actually loaded.
PER_DEV_BS = 128
GRAD_ACCUM = 1

# W&B login (the API key is requested interactively when it is not in the environment)
os.environ["WANDB_PROJECT"]   = "euromt"
os.environ["WANDB_NAME"]      = f"qwen25-3b-sft_chatml_{len(train_ds)}_{int(time.time())}"
os.environ["WANDB_WATCH"]     = "false"
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["WANDB_SILENT"]    = "true"
if not os.getenv("WANDB_API_KEY"):
    from getpass import getpass
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")
wandb.login()
# NOTE(review): the explicit name= below differs from WANDB_NAME set above —
# confirm which run name is intended.
wandb.init(
    project="euromt",
    name="qwen25-3b-wmt14-de-en-sft_chatml",
    config={"model": MODEL_ID, "max_seq_len": MAX_SEQ, "use_4bit": USE_4BIT, "lora_r": 16, "train_rows": len(train_ds)},
    settings=wandb.Settings(start_method="thread"),
)

# Mixed precision: ask Unsloth whether the GPU supports bf16 instead of matching the
# device name against "a100". This keeps the trainer flags consistent with the dtype
# Unsloth picks when the model is loaded with dtype=None.
use_gpu   = torch.cuda.is_available()
bf16_flag = bool(use_gpu and is_bfloat16_supported())
fp16_flag = bool(use_gpu and not bf16_flag)

# Best-model tracking and early stopping both need an evaluation set.
has_eval = val_ds is not None

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    dataset_text_field="text",
    max_length=MAX_SEQ,
    packing=False,
    args=SFTConfig(
        output_dir="outputs_sft",
        report_to=["wandb"],

        per_device_train_batch_size=PER_DEV_BS,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=2,
        max_steps=-1,

        learning_rate=1e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        weight_decay=0.01,
        max_grad_norm=0.6,
        optim="adamw_8bit",

        #group_by_length=True,
        dataloader_num_workers=2,
        dataloader_pin_memory=True,

        logging_steps=20,
        eval_strategy="steps" if has_eval else "no",
        eval_steps=150,
        save_strategy="steps",
        save_steps=150,
        save_total_limit=3,

        metric_for_best_model="eval_loss",
        greater_is_better=False,
        load_best_model_at_end=has_eval,

        seed=3407,
        bf16=bf16_flag,
        fp16=fp16_flag,
    ),
    callbacks=(
        [EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)]
        if has_eval else []
    ),
)

# Loss only on responses: the trainer is re-wrapped with the template markers that
# open the instruction part and the assistant part.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>system\n",
    response_part    = "<|im_start|>assistant\n",
)

print("Train/Val sizes:", len(train_ds), (0 if val_ds is None else len(val_ds)))
print("Precision:", "bf16" if bf16_flag else ("fp16" if fp16_flag else "fp32"))
print("Ready. Training on assistant spans only (Qwen 2.5 ChatML).")

W&B API key: ··········


  | |_| | '_ \/ _` / _` |  _/ -_)


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/111966 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/2780 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/111966 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2780 [00:00<?, ? examples/s]

Train/Val sizes: 111966 2780
Precision: fp16
Ready. Training on assistant spans only (Qwen 2.5 ChatML).


In [None]:
# Launch training. `resume` is forwarded as resume_from_checkpoint; it is False here,
# so no checkpoint is resumed.
resume = False
trainer.train(resume_from_checkpoint=resume)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 111,966 | Num Epochs = 2 | Total steps = 1,750
O^O/ \_/ \    Batch size per device = 128 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (128 x 1 x 1) = 128
 "-____-"     Trainable parameters = 29,933,568 of 3,115,872,256 (0.96% trained)


Step,Training Loss,Validation Loss
200,1.4151,1.455451
400,1.3562,1.442189
600,1.3572,1.437055
800,1.3522,1.428804
1000,1.3229,1.436336
1200,1.3157,1.435536
1400,1.299,1.433643


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=1400, training_loss=1.3641400527954102, metrics={'train_runtime': 6402.4467, 'train_samples_per_second': 34.976, 'train_steps_per_second': 0.273, 'total_flos': 5.810809133829489e+17, 'train_loss': 1.3641400527954102, 'epoch': 1.6})

# Save checkpoint

In [None]:
# === Save on Google Drive the BEST (or LAST) checkpoint ===
from google.colab import drive
drive.mount("/content/drive")

import os, shutil, glob, time

# Local directory scanned for checkpoints; must match the output_dir passed to SFTConfig.
OUT_DIR = "outputs_sft"  # SFTConfig.output_dir

def pick_checkpoint(out_dir):
    """Choose the checkpoint directory to archive.

    Preference order:
      1. ``trainer.state.best_model_checkpoint`` when it is set and exists on disk.
      2. The ``checkpoint-*`` entry under ``out_dir`` with the latest mtime.

    Returns:
        ``(path, kind)`` with ``kind`` being ``"best"`` or ``"last"``, or
        ``(None, None)`` when no checkpoint is found.
    """
    best_dir = getattr(trainer.state, "best_model_checkpoint", None)
    if best_dir and os.path.isdir(best_dir):
        return best_dir, "best"

    candidates = glob.glob(os.path.join(out_dir, "checkpoint-*"))
    if not candidates:
        return None, None
    # Oldest first, so the final element is the most recently modified checkpoint.
    candidates.sort(key=os.path.getmtime)
    return candidates[-1], "last"

# Resolve which checkpoint to archive; stop here when none is found.
ckpt_path, kind = pick_checkpoint(OUT_DIR)
assert ckpt_path is not None, f"No checkpoint found in: {OUT_DIR}"

# The destination folder name encodes the checkpoint kind and a timestamp.
stamp = time.strftime("%Y%m%d_%H%M%S")
GDRIVE_DIR = "/content/drive/MyDrive/mt_checkpoints/qwen25_sft_{}_{}".format(kind, stamp)

# Copy the checkpoint directory, then save the tokenizer files into the same folder.
shutil.copytree(ckpt_path, GDRIVE_DIR)
tokenizer.save_pretrained(GDRIVE_DIR)

print("Copied " + kind.upper() + " checkpoint in:\n" + GDRIVE_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Copied BEST checkpoint in:
/content/drive/MyDrive/mt_checkpoints/qwen25_sft_best_20250914_235740


# Load + check

In [None]:
# === LOAD: MOUNT + FIND CKPT + LOAD MODEL & DATA (Qwen2.5-3B, 4-bit) ===
# 0) Unsloth
import unsloth
from unsloth import FastLanguageModel

# 1) Google Drive — mount only when needed; outside Colab just report the problem.
import os, glob, time, shutil
try:
    from google.colab import drive
    if os.path.ismount("/content/drive"):
        print("Drive already mounted.")
    else:
        drive.mount("/content/drive")
        print("Drive mounted.")
except Exception as err:
    print("Colab not available:", err)

# --- CONFIG ---
# Model / data locations
MODEL_ID    = "unsloth/Qwen2.5-3B"
MAX_SEQ     = 256
USE_4BIT    = True
SRC         = "de"
TGT         = "en"
DATASET_DIR = "/content/drive/MyDrive/mt_datasets/wmt14_de-en_sample150000__filtered__len169_lid95_r2.25_ck60_trainVAL_filtered_testRAW"
CKPT_ROOT   = "/content/drive/MyDrive/mt_checkpoints"

# Evaluation settings; OUT_DIR is created if it does not exist yet.
SPLIT        = "test"
MAX_EXAMPLES = None
OUT_DIR      = "/content/drive/MyDrive/mt_eval/qwen3BevalCOMET"
os.makedirs(OUT_DIR, exist_ok=True)

# 2) Find ckpt LoRA on Drive
def find_adapter_dirs(root):
    """List the directories under ``root`` that contain a LoRA adapter file.

    The search is recursive and looks for ``adapter_model.safetensors``.

    Returns:
        A list of ``(mtime, directory)`` tuples sorted in descending order
        (most recently modified adapter file first); empty when ``root`` is
        not a directory or holds no adapter.
    """
    if not os.path.isdir(root):
        return []
    pattern = os.path.join(root, "**", "adapter_model.safetensors")
    found = [
        (os.path.getmtime(weights), os.path.dirname(weights))
        for weights in glob.glob(pattern, recursive=True)
    ]
    found.sort(reverse=True)
    return found

hits = find_adapter_dirs(CKPT_ROOT)
if not hits:
    print("No adapter in", CKPT_ROOT)
    raise FileNotFoundError("No adapter_model.safetensors found.")

# hits is sorted newest-first, so the first entry is the most recent adapter.
CKPT_DIR = hits[0][1]
print("CKPT su Drive:", CKPT_DIR)

# 3) Load
import torch, bitsandbytes as bnb
from collections import Counter
from peft import PeftModel
from unsloth.chat_templates import get_chat_template
from datasets import load_from_disk, DatasetDict, load_dataset

# Base model: dtype is left as None in 4-bit mode, float16 otherwise.
dtype = None if USE_4BIT else torch.float16
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = MODEL_ID,
    max_seq_length = MAX_SEQ,
    dtype          = dtype,
    load_in_4bit   = USE_4BIT,
)
# Apply the "qwen-2.5" chat template, matching the training cells.
tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")
if tokenizer.pad_token is None:
    print("Pad was None")
    tokenizer.pad_token = tokenizer.eos_token
# Diagnostic: reports whether the pad token and the EOS token are the same.
print(f"Is equal {tokenizer.pad_token==tokenizer.eos_token}")
# Left padding for generation (the training cell pads on the right).
tokenizer.padding_side = "left"

# Attach the LoRA adapter found on Drive, then switch to inference/eval mode.
model = PeftModel.from_pretrained(base_model, CKPT_DIR)
FastLanguageModel.for_inference(model)
model.eval()

# 4) Load dataset
def nfkc(s: str):
    """Return ``s`` NFKC-normalised with surrounding whitespace removed.

    A falsy input (``None`` or ``""``) yields ``""``. The training cells
    NFKC-normalise the text they feed the model, so the evaluation text is
    normalised the same way here; previously this helper only stripped.
    """
    # Local import so this evaluation cell does not depend on the training cells.
    import unicodedata
    return unicodedata.normalize("NFKC", (s or "")).strip()

def project_to_src_tgt(dset, src_code=SRC, tgt_code=TGT):
    """Expose a split as flat ``src_txt`` / ``tgt_txt`` columns.

    A split that already has both columns is returned unchanged. A split with a
    ``translation`` column gets them added via ``dset.map``, reading each entry
    with ``.get(src_code, "")`` / ``.get(tgt_code, "")`` and passing the value
    through ``nfkc``.

    Raises:
        ValueError: if the split has neither layout.
    """
    available = set(dset.column_names)
    if "src_txt" in available and "tgt_txt" in available:
        return dset
    if "translation" not in available:
        raise ValueError(f"Split not recognized: {available}")

    def _split_pairs(batch):
        pairs = batch["translation"]
        return {
            "src_txt": [nfkc(pair.get(src_code, "")) for pair in pairs],
            "tgt_txt": [nfkc(pair.get(tgt_code, "")) for pair in pairs],
        }

    return dset.map(_split_pairs, batched=True, desc="Project translation → src/tgt")

ds_eval = DatasetDict()
if not os.path.isdir(DATASET_DIR):
    # No preprocessed copy on disk: fall back to the wmt14 de-en test split.
    raw = load_dataset("wmt14", "de-en")
    ds_eval["test"] = project_to_src_tgt(raw["test"])
else:
    ds_proc = load_from_disk(DATASET_DIR)
    for sp in ("train", "validation", "test"):
        if sp not in ds_proc:
            continue
        ds_eval[sp] = project_to_src_tgt(ds_proc[sp])

print("Splits:", {split_name: len(rows) for split_name, rows in ds_eval.items()})

# 5) Check — print a few facts about the loaded adapter and the quantised layers.
print("\n=== Check ===")
print("PeftModel?            ", isinstance(model, PeftModel))
print("Adapter dir           ", CKPT_DIR)
# Both the adapter weights and the adapter config should exist in CKPT_DIR.
print("Files adapter presenti",
      os.path.isfile(os.path.join(CKPT_DIR, "adapter_model.safetensors")),
      os.path.isfile(os.path.join(CKPT_DIR, "adapter_config.json")))
# Number of parameters whose name contains "lora_".
lora_params = [n for n,_ in model.named_parameters() if "lora_" in n]
print("Tensori LoRA          ", len(lora_params))
# Count the bitsandbytes 4-bit / 8-bit linear modules by class name.
mods = [m for m in model.modules() if isinstance(m, (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt))]
print("Layer types           ", Counter(type(m).__name__ for m in mods))

# 6) Variables — device handle and the system prompt shared by the prediction cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LANG_NAME = {"de":"German","en":"English"}
# Display name for a language code; unknown codes fall back to the upper-cased code.
def lang_name(c): return LANG_NAME.get(c.lower(), c.upper())
SYSTEM_TMPL = "You are a translation engine. Translate from {src_name} ({src_code}) to {tgt_name} ({tgt_code})."
SYSTEM_TEXT = SYSTEM_TMPL.format(
    src_name=lang_name(SRC), src_code=SRC, tgt_name=lang_name(TGT), tgt_code=TGT
)
print("\n Ready: model/dataset loaded.")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Mounted at /content/drive
Drive mounted.
✓ CKPT su Drive: /content/drive/MyDrive/mt_checkpoints/qwen25_sft_best_20250914_235740
==((====))==  Unsloth 2025.9.5: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Is equal False
Splits: {'train': 111966, 'validation': 2780, 'test': 3003}

=== Check ===
PeftModel?             True
Adapter dir            /content/drive/MyDrive/mt_checkpoints/qwen25_sft_best_20250914_235740
Files adapter presenti True True
Tensori LoRA           504
Layer types            Counter({'Linear4bit': 237})

 Ready: model/dataset loaded.


In [None]:
# === PREDICT  ===
import os, time, re, torch, pandas as pd
from unsloth.chat_templates import get_chat_template

# Config
SPLIT          = "test"
BATCH_SIZE     = 512
MAX_EXAMPLES   = None    # None = use the whole split
MAX_NEW_TOKENS = 128
DO_SAMPLE      = False
PRINT_EVERY    = 200

# Tokenizer used for generation: qwen-2.5 template, a pad token, left padding.
tok = get_chat_template(tokenizer, chat_template="qwen-2.5")
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
tok.padding_side = "left"
def build_messages(src):
    """Build the two-turn chat (system prompt, then the stripped source text) for one input."""
    system_turn = {"role": "system", "content": SYSTEM_TEXT}
    user_turn = {"role": "user", "content": src.strip()}
    return [system_turn, user_turn]
# Data: pick the split and keep at most MAX_EXAMPLES rows.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dset = ds_eval[SPLIT]
take = len(dset)
if MAX_EXAMPLES is not None:
    take = min(MAX_EXAMPLES, take)
sources = dset.select(range(take))["src_txt"]
refs = dset.select(range(take))["tgt_txt"]

# Stop & ban: ids of the template markers; generation stops on <|im_end|> or EOS.
im_end = tok.convert_tokens_to_ids("<|im_end|>")
eos = tok.eos_token_id
im_start = tok.convert_tokens_to_ids("<|im_start|>")
eos_ids = list(filter(lambda token_id: token_id is not None, (im_end, eos)))
# Postprocess
# Substrings that mark the end of the useful part of a decoded generation.
MARKERS = ("<|im_end|>","<|endoftext|>","<|im_start|>","初始化","始化")
def clean(txt, first_sentence_only=False):
    """Post-process one decoded generation into a hypothesis string.

    Steps: cut at the earliest marker from ``MARKERS``, drop surrounding
    whitespace, keep only the first line, remove U+FFFD replacement characters.

    Args:
        txt: decoded model output (special tokens still present).
        first_sentence_only: when True, additionally keep only the prefix up to
            the first ``.``/``!``/``?`` that is followed by a space or the end
            of the text. This used to be applied unconditionally, which
            truncated translations at abbreviations ("Mr. Smith ..." -> "Mr.")
            and discarded every sentence after the first, so it is now opt-in.

    Returns:
        The cleaned hypothesis (possibly empty).
    """
    cut_points = [txt.find(m) for m in MARKERS if m in txt]
    txt = txt[:min(cut_points + [len(txt)])]
    # Strip first so that a leading newline does not yield an empty hypothesis.
    txt = txt.strip().split("\n", 1)[0]
    if first_sentence_only:
        m = re.search(r"^(.{1,300}?[\.!\?])( |$)", txt)
        if m:
            txt = m.group(1)
    return txt.replace("\uFFFD","").strip()
print(f"[BATCH] {take} examples, bs={BATCH_SIZE}")
t0=time.time(); hyps=[]
with torch.inference_mode():
    for i in range(0, take, BATCH_SIZE):
        batch = sources[i:i+BATCH_SIZE]
        # Render each source as a chat prompt that ends with the assistant header.
        prompts = [tok.apply_chat_template(build_messages(s), tokenize=False, add_generation_prompt=True) for s in batch]
        # Padding side only matters at tokenisation time, so it is set once here,
        # right before the batch is encoded.
        tok.padding_side="left"
        # NOTE(review): truncation=True may cut the end of an over-long prompt
        # (including the assistant header) if the tokenizer truncates on the
        # right — confirm, or pre-truncate the source text instead.
        enc = tok(prompts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SEQ).to(dev)
        # With left padding every prompt occupies the first `cut` positions, so the
        # generated tokens start at column `cut` for every row.
        cut = enc["input_ids"].shape[1]
        # Greedy/sampling decode; the beam-search-only early_stopping flag is not passed.
        outs = model.generate(
            input_ids=enc["input_ids"], attention_mask=enc.get("attention_mask"),
            max_new_tokens=MAX_NEW_TOKENS, do_sample=DO_SAMPLE, eos_token_id=eos_ids,
            pad_token_id=tok.pad_token_id, use_cache=True,
        )
        # Special tokens are kept while decoding because clean() cuts at them.
        hyps.extend(clean(tok.decode(gen, skip_special_tokens=False)) for gen in outs[:, cut:])
        # Report progress whenever a multiple of PRINT_EVERY is crossed, and at the end.
        done = min(i+BATCH_SIZE, take)
        if done // PRINT_EVERY > i // PRINT_EVERY or done >= take: print(f"[{done}/{take}]")
print(f"Done in {time.time()-t0:.1f}s")
# Save csv and print preview
stamp=time.strftime("%Y%m%d-%H%M%S"); csv_path=os.path.join(OUT_DIR, f"gen_de-en_{SPLIT}_{stamp}.csv")
pd.DataFrame({"src":list(sources), "hyp":hyps, "ref":list(refs)}).to_csv(csv_path, index=False)
print("Saved:", csv_path)
for k in range(min(3,len(hyps))):
    print(f"\n[{k}] SRC: {sources[k]}\nHYP: {hyps[k]}\nREF: {refs[k]}")

[BATCH] 3003 examples, bs=512
inizio batch
inizio batch
inizio batch
inizio batch
inizio batch
inizio batch
[3003/3003]
Done in 265.7s
Saved: /content/drive/MyDrive/mt_eval/qwen3BevalCOMET/gen_de-en_test_20250916-001201.csv

[0] SRC: Gutach: Noch mehr Sicherheit für Fußgänger
HYP: Assessor: More safety for pedestrians.
REF: Gutach: Increased safety for pedestrians

[1] SRC: Sie stehen keine 100 Meter voneinander entfernt: Am Dienstag ist in Gutach die neue B 33-Fußgängerampel am Dorfparkplatz in Betrieb genommen worden - in Sichtweite der älteren Rathausampel.
HYP: They are only 100 metres apart: on Tuesday, the new B 33 pedestrian crossing was opened in Gutach, in sight of the older Rathaus crossing.
REF: They are not even 100 metres apart: On Tuesday, the new B 33 pedestrian lights in Dorfparkplatz in Gutach became operational - within view of the existing Town Hall traffic lights.

[2] SRC: Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?
HYP: Two factories so clos

# COMET

In [None]:
# === COMET eval: load CSV (src,hyp,ref) ===
import os, glob, pandas as pd, torch, time, shutil
import unsloth
from unsloth import FastLanguageModel
try:
    from google.colab import drive
    if not os.path.ismount("/content/drive"):
        drive.mount("/content/drive")
        print("Drive mounted.")
    else:
        print("Drive already mounted.")
except Exception as e:
    print("Colab not available:", e)

# 0) Install COMET on first use
try:
    from comet import download_model, load_from_checkpoint
except ImportError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "unbabel-comet"])
    from comet import download_model, load_from_checkpoint

# 1) Path of the generations CSV to score (set CSV_PATH to the file to evaluate)
OUT_DIR = "/content/drive/MyDrive/mt_eval/qwen3BevalCOMET"
CSV_PATH = "/content/drive/MyDrive/mt_eval/qwen3BevalCOMET/gen_de-en_test_20250916-001201.csv" #customize
if not os.path.isfile(CSV_PATH):
    raise FileNotFoundError(f"CSV not found: {CSV_PATH}")
print("CSV found:", CSV_PATH)

# 2) Read — every column as text, with NaN detection disabled, so that an empty
#    hypothesis stays "" and literal text such as "NA" or "null" is not turned
#    into a float NaN before it reaches COMET.
df = pd.read_csv(CSV_PATH, dtype=str, keep_default_na=False)
required = {"src","hyp","ref"}
missing = required - set(df.columns)
assert not missing, f"Missing columns in csv: {missing}"
data = [{"src": s, "mt": h, "ref": r} for s,h,r in zip(df["src"], df["hyp"], df["ref"])]

# 3) Load COMET (ref-based). Bound to its own name so the translation `model`
#    loaded by the earlier cells is not overwritten.
model_ckpt = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(model_ckpt)

# 4) Predict
use_gpus = 1 if torch.cuda.is_available() else 0
output = comet_model.predict(data, batch_size=64, gpus=use_gpus, progress_bar=True)
system_score = float(output.system_score)
print(f"COMET (system): {system_score:.6f}")
# 5) Save COMET score next to the CSV
score_path = os.path.splitext(CSV_PATH)[0] + "_comet.txt"
with open(score_path, "w") as f:
    f.write(f"{system_score:.6f}\n")

print("Saved:", score_path)


Drive already mounted.
CSV found: /content/drive/MyDrive/mt_eval/qwen3BevalCOMET/gen_de-en_test_20250916-001201.csv


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/sta

Prediction({'scores': [0.8927652835845947, 0.7647638320922852, 0.6845570206642151, 0.8375985622406006, 0.8480094075202942, 0.8370135426521301, 0.8407938480377197, 0.8248785138130188, 0.8547746539115906, 0.8231543898582458, 0.8439663052558899, 0.8647288680076599, 0.8259732723236084, 0.763944149017334, 0.7457390427589417, 0.8799477219581604, 0.8735669255256653, 0.8060024976730347, 0.9095523953437805, 0.8526321053504944, 0.8335078954696655, 0.9200155138969421, 0.8423763513565063, 0.8477757573127747, 0.8154047131538391, 0.8102858662605286, 0.8019640445709229, 0.7497314214706421, 0.7094261050224304, 0.6986905932426453, 0.7038756608963013, 0.8170358538627625, 0.8121245503425598, 0.7542086243629456, 0.7542465329170227, 0.7662191987037659, 0.5550954937934875, 0.892902672290802, 0.840292751789093, 0.9509729743003845, 0.8627979159355164, 0.887315034866333, 0.882462203502655, 0.855476975440979, 0.8113126754760742, 0.9200817942619324, 0.8407248854637146, 0.8477080464363098, 0.8272284865379333, 0.9

In [None]:
# %% Mount Drive + strip widget state from the notebook and save a "clean" copy (GitHub-compatible)
from google.colab import drive
import os, sys, subprocess

# --- MOUNT DRIVE ---
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive")

# === CONFIG ===
NB_IN  = "/content/drive/MyDrive/Colab Notebooks/de-en_qwen_SFT_updated.ipynb"  # <-- path of the notebook to clean
OVERWRITE = False                                              # True to overwrite the original file
NB_OUT = NB_IN if OVERWRITE else NB_IN.replace(".ipynb", "_gh.ipynb")

# --- deps ---
try:
    import nbformat  # type: ignore
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "nbformat"])
    import nbformat  # type: ignore

# --- load ---
nb = nbformat.read(NB_IN, as_version=4)

# --- strip widget state at notebook level ---
for k in ["widgets", "widget_state", "application/vnd.jupyter.widget-state+json"]:
    nb.metadata.pop(k, None)
if isinstance(nb.metadata.get("extensions"), dict):
    nb.metadata["extensions"].pop("jupyter_widget_state", None)

# --- strip per-cell widget metadata & outputs ---
removed = 0
for cell in nb.cells:
    if isinstance(getattr(cell, "metadata", {}), dict):
        for k in ["widgets", "widget_view", "widget_state"]:
            cell.metadata.pop(k, None)
    # Only code cells carry outputs. Skipping the others avoids adding an
    # `outputs` key to markdown/raw cells when the filtered list is assigned back.
    if cell.get("cell_type") != "code":
        continue
    outs = []
    for out in getattr(cell, "outputs", []):
        data = getattr(out, "data", None)
        is_widget = False
        if isinstance(data, dict):
            # Drop every widget MIME entry from this output's data bundle.
            wk = [k for k in list(data) if k.startswith("application/vnd.jupyter.widget")]
            if wk:
                is_widget = True
                for k in wk: data.pop(k, None)
            # An output that held nothing but widget data is removed entirely.
            if is_widget and not data:
                removed += 1
                continue
        outs.append(out)
    cell.outputs = outs

# --- save ---
nbformat.write(nb, NB_OUT)
print("Saved:", NB_OUT)
print("Widget outputs removed:", removed)
if OVERWRITE:
    print("ATTENZIONE: file originale sovrascritto.")


ValueError: mount failed