<a href="https://colab.research.google.com/github/GiacomoT23/Euro-MachineTranslation-SFT-and-RL/blob/main/de_en_qwen_SFT_updated_gh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [None]:
%%capture
# %%capture (which must stay on the first line) hides the installer output of this cell.
# Upgrade pip, then install the fine-tuning stack used by the rest of the notebook.
# NOTE(review): no versions are pinned; the run recorded in the outputs below reports
# Unsloth 2025.9.4, Transformers 4.56.1 and Torch 2.8.0+cu126.
!pip install --no-input --upgrade pip
!pip install --no-input unsloth bitsandbytes accelerate peft trl sentencepiece protobuf hf_transfer
# transformers is installed in its own step, after the packages above.
!pip install --no-input transformers

# Model download

In [None]:
from unsloth import FastLanguageModel
import torch, os

# Run-wide switches: 4-bit loading, gradient-checkpointing mode, maximum
# sequence length (in tokens) and the base checkpoint to fine-tune.
USE_4BIT = True
USE_GC   = "unsloth"
MAX_SEQ  = 256
MODEL_ID = "unsloth/Qwen2.5-3B"

# dtype is left as None in 4-bit mode; float16 is requested otherwise.
dtype = None if USE_4BIT else torch.float16
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = MODEL_ID,
    max_seq_length = MAX_SEQ,
    dtype          = dtype,
    load_in_4bit   = USE_4BIT,
)

# LoRA adapters (rank 16, alpha 16) on the attention and MLP projection layers.
# NOTE(review): the recorded output of this cell shows Unsloth warning that only
# dropout = 0 is supported for fast patching and that dropout = 0.05 causes a
# performance hit ("0 QKV layers, 0 O layers and 0 MLP layers" patched).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = USE_GC,
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Count every parameter that still requires gradients after wrapping and
# print a one-line summary of the configuration.
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model: {MODEL_ID} | 4bit={USE_4BIT} | GC={USE_GC} | seq={MAX_SEQ}")
print(f"Trainable params (LoRA+heads): ~{n_params:,}")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.4: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.9.4 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model: unsloth/Qwen2.5-3B | 4bit=True | GC=unsloth | seq=256
Trainable params (LoRA+heads): ~29,933,568


# Loading splits
Load the preprocessed dataset saved on Google Drive.

In [None]:
# === Load filtered dataset from Drive ===
# Mount Google Drive so the dataset saved under MyDrive is reachable.
from google.colab import drive
drive.mount("/content/drive")

from datasets import load_from_disk, DatasetDict

# Directory holding the dataset previously written to disk.
SAVE_DIR = "/content/drive/MyDrive/mt_datasets/wmt14_de-en_sample150000__filtered__len169_lid95_r2.25_ck60_trainVAL_filtered_testRAW"
ds_proc = load_from_disk(SAVE_DIR)

# .get() yields None for a split that is missing from the saved dataset;
# the formatting cell checks validation/test for None before using them.
train_filt = ds_proc.get("train")
val_filt   = ds_proc.get("validation")
test_filt  = ds_proc.get("test")

# Number of rows per split.
print({k: len(v) for k, v in ds_proc.items()})

Mounted at /content/drive
{'train': 111966, 'validation': 2780, 'test': 3003}


# Formatting for SFT in unsloth
Format each sentence pair with a system–user–assistant chat template. The model is trained only on the assistant responses.

In [None]:
from datasets import DatasetDict
import unicodedata
from unsloth.chat_templates import get_chat_template

# Attach the "qwen-2.5" chat template to the tokenizer via the Unsloth helper.
tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

# Translation direction: source and target language codes.
SRC, TGT = "de", "en"

def nfkc(s: str) -> str:
    """Return ``s`` in Unicode NFKC form; falsy input (``None`` or ``""``) gives ``""``."""
    text = s if s else ""
    return unicodedata.normalize("NFKC", text)

def ensure_src_tgt_cols(dset):
    """Guarantee that a split exposes ``src_txt`` and ``tgt_txt`` columns.

    A split that already has both columns is returned unchanged. A split with
    a ``translation`` column gets the two columns added by mapping over it,
    reading the ``SRC`` / ``TGT`` entries of each translation record and
    NFKC-normalising them.

    Raises:
        KeyError: if the split has neither layout.
    """
    available = set(dset.column_names)

    if {"src_txt", "tgt_txt"} <= available:
        return dset

    if "translation" not in available:
        raise KeyError("Split has not src_txt/tgt_txt or translation.")

    def to_cols(batch):
        # One batch of translation records -> two parallel text columns.
        pairs = batch["translation"]
        return {
            "src_txt": [nfkc(pair.get(SRC, "")) for pair in pairs],
            "tgt_txt": [nfkc(pair.get(TGT, "")) for pair in pairs],
        }

    return dset.map(
        to_cols,
        batched=True,
        batch_size=4096,
        desc="[normalize] add src_txt/tgt_txt from translation",
    )

# Give every available split src_txt/tgt_txt columns. The train split is
# processed unconditionally; validation and test stay None when absent.
train_norm = ensure_src_tgt_cols(train_filt)
val_norm   = ensure_src_tgt_cols(val_filt) if val_filt is not None else None
test_norm  = ensure_src_tgt_cols(test_filt) if test_filt is not None else None

# Display names for the language codes that appear in the system prompt.
LANG_NAME = {"de": "German", "en": "English"}


def lang_name(code: str) -> str:
    """Map a language code to its display name (case-insensitive lookup).

    Codes missing from ``LANG_NAME`` are returned upper-cased.
    """
    key = code.lower()
    if key in LANG_NAME:
        return LANG_NAME[key]
    return code.upper()


# System prompt shared by every example; filled with language names and codes.
SYSTEM_TMPL = "You are a translation engine. Translate from {src_name} ({src_code}) to {tgt_name} ({tgt_code})."

def to_text_chatml(batch, src_code=SRC, tgt_code=TGT):
    """Render a batch of sentence pairs as chat-template strings.

    Args:
        batch: mapping with parallel ``src_txt`` and ``tgt_txt`` sequences.
        src_code: source language code used in the system prompt.
        tgt_code: target language code used in the system prompt.

    Returns:
        ``{"text": [...]}`` with one string per pair. A pair whose source or
        target is empty after stripping produces ``""`` so it can be filtered
        out afterwards.
    """
    sources, targets = batch["src_txt"], batch["tgt_txt"]
    system_prompt = SYSTEM_TMPL.format(
        src_name=lang_name(src_code), tgt_name=lang_name(tgt_code),
        src_code=src_code, tgt_code=tgt_code,
    )

    def render(source, target):
        # Returns "" for an unusable pair, otherwise the full conversation
        # (system, user, assistant) without a trailing generation prompt.
        source_clean = (source or "").strip()
        target_clean = (target or "").strip()
        if not (source_clean and target_clean):
            return ""
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": source_clean},
            {"role": "assistant", "content": target_clean},
        ]
        return tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )

    return {"text": [render(src, tgt) for src, tgt in zip(sources, targets)]}

# Turn the train split into a single "text" column (all original columns are
# dropped), then remove the rows that to_text_chatml marked with "".
train_ds = train_norm.map(
    to_text_chatml, batched=True,
    remove_columns=train_norm.column_names,
    desc="[chatml] make text (train)",
).filter(lambda ex: len(ex["text"]) > 0)

# Same treatment for validation, when that split exists; otherwise val_ds stays None.
val_ds = None
if val_norm is not None:
    val_ds = val_norm.map(
        to_text_chatml, batched=True,
        remove_columns=val_norm.column_names,
        desc="[chatml] make text (val)",
    ).filter(lambda ex: len(ex["text"]) > 0)

# Sanity check: split sizes and the first two rendered training examples.
print("Train/Val sizes:", len(train_ds), (0 if val_ds is None else len(val_ds)))
print("Esempio text:\n", train_ds[0]["text"])
print("Esempio text:\n", train_ds[1]["text"])



[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/111966 [00:00<?, ? examples/s]

[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/2780 [00:00<?, ? examples/s]

[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/3003 [00:00<?, ? examples/s]

[chatml] make text (train):   0%|          | 0/111966 [00:00<?, ? examples/s]

Filter:   0%|          | 0/111966 [00:00<?, ? examples/s]

[chatml] make text (val):   0%|          | 0/2780 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2780 [00:00<?, ? examples/s]

Train/Val sizes: 111966 2780
Esempio text:
 <|im_start|>system
You are a translation engine. Translate from German (de) to English (en).<|im_end|>
<|im_start|>user
Dabei handelt es sich um das Berliner Übereinkommen von 1937 und das Straßburger Übereinkommen von 1973, die beide recht alt, der heutigen Sachlage in Europa nicht mehr angemessen und demnach weitgehend überholt sind.<|im_end|>
<|im_start|>assistant
These are the 1937 Berlin agreement and the 1973 Strasbourg agreement which, because they are quite old, are not relevant to the situation we have in Europe today and have therefore become largely obsolete.<|im_end|>

Esempio text:
 <|im_start|>system
You are a translation engine. Translate from German (de) to English (en).<|im_end|>
<|im_start|>user
Dieser Mix führte wieder zu lebhaften Gesprächen unter den Teilnehmern in den Pausen und während der Abendveranstaltung, die von allen Beteiligten an dieser Veranstaltung in den Gesamtbeurteilungen als sehr informativ empfunden wurde

# SFT
The SFT hyperparameters are set here.

In [None]:
# === SFT with TRL + W&B (Qwen2.5-3B, QLoRA) — ChatML + train_on_responses_only ===
import os, time, wandb, torch
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback
from unsloth.chat_templates import train_on_responses_only

# Pad token: fall back to EOS when the tokenizer defines none; pad on the right for training.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# W&B setup. A single project/run name is used for both the environment
# variables and wandb.init(): an explicit `name=` passed to wandb.init() wins
# over WANDB_NAME, so setting two different names only hid the one in the env.
WANDB_PROJECT  = "euromt"
WANDB_RUN_NAME = "qwen25-3b-wmt14-de-en-sft_chatml"

os.environ["WANDB_PROJECT"]   = WANDB_PROJECT
os.environ["WANDB_NAME"]      = WANDB_RUN_NAME
os.environ["WANDB_WATCH"]     = "false"
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["WANDB_SILENT"]    = "true"

# Ask for the API key interactively instead of hard-coding it in the notebook.
if not os.getenv("WANDB_API_KEY"):
    from getpass import getpass
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")
wandb.login()
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_RUN_NAME,
    # Log the values actually used by the model cell rather than copies of them.
    # lora_r mirrors the literal `r = 16` passed to get_peft_model.
    config={
        "model": MODEL_ID,
        "max_seq_len": MAX_SEQ,
        "use_4bit": USE_4BIT,
        "lora_r": 16,
        "train_rows": len(train_ds),
    },
    settings=wandb.Settings(start_method="thread"),
)

# Sequence length and batch geometry for training.
MAX_SEQ    = 256
PER_DEV_BS = 128
GRAD_ACCUM = 1

# Mixed-precision selection. bf16 is chosen from the GPU's compute capability
# (major version >= 8, i.e. Ampere or newer) instead of matching the string
# "a100", so other bf16-capable GPUs no longer fall back to fp16. Older GPUs
# use fp16; without a GPU both flags are False.
use_gpu  = torch.cuda.is_available()
name     = torch.cuda.get_device_name(0).lower() if use_gpu else ""
is_a100  = "a100" in name  # kept for reference; no longer drives the precision choice
bf16_flag = bool(use_gpu and torch.cuda.get_device_capability(0)[0] >= 8)
fp16_flag = bool(use_gpu and not bf16_flag)

# Periodic evaluation, best-checkpoint reloading and early stopping all need a
# validation split. They are switched on and off together: previously only
# eval_strategy was conditional, so running without validation data failed
# (load_best_model_at_end and EarlyStoppingCallback both require evaluation).
has_val = val_ds is not None

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    dataset_text_field="text",
    max_length=MAX_SEQ,
    packing=False,
    args=SFTConfig(
        output_dir="outputs_sft",
        report_to=["wandb"],

        # Batch geometry and schedule length (max_steps=-1: epochs decide).
        per_device_train_batch_size=PER_DEV_BS,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=2,
        max_steps=-1,

        # Optimisation.
        learning_rate=1e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        weight_decay=0.01,
        max_grad_norm=0.6,
        optim="adamw_8bit",

        #group_by_length=True,
        dataloader_num_workers=2,
        dataloader_pin_memory=True,

        # Logging, evaluation and checkpointing share the same 150-step cadence
        # so that every saved checkpoint has a matching eval_loss.
        logging_steps=20,
        eval_strategy="steps" if has_val else "no",
        eval_steps=150,
        save_strategy="steps",
        save_steps=150,
        save_total_limit=3,

        # Best-model tracking (lowest eval_loss), only when evaluation runs.
        metric_for_best_model="eval_loss" if has_val else None,
        greater_is_better=False,
        load_best_model_at_end=has_val,

        seed=3407,
        bf16=bf16_flag,
        fp16=fp16_flag,
    ),
    # Stop after 3 evaluations without any eval_loss improvement.
    callbacks=(
        [EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)]
        if has_val else []
    ),
)

# Loss only on responses: wrap the trainer with Unsloth's helper, passing the
# literal markers that open the instruction and response parts of each example.
# NOTE(review): the instruction marker used here is the *system* header, which
# is the first header in every rendered example of this notebook; confirm this
# masks as intended if multi-turn data is ever used.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>system\n",
    response_part    = "<|im_start|>assistant\n",
)

# Summary of the prepared run: dataset sizes and the selected precision mode.
print("Train/Val sizes:", len(train_ds), (0 if val_ds is None else len(val_ds)))
print("Precision:", "bf16" if bf16_flag else ("fp16" if fp16_flag else "fp32"))
print("Ready. Training on assistant spans only (Qwen 2.5 ChatML).")

W&B API key: ··········


  | |_| | '_ \/ _` / _` |  _/ -_)


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/111966 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/2780 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/111966 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/2780 [00:00<?, ? examples/s]

Train/Val sizes: 111966 2780
Precision: bf16
Ready. Training on assistant spans only (Qwen 2.5 ChatML).


In [None]:
# Launch training. `resume` is forwarded as resume_from_checkpoint; False is
# used for this run. NOTE(review): a truthy value or a path presumably resumes
# from a saved checkpoint — confirm against the Trainer documentation.
resume = False
trainer.train(resume_from_checkpoint=resume)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 111,966 | Num Epochs = 2 | Total steps = 1,750
O^O/ \_/ \    Batch size per device = 128 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (128 x 1 x 1) = 128
 "-____-"     Trainable parameters = 29,933,568 of 3,115,872,256 (0.96% trained)


Step,Training Loss,Validation Loss
200,1.4151,1.455451
400,1.3562,1.442189
600,1.3572,1.437055
800,1.3522,1.428804
1000,1.3229,1.436336
1200,1.3157,1.435536
1400,1.299,1.433643


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=1400, training_loss=1.3641400527954102, metrics={'train_runtime': 6402.4467, 'train_samples_per_second': 34.976, 'train_steps_per_second': 0.273, 'total_flos': 5.810809133829489e+17, 'train_loss': 1.3641400527954102, 'epoch': 1.6})

# Save checkpoint

In [None]:
# === Save on Google Drive the BEST (or LAST) checkpoint ===
from google.colab import drive
drive.mount("/content/drive")

import os, shutil, glob, time

OUT_DIR = "outputs_sft"  # SFTConfig.output_dir

def pick_checkpoint(out_dir):
    """Choose which checkpoint directory to export.

    Preference order:
      1. the trainer's ``best_model_checkpoint``, when a ``trainer`` object
         exists in the notebook globals and that directory is present on disk;
      2. the ``checkpoint-<step>`` directory under ``out_dir`` with the highest
         step number (modification time only breaks ties), so a copied or
         touched directory cannot be mistaken for the latest one.

    Args:
        out_dir: directory that contains the ``checkpoint-*`` sub-directories.

    Returns:
        ``(path, "best")``, ``(path, "last")`` or ``(None, None)`` when nothing
        usable is found.
    """
    # `trainer` may be missing (e.g. after a kernel restart); in that case fall
    # back to scanning the checkpoints on disk instead of raising NameError.
    state = getattr(globals().get("trainer"), "state", None)
    best = getattr(state, "best_model_checkpoint", None)
    if best and os.path.isdir(best):
        return best, "best"

    def step_of(path):
        # "checkpoint-1200" -> 1200; names without a numeric suffix sort first.
        suffix = os.path.basename(path).rsplit("-", 1)[-1]
        return int(suffix) if suffix.isdigit() else -1

    ckpts = [
        path
        for path in glob.glob(os.path.join(out_dir, "checkpoint-*"))
        if os.path.isdir(path)
    ]
    if ckpts:
        latest = max(ckpts, key=lambda path: (step_of(path), os.path.getmtime(path)))
        return latest, "last"
    return None, None

# Resolve the checkpoint to export; abort if the output directory holds none.
ckpt_path, kind = pick_checkpoint(OUT_DIR)
assert ckpt_path is not None, f"No checkpoint found in: {OUT_DIR}"

# Destination folder on Drive, tagged with the checkpoint kind and a timestamp.
stamp = time.strftime("%Y%m%d_%H%M%S")
GDRIVE_DIR = f"/content/drive/MyDrive/mt_checkpoints/qwen25_sft_{kind}_{stamp}"

# Copy the whole checkpoint directory, then store the tokenizer next to it.
shutil.copytree(ckpt_path, GDRIVE_DIR)
tokenizer.save_pretrained(GDRIVE_DIR)  # save also tokenizer

print(f"Copied {kind.upper()} checkpoint in:\n{GDRIVE_DIR}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Copied BEST checkpoint in:
/content/drive/MyDrive/mt_checkpoints/qwen25_sft_best_20250914_235740


# Load checkpoint

In [None]:
# === LOAD best checkpoint (LoRA) + tokenizer + dataset from Drive ===
from google.colab import drive
drive.mount("/content/drive")

import os, glob, unicodedata, torch
from unsloth import FastLanguageModel
from peft import PeftModel
from datasets import load_from_disk, DatasetDict
from unsloth.chat_templates import get_chat_template

# --- CONFIG ---
MODEL_ID   = "unsloth/Qwen2.5-3B"
MAX_SEQ    = 256
USE_4BIT   = True
SRC, TGT   = "de", "en"

# Checkpoint on Drive (the directory written by the "Save checkpoint" cell).
# Replace the YYYYMMDD_HHMMSS placeholder with the timestamp of the run to load.
CKPT_DIR   = "/content/drive/MyDrive/mt_checkpoints/qwen25_sft_best_YYYYMMDD_HHMMSS"
print("Checkpoint:", CKPT_DIR)
# Fail fast with a clear message before the base model is loaded: an unedited
# placeholder or a mistyped path would otherwise only surface later, when the
# adapters are attached.
if not os.path.isdir(CKPT_DIR):
    raise FileNotFoundError(
        f"Checkpoint directory not found: {CKPT_DIR!r}. "
        "Set CKPT_DIR to the folder created by the 'Save checkpoint' cell."
    )

# Dataset
DATASET_DIR = "/content/drive/MyDrive/mt_datasets/wmt14_de-en_proc"

# --- Load base model + tokenizer ---
# dtype is left as None in 4-bit mode; float16 is requested otherwise.
dtype = None if USE_4BIT else torch.float16
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = MODEL_ID,
    max_seq_length = MAX_SEQ,
    dtype          = dtype,
    load_in_4bit   = USE_4BIT,
)
# Apply the same chat template that was used to build the training text.
tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

# Keep the pad/EOS ids of the model config and generation config in sync with the tokenizer.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
base_model.generation_config.pad_token_id = tokenizer.pad_token_id
base_model.generation_config.eos_token_id = tokenizer.eos_token_id
base_model.config.pad_token_id = tokenizer.pad_token_id
base_model.config.eos_token_id = tokenizer.eos_token_id

# --- Attach LoRA adapters ---
model = PeftModel.from_pretrained(base_model, CKPT_DIR)
model.eval()

# --- Load standardized dataset ---
def nfkc(s: str) -> str:
    """Strip surrounding whitespace and return the NFKC-normalised text (``None`` -> ``""``)."""
    trimmed = (s or "").strip()
    return unicodedata.normalize("NFKC", trimmed)

def project_to_src_tgt(dset, src_code=SRC, tgt_code=TGT):
    """Return a split that exposes ``src_txt`` and ``tgt_txt`` columns.

    Args:
        dset: split object providing ``column_names`` and ``map``.
        src_code: key of the source text inside each ``translation`` record.
        tgt_code: key of the target text inside each ``translation`` record.

    Returns:
        ``dset`` itself when both columns already exist; otherwise the result
        of mapping over the ``translation`` column, with both texts passed
        through ``nfkc``.

    Raises:
        ValueError: if the split has neither layout.
    """
    available = set(dset.column_names)

    if {"src_txt", "tgt_txt"} <= available:
        return dset

    if "translation" not in available:
        raise ValueError(f"Split schema non riconosciuto: {available}")

    def to_cols(batch):
        # One batch of translation records -> two parallel text columns.
        pairs = batch["translation"]
        return {
            "src_txt": [nfkc(pair.get(src_code, "")) for pair in pairs],
            "tgt_txt": [nfkc(pair.get(tgt_code, "")) for pair in pairs],
        }

    return dset.map(to_cols, batched=True, desc="Project translation → src/tgt")

from datasets import load_dataset
# Collect the evaluation splits, each with src_txt/tgt_txt columns.
ds_eval = DatasetDict()
if os.path.isdir(DATASET_DIR):
    # Preferred source: the dataset saved on disk, keeping whichever of the
    # three standard splits it contains.
    ds_proc = load_from_disk(DATASET_DIR)
    for sp in ("train","validation","test"):
        if sp in ds_proc: ds_eval[sp] = project_to_src_tgt(ds_proc[sp])
else:
    # Fallback when DATASET_DIR does not exist: load the wmt14 de-en dataset
    # and keep only its test split.
    # NOTE(review): this fallback is silent, and DATASET_DIR differs from the
    # SAVE_DIR used by the training cells — confirm which test set is intended.
    raw = load_dataset("wmt14", "de-en")
    ds_eval["test"] = project_to_src_tgt(raw["test"])

print("✓ Splits:", {k: len(v) for k, v in ds_eval.items()})


# Evaluate with **COMET**

In [None]:
# === Fast GENERATION + COMET evaluation on TEST (ChatML-consistent) ===
!pip -q install "unbabel-comet>=2.2.4"

import os, time, numpy as np, pandas as pd, torch
from tqdm.auto import tqdm
from comet import download_model, load_from_checkpoint
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# Put the model on the GPU when one is available, switch Unsloth to its
# inference mode and disable training-time behaviour.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
FastLanguageModel.for_inference(model)
model.eval()

# ----------------------- Config -----------------------
# Split to score, optional cap on the number of examples (None = all),
# generation/COMET batch sizes, decoding settings and the output root on Drive.
SPLIT               = "test"
MAX_EVAL_EXAMPLES   = None
GEN_BATCH_SIZE      = 128
MAX_NEW_TOKENS      = 128
DO_SAMPLE           = False
COMET_BATCH_SIZE    = 128
BASE_DIR            = "/content/drive/MyDrive/mt_eval"

# Direction, language names and system prompt: same values as in the training
# cells, redefined here so that this cell does not depend on them.
SRC, TGT = "de", "en"
LANG_NAME = {"de": "German", "en": "English"}
def lang_name(code: str) -> str: return LANG_NAME.get(code.lower(), code.upper())

SYSTEM_TMPL = "You are a translation engine. Translate from {src_name} ({src_code}) to {tgt_name} ({tgt_code})."

# Select the first `take` rows of the split and pull sources/references into lists.
assert SPLIT in ds_eval and {"src_txt","tgt_txt"}.issubset(ds_eval[SPLIT].column_names)
dset = ds_eval[SPLIT]
n_all = len(dset)
take = n_all if MAX_EVAL_EXAMPLES is None else min(MAX_EVAL_EXAMPLES, n_all)
subset = dset.select(range(take))
sources = list(subset["src_txt"])
refs    = list(subset["tgt_txt"])
print(f"Evaluating {len(sources)}/{n_all} examples on '{SPLIT}'")

# ----------------------- Generation (ChatML) -----------------------
@torch.inference_mode()
def generate_batch_chatml(src_batch, src_code=SRC, tgt_code=TGT):
    """Translate a batch of source sentences with the chat-formatted prompt.

    Each source is wrapped in the same system/user messages used for training,
    the generation prompt is appended, and the model decodes up to
    ``MAX_NEW_TOKENS`` new tokens.

    Args:
        src_batch: list of source sentences (``None`` entries are treated as "").
        src_code: source language code used in the system prompt.
        tgt_code: target language code used in the system prompt.

    Returns:
        One stripped hypothesis string per input sentence, in input order.
    """
    if not src_batch:
        return []

    sys_msg = SYSTEM_TMPL.format(
        src_name=lang_name(src_code), tgt_name=lang_name(tgt_code),
        src_code=src_code, tgt_code=tgt_code,
    )
    messages_list = [
        [
            {"role": "system", "content": sys_msg},
            {"role": "user",   "content": (s or "").strip()},
        ]
        for s in src_batch
    ]

    # Left padding (and left truncation) so every prompt ends right where
    # generation starts; the tokenizer settings are restored afterwards.
    prev_pad, prev_trunc = tokenizer.padding_side, getattr(tokenizer, "truncation_side", "right")
    tokenizer.padding_side = "left"
    tokenizer.truncation_side = "left"

    try:
        enc = tokenizer.apply_chat_template(
            messages_list,
            tokenize=True,
            add_generation_prompt=True,
            padding=True,
            truncation=True,
            max_length=MAX_SEQ,
            return_tensors="pt",
            # Without return_dict=True only the input_ids tensor is returned:
            # `**enc` and enc["input_ids"] below would fail, and no attention
            # mask would be passed for the left-padded batch.
            return_dict=True,
        ).to(device)

        outputs = model.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=DO_SAMPLE,
            use_cache=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    finally:
        tokenizer.padding_side = prev_pad
        tokenizer.truncation_side = prev_trunc

    # Decode only the newly generated tokens (everything after the padded prompt).
    cut = enc["input_ids"].size(1)
    decoded = tokenizer.batch_decode(outputs[:, cut:], skip_special_tokens=True)
    return [text.strip() for text in decoded]

# Translate the selected sources in chunks of GEN_BATCH_SIZE, timing the whole pass.
t0 = time.time()
hyps = []
for i in tqdm(range(0, len(sources), GEN_BATCH_SIZE), desc="Generating"):
    hyps.extend(generate_batch_chatml(sources[i:i+GEN_BATCH_SIZE]))
t1 = time.time()
# Every reference must have exactly one hypothesis before scoring.
assert len(hyps) == len(refs)
print(f"\nGeneration: {len(hyps)} examples in {t1-t0:.1f}s")

# ----------------------- COMET (wmt22-da) -----------------------
# Download and load the COMET checkpoint, then build one record per segment.
model_path  = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(model_path)
data = [{"src": s, "mt": h, "ref": r} for s, h, r in zip(sources, hyps, refs)]

# Free VRAM before scoring. Moving the generation model to the CPU is
# best-effort: the move can fail for some model types (presumably
# bitsandbytes-quantized weights — TODO confirm), and scoring can still proceed.
# `enc` and `outputs` are locals of generate_batch_chatml, so there is nothing
# to delete at notebook level (the old `del enc, outputs` raised NameError).
try:
    model.to("cpu")
except Exception as exc:
    print(f"Could not move the generation model to CPU ({exc!r}); continuing.")
import gc; gc.collect(); torch.cuda.empty_cache()

t0 = time.time()
res = comet_model.predict(
    data,
    batch_size=COMET_BATCH_SIZE,
    gpus=(1 if torch.cuda.is_available() else 0),
)
t1 = time.time()

# Normalize output: accept both a (segment_scores, system_score) tuple and a
# mapping-style result, trying the known key spellings in turn.
if isinstance(res, tuple):
    seg_scores, sys_score = res
else:
    seg_scores = res.get("segments_scores") or res.get("segment_scores") or res.get("scores")
    sys_score  = res.get("system_score")   or res.get("score") or res.get("mean")

def to_float_safe(x):
    """Convert ``x`` to float, returning NaN when it is missing or not numeric."""
    try:
        return float(x)
    except (TypeError, ValueError):
        return float("nan")

sys_score_f = to_float_safe(sys_score)
print(f"COMET (wmt22-da) system: {sys_score_f:.4f} | {len(data)} segs in {t1-t0:.1f}s")
print("First 5 seg scores:", (seg_scores[:5] if isinstance(seg_scores, list) else seg_scores))

# ----------------------- Save outputs -----------------------
import json, time, numpy as np, os
# One output folder per run, named after the decoding setup plus a timestamp.
RUN_STAMP = time.strftime("%Y%m%d-%H%M%S")
DECODE_MODE = "greedy" if not DO_SAMPLE else "sample"
TAG = f"{SRC}-{TGT}_{SPLIT}_b{GEN_BATCH_SIZE}_t{MAX_NEW_TOKENS}_{DECODE_MODE}"
OUT_DIR = os.path.join(BASE_DIR, f"{TAG}_{RUN_STAMP}")
os.makedirs(OUT_DIR, exist_ok=True)

# Per-segment table: source, hypothesis, reference and COMET score.
df = pd.DataFrame({
    "src": sources,
    "hyp": hyps,
    "ref": refs,
    "comet": np.asarray(seg_scores, dtype=np.float32)[:len(hyps)],
})
csv_path = os.path.join(OUT_DIR, "segments.csv")
df.to_csv(csv_path, index=False)

# Run-level summary: system score, decoding settings and where the CSV lives.
summary = {
    "split": SPLIT,
    "n_examples": len(hyps),
    "system_score_comet_da": float(sys_score_f),
    "decode": {
        "mode": DECODE_MODE,
        "max_new_tokens": int(MAX_NEW_TOKENS),
        "gen_batch_size": int(GEN_BATCH_SIZE),
        "do_sample": bool(DO_SAMPLE),
    },
    "model": {"hf_id": globals().get("MODEL_ID", None)},
    "paths": {"segments_csv": csv_path},
    "timestamp": RUN_STAMP,
}
# ensure_ascii=False can emit non-ASCII characters, so the file encoding is
# fixed to UTF-8 explicitly (as already done for hypotheses.txt below).
with open(os.path.join(OUT_DIR, "summary.json"), "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

# One hypothesis per line; embedded newlines are flattened to spaces to keep
# the file line-aligned with the references.
with open(os.path.join(OUT_DIR, "hypotheses.txt"), "w", encoding="utf-8") as f:
    for h in hyps: f.write(h.replace("\n", " ").strip() + "\n")

print("Saved in:", OUT_DIR)
print("COMET:", round(sys_score_f, 4))
