<a href="https://colab.research.google.com/github/GiacomoT23/Euro-MachineTranslation-SFT-and-RL/blob/main/de_en_SFT_gh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model download

The model selected for fine-tuning is Qwen2.5 1.5B (base). It was chosen because its size fits the available compute resources and because its pretraining data covers the selected languages (German and English), among many others.

In [None]:
%%capture
!pip install --no-input --upgrade pip
!pip install --no-input unsloth bitsandbytes accelerate peft "trl>=0.9.0" sentencepiece protobuf hf_transfer
!pip install --no-input "transformers==4.55.4"

from unsloth import FastLanguageModel
import torch, os

USE_4BIT = True
USE_GC   = "unsloth"
MAX_SEQ  = 256
MODEL_ID = "unsloth/Qwen2.5-1.5B"

dtype = None if USE_4BIT else torch.float16
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = MODEL_ID,
    max_seq_length = MAX_SEQ,
    dtype          = dtype,
    load_in_4bit   = USE_4BIT,
)

# LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = USE_GC,
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model: {MODEL_ID} | 4bit={USE_4BIT} | GC={USE_GC} | seq={MAX_SEQ}")
print(f"Trainable params (LoRA+heads): ~{n_params:,}")


# Loading splits
Load the preprocessed dataset splits that were saved to Google Drive.

In [None]:
# === Load the filtered dataset from Google Drive ===
from google.colab import drive
from datasets import load_from_disk, DatasetDict

drive.mount("/content/drive")

SAVE_DIR = "/content/drive/MyDrive/mt_datasets/wmt14_de_en_proc_100k_len203_sim082_lid099"
ds_proc = load_from_disk(SAVE_DIR)

# Look the three splits up by name with .get(), in the order train/validation/test.
train_filt, val_filt, test_filt = (
    ds_proc.get(split_name) for split_name in ("train", "validation", "test")
)

# Row count per split, as a quick sanity check.
print({name: len(split) for name, split in ds_proc.items()})


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'train': 62240, 'validation': 3000, 'test': 3003}


# Formatting for SFT in unsloth

In [None]:
from datasets import DatasetDict
import unicodedata

# Language codes of the translation direction (source -> target); they are also
# the keys read from each `translation` dict in ensure_src_tgt_cols.
SRC, TGT = "de", "en"

def nfkc(s: str) -> str:
    """Return the NFKC-normalized form of `s`; falsy input (None, "") gives ""."""
    text = s if s else ""
    return unicodedata.normalize("NFKC", text)

def ensure_src_tgt_cols(dset):
    """Return `dset` with flat `src_txt` / `tgt_txt` text columns.

    Args:
        dset: A split exposing `column_names` and `map`, or None when the
            split is missing.

    Returns:
        None if `dset` is None; `dset` itself if both columns already exist
        (no NFKC pass is applied in that case); otherwise the result of
        `dset.map`, which adds NFKC-normalized `src_txt` / `tgt_txt` built
        from the SRC / TGT entries of the `translation` column.

    Raises:
        KeyError: if the split has neither the flat columns nor `translation`.
    """
    # A missing split is passed through instead of crashing on `.column_names`,
    # matching how add_lang_cols and the later DatasetDict construction treat None.
    if dset is None:
        return None

    cols = set(dset.column_names)
    if {"src_txt","tgt_txt"}.issubset(cols):
        return dset
    elif "translation" in cols:
        def to_cols(batch):
            # Missing or None language entries end up as "" via nfkc.
            src = [nfkc(x.get(SRC,"")) for x in batch["translation"]]
            tgt = [nfkc(x.get(TGT,"")) for x in batch["translation"]]
            return {"src_txt": src, "tgt_txt": tgt}
        return dset.map(
            to_cols,
            batched=True,
            batch_size=4096,
            desc="[normalize] add src_txt/tgt_txt from translation",
        )
    else:
        raise KeyError("Lo split non ha né src_txt/tgt_txt né translation.")

# Give every loaded split flat src_txt/tgt_txt columns.
# NOTE(review): test_norm is not referenced again in the cells visible here.
train_norm = ensure_src_tgt_cols(train_filt)
val_norm   = ensure_src_tgt_cols(val_filt)
test_norm = ensure_src_tgt_cols(test_filt)

# Human-readable names for the language codes used in the prompts.
LANG_NAME = {"de": "German", "en": "English"}

def lang_name(code: str) -> str:
    """Map a language code to its display name (case-insensitive lookup).

    Unknown codes are returned upper-cased.
    """
    key = code.lower()
    if key in LANG_NAME:
        return LANG_NAME[key]
    return code.upper()

# Training template: instruction, source sentence, then the reference
# translation ({tgt}) right after the "Translation:" line.
MT_PROMPT = """Translate the following sentence from {src_name} to {tgt_name}.
Source ({src_code}):
{src}
Translation:
{tgt}"""

# Generation template: same prompt as MT_PROMPT but ending right after
# "Translation:\n", with no {tgt} placeholder.
# NOTE(review): not used in the cells visible here.
GEN_PROMPT = """Translate the following sentence from {src_name} to {tgt_name}.
Source ({src_code}):
{src}
Translation:
"""

# End-of-sequence marker appended to every formatted training text; falls back
# to "" when the tokenizer defines no EOS token.
EOS = tokenizer.eos_token or ""

def add_lang_cols(dset, src_code=SRC, tgt_code=TGT):
    """Attach constant `src_lang` / `tgt_lang` columns to every row of `dset`.

    Returns None unchanged when `dset` is None (split not available).
    """
    if dset is None:
        return None

    def _with_langs(ex):
        # Same value for every example: the split is single-direction.
        return {"src_lang": src_code, "tgt_lang": tgt_code}

    return dset.map(_with_langs, desc="Adding language columns")

# Tag the train and validation splits with the default SRC/TGT language codes;
# format_for_sft reads these columns when filling the prompt.
train_with_lang = add_lang_cols(train_norm)
val_with_lang   = add_lang_cols(val_norm)

def _clean(s: str) -> str:
    return (s or "").strip()

def format_for_sft(batch):
    """Render a batch of sentence pairs into SFT training strings.

    Reads the `src_txt`, `tgt_txt`, `src_lang` and `tgt_lang` columns of the
    batch and returns {"text": [...]} with one entry per row: MT_PROMPT filled
    in and followed by EOS, or "" when either side is empty after stripping
    (those rows are dropped by the caller's filter).
    """
    def _render(src, tgt, src_code, tgt_code):
        src_clean, tgt_clean = _clean(src), _clean(tgt)
        if not (src_clean and tgt_clean):
            return ""
        prompt = MT_PROMPT.format(
            src_name = lang_name(src_code),
            tgt_name = lang_name(tgt_code),
            src_code = src_code,
            src = src_clean,
            tgt = tgt_clean,
        )
        return prompt + EOS

    rows = zip(batch["src_txt"], batch["tgt_txt"], batch["src_lang"], batch["tgt_lang"])
    return {"text": [_render(*row) for row in rows]}

def _format_split(split, desc):
    """Turn one split into a single-column `text` dataset, dropping empty rows."""
    formatted = split.map(
        format_for_sft, batched=True, remove_columns=split.column_names,
        desc=desc
    )
    return formatted.filter(lambda ex: len(ex["text"]) > 0)

# Keep only the splits that actually exist.
raw_small = DatasetDict({
    name: split
    for name, split in {
        "train": train_with_lang,
        "validation": val_with_lang,
    }.items()
    if split is not None
})

train_ds = _format_split(raw_small["train"], "[format] train")

# Validation is optional; val_ds stays None when the split is absent.
val_ds = _format_split(raw_small["validation"], "[format] val") if "validation" in raw_small else None

print("Train/Val sizes:", len(train_ds), (len(val_ds) if val_ds is not None else 0))


[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/3000 [00:00<?, ? examples/s]

[normalize] add src_txt/tgt_txt from translation:   0%|          | 0/3003 [00:00<?, ? examples/s]

Adding language columns:   0%|          | 0/62240 [00:00<?, ? examples/s]

Adding language columns:   0%|          | 0/3000 [00:00<?, ? examples/s]

[format] train:   0%|          | 0/62240 [00:00<?, ? examples/s]

Filter:   0%|          | 0/62240 [00:00<?, ? examples/s]

[format] val:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Train/Val sizes: 62240 3000


# SFT

In [None]:
# === SFT with TRL + W&B (Qwen2.5-1.5B, QLoRA) — early stopping, best model, resume ===
import os, time, wandb, torch
from trl import SFTTrainer, SFTConfig
from transformers import EarlyStoppingCallback
from transformers.trainer_utils import get_last_checkpoint

# Special tokens: reuse EOS as PAD when the tokenizer has no pad token, and pad
# on the right.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ---- Run configuration ----
OUTPUT_DIR = "outputs_sft"
RUN_NAME   = "qwen25-1p5b-wmt14-de-en-sft"

# MAX_SEQ is intentionally NOT redefined here: the value from the model-loading
# cell is reused so that the loader and the trainer cannot drift apart.
PER_DEV_BS = 64
GRAD_ACCUM = 1

USE_EPOCHS = True
NUM_TRAIN_EPOCHS = 2
MAX_STEPS  = -1           # ignored if USE_EPOCHS=True

# Evaluation, best-model tracking and early stopping all need a validation set.
# Without one they must be switched off together: transformers rejects
# load_best_model_at_end=True when the eval and save strategies do not match,
# and EarlyStoppingCallback needs an eval metric.
HAS_VAL = val_ds is not None

# W&B
os.environ["WANDB_PROJECT"]   = "euromt"
os.environ["WANDB_NAME"]      = RUN_NAME   # same name as passed to wandb.init below
os.environ["WANDB_WATCH"]     = "false"
os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["WANDB_SILENT"]    = "true"
wandb.login()
wandb.init(
    project="euromt",
    name=RUN_NAME,
    config={
        "model": MODEL_ID,
        "max_seq_len": MAX_SEQ,
        "use_4bit": USE_4BIT,
        "lora_r": 8,      # keep in sync with r= in FastLanguageModel.get_peft_model
        "train_rows": len(train_ds),
    },
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    dataset_text_field="text",
    # NOTE(review): in upstream TRL `response_template` belongs to
    # DataCollatorForCompletionOnlyLM, not to SFTTrainer. Verify that the
    # trainer in use really masks the prompt tokens; if the argument is
    # silently ignored, the loss covers prompt + translation.
    response_template="Translation:\n",
    max_seq_length=MAX_SEQ,
    packing=False,
    args=SFTConfig(
        output_dir=OUTPUT_DIR,
        report_to=["wandb"],

        # Batch & steps
        per_device_train_batch_size=PER_DEV_BS,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=(NUM_TRAIN_EPOCHS if USE_EPOCHS else 1),
        max_steps=(MAX_STEPS if not USE_EPOCHS else -1),

        # Opt & sched (stable)
        learning_rate=5e-5,
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        weight_decay=0.0,
        #label_smoothing_factor=0.1,
        max_grad_norm=1.0,
        optim="adamw_8bit",

        # Dataloader
        #group_by_length=True,
        dataloader_num_workers=2,
        dataloader_pin_memory=True,

        # Log, eval, save (eval and save share the same step interval)
        logging_steps=20,
        eval_strategy="steps" if HAS_VAL else "no",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=3,

        # Early stopping + best model (only meaningful with a validation set)
        metric_for_best_model="eval_loss" if HAS_VAL else None,
        greater_is_better=False if HAS_VAL else None,
        load_best_model_at_end=HAS_VAL,

        seed=3407,
    ),
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.0
        )
    ] if HAS_VAL else None,
)

print("Train/Val sizes:", len(train_ds), (0 if val_ds is None else len(val_ds)))

# Resume automatically if OUTPUT_DIR already holds a checkpoint; otherwise
# start from scratch. The directory check avoids get_last_checkpoint failing
# on a path that has not been created yet.
last_ckpt = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
resume = last_ckpt if last_ckpt is not None else False
trainer.train(resume_from_checkpoint=resume)


Train/Val sizes: 62240 3000


Step,Training Loss,Validation Loss
200,1.7853,1.842218
400,1.7437,1.832611
600,1.7465,1.83013
800,1.736,1.828865
1000,1.7163,1.828089
1200,1.7185,1.826324
1400,1.7063,1.827058
1600,1.6895,1.826589
1800,1.7138,1.827271


TrainOutput(global_step=1800, training_loss=1.7281786198086209, metrics={'train_runtime': 17314.1795, 'train_samples_per_second': 7.189, 'train_steps_per_second': 0.112, 'total_flos': 2.334298864783196e+17, 'train_loss': 1.7281786198086209, 'epoch': 1.8499486125385407})

# Saving best checkpoint

In [None]:
# === Save on Google Drive the BEST (or LAST) checkpoint ===
from google.colab import drive
drive.mount("/content/drive")

import os, shutil, glob, time

OUT_DIR = "outputs_sft"  # must match SFTConfig.output_dir

def pick_checkpoint(out_dir):
    """Choose which checkpoint directory to export.

    Preference order:
      1. the best checkpoint recorded in `trainer.state`, if that directory
         exists on disk;
      2. otherwise the most recently modified `checkpoint-*` entry in `out_dir`.

    Returns:
        (path, "best") or (path, "last"); (None, None) when nothing is found.
    """
    best_dir = getattr(trainer.state, "best_model_checkpoint", None)
    if best_dir and os.path.isdir(best_dir):
        return best_dir, "best"

    candidates = glob.glob(os.path.join(out_dir, "checkpoint-*"))
    if not candidates:
        return None, None
    # Oldest first, so the last element is the newest checkpoint.
    candidates.sort(key=os.path.getmtime)
    return candidates[-1], "last"

# Resolve the checkpoint to export and stop early if none exists.
ckpt_path, kind = pick_checkpoint(OUT_DIR)
assert ckpt_path is not None, f"No checkpoint found in: {OUT_DIR}"

# A timestamp in the folder name gives each export its own destination directory.
stamp = time.strftime("%Y%m%d_%H%M%S")
GDRIVE_DIR = f"/content/drive/MyDrive/mt_checkpoints/qwen25_sft_{kind}_{stamp}"

# Copy the whole checkpoint directory, then save the tokenizer files into the
# same destination folder.
shutil.copytree(ckpt_path, GDRIVE_DIR)
tokenizer.save_pretrained(GDRIVE_DIR)

print(f"✔️ Copiato {kind.upper()} checkpoint in:\n{GDRIVE_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✔️ Copiato BEST checkpoint in:
/content/drive/MyDrive/mt_checkpoints/qwen25_sft_best_20250909_000610
