In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/TempoRun2025/dataset.zip


Archive:  /content/drive/MyDrive/TempoRun2025/dataset.zip
   creating: dataset/
   creating: dataset/public/
  inflating: dataset/public/105a4de8.json  
  inflating: dataset/public/1a150873.json  
  inflating: dataset/public/c5087e0f.json  
  inflating: dataset/public/dce587aa.json  
  inflating: dataset/public/d6110501.json  
  inflating: dataset/public/eceac1a3.json  
  inflating: dataset/public/27d7feb8.json  
  inflating: dataset/public/0dc43831.json  
  inflating: dataset/public/669891d0.json  
  inflating: dataset/public/9c4ba5ca.json  
  inflating: dataset/public/18539150.json  
  inflating: dataset/public/471aaf0c.json  
  inflating: dataset/public/7b90e8a1.json  
  inflating: dataset/public/9ada8586.json  
  inflating: dataset/public/af40fa87.json  
  inflating: dataset/public/2edcf272.json  
  inflating: dataset/public/7f4e9e97.json  
  inflating: dataset/public/73743c6c.json  
  inflating: dataset/public/fa3593f3.json  
  inflating: dataset/public/a2aa2f73.json  
  inflating

In [3]:
!pip install transformers datasets accelerate peft bitsandbytes trl
!pip install sentencepiece protobuf

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.47.0 trl-0.23.0


In [30]:
# -*- coding: utf-8 -*-
"""
Unified evaluator for Qwen2.5 MCQ SFT (baseline / LoRA)
- Colab-safe (no flash-attn required)
- Supports:
    --model_type baseline | lora
    --mode logits | generate
- Batch inference, JSONL details, accuracy, confusion matrix
"""

import os
import re
import json
import glob
import time
import math
from collections import Counter, defaultdict

import torch
from tqdm import tqdm
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

try:
    from peft import PeftModel
    PEFT_AVAILABLE = True
except ImportError:
    PEFT_AVAILABLE = False


# ---------------------------
# Defaults (edit as needed)
# ---------------------------
DEFAULT_BASE = "Qwen/Qwen2.5-0.5B-Instruct"


def safe_attention_impl():
    """Always return an attention implementation that works on Colab free tier."""
    return "eager"


def build_user_instruction(title, content, question, choices_dict):
    # deterministic A-D order if present
    ordered = []
    for k in ["A", "B", "C", "D"]:
        if k in choices_dict:
            ordered.append(f"{k}: {choices_dict[k]}")
    if not ordered:
        ordered = [f"{k}: {v}" for k, v in choices_dict.items()]

    choices_text = "\n".join(ordered)

    instruction = (
        "Bạn là hệ thống trả lời trắc nghiệm. Hãy đọc văn bản và câu hỏi, "
        "chỉ chọn **một đáp án duy nhất** từ A/B/C/D, không giải thích, không thêm nội dung khác.\n\n"
        f"Văn bản:\nTiêu đề: {title}\n\nNội dung: {content}\n\n"
        f"Câu hỏi:\n{question}\n\n"
        f"Các lựa chọn:\n{choices_text}\n\n"
        "Chỉ trả lời đúng 1 ký tự: A, B, C hoặc D."
    )
    return instruction


def load_json_files(dataset_path):
    files = glob.glob(os.path.join(dataset_path, "*.json"))
    print(f"Found {len(files)} JSON files in '{dataset_path}'")
    items = []
    for p in files:
        try:
            with open(p, "r", encoding="utf-8") as f:
                items.append(json.load(f))
        except Exception as e:
            print(f"Error reading {p}: {e}")
    return items


def prepare_eval_rows(dataset_path):
    """Return a list of dicts: {instruction, label} for MCQ."""
    data_list = load_json_files(dataset_path)
    rows = []
    dropped = 0
    for data in data_list:
        content = data.get("content:") or data.get("content", "")
        title = data.get("title:") or data.get("title", "")
        questions = data.get("questions", [])
        if not content:
            dropped += 1
            continue

        for q in questions:
            question = (q.get("question", "") or "").strip()
            choices = q.get("choices", {}) or {}
            gt = (q.get("correct_answer", "") or "").strip().upper()[:1]
            if gt not in ["A", "B", "C", "D"]:
                continue
            instruction = build_user_instruction(title, content, question, choices)
            rows.append({"instruction": instruction, "label": gt})
    print(f"Prepared {len(rows)} eval rows (dropped {dropped} docs without content).")
    return rows


def load_model_and_tokenizer(
    model_type: str,
    base_model_name: str,
    adapter_path: str = None,
    use_4bit: bool = True,
    bf16: bool = torch.cuda.is_bf16_supported(),
):
    """
    model_type: 'baseline' | 'lora'
    baseline: load base model (optionally 4bit)
    lora:     load base (4bit) + attach LoRA from adapter_path
    """
    if model_type not in ["baseline", "lora"]:
        raise ValueError(f"model_type must be 'baseline' or 'lora', but got '{model_type}'")

    attn_impl = safe_attention_impl()

    # baseline or lora: use 4-bit by default to save VRAM
    bnb = BitsAndBytesConfig(
        load_in_4bit=bool(use_4bit),
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if bf16 else torch.float16,
        bnb_4bit_use_double_quant=True,
    ) if use_4bit else None

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16 if bf16 else torch.float16,
        attn_implementation=attn_impl,
        trust_remote_code=True,
        quantization_config=bnb if use_4bit else None,
    )
    tok_src = base_model_name

    if model_type == "lora":
        if not PEFT_AVAILABLE:
            raise RuntimeError("peft is not installed. Please run: pip install peft")
        if not adapter_path:
            raise ValueError("--adapter_path is required for model_type=lora")
        print(f"Loading LoRA adapter from: {adapter_path}")
        model = PeftModel.from_pretrained(model, adapter_path)

    tokenizer = AutoTokenizer.from_pretrained(tok_src, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("Loaded model config from:", getattr(model.config, "_name_or_path", None))

    # If using LoRA, print details
    if model_type == 'lora' and isinstance(model, PeftModel):
        print("PEFT adapter attached. Active adapters:", getattr(model, "active_adapters", None))
        model.print_trainable_parameters()

    model.eval()
    return model, tokenizer


def to_chat_prompt(tokenizer, instruction: str):
    messages = [
        {"role": "system", "content": "Bạn là hệ thống trả lời trắc nghiệm. Chỉ xuất duy nhất 1 ký tự A/B/C/D."},
        {"role": "user", "content": instruction},
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )


def parse_generated(text: str):
    # Robust A-D extraction
    m = re.search(r"(?:Đáp án|Trả lời|Câu\s*trả\s*lời)\s*:?\s*([A-D])", text, flags=re.IGNORECASE)
    if m: return m.group(1).upper()
    m = re.search(r"\b([A-D])[:.]?\b", text) # Make colon/period optional
    if m: return m.group(1).upper()
    m = re.search(r"\b([A-D])\b", text)
    if m: return m.group(1).upper()
    return None


def get_letter_token_ids(tokenizer):
    """Map letters to token IDs; if any letter is not single-token, return None to disable logits mode."""
    letter_ids = {}
    for ch in ["A", "B", "C", "D"]:
        ids = tokenizer(ch, add_special_tokens=False).input_ids
        if len(ids) != 1:
            print(f"Warning: Letter '{ch}' is not a single token. Logits mode might be inaccurate. Fallback to generate recommended.")
            return None
        letter_ids[ch] = ids[0]
    return letter_ids


def eval_logits_mode(model, tokenizer, prompts, labels, max_length):
    """
    One forward pass: pick next-token argmax over {A,B,C,D}.
    Fast & stable if letters are single-token.
    """
    letter_map = get_letter_token_ids(tokenizer)
    if letter_map is None:
        return None  # signal to caller to fall back to generate

    # Tokenize batch
    enc = tokenizer(
        prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length
    ).to(model.device)

    with torch.no_grad():
        out = model(**enc)
        # logits shape: [B, T, V]; focus on last valid token per item
        logits = out.logits
        # gather last non-pad position for each sequence
        last_pos = enc.attention_mask.sum(dim=1) - 1
        batch_idx = torch.arange(logits.size(0), device=logits.device)
        next_logits = logits[batch_idx, last_pos, :]

        # score only {A,B,C,D}
        cand_ids = torch.tensor(list(letter_map.values()), device=logits.device)
        cand_logits = next_logits[:, cand_ids]
        pred_idx = cand_logits.argmax(dim=1)

        idx2letter = list(letter_map.keys())
        preds = [idx2letter[i.item()] for i in pred_idx]

    details = []
    correct = 0
    for p, g in zip(preds, labels):
        ok = (p == g)
        correct += int(ok)
        details.append({"pred": p, "label": g, "is_correct": ok})
    return details, correct


def eval_generate_mode(model, tokenizer, prompts, labels, max_length):
    """Greedy short generation; parse first A–D found."""
    inputs = tokenizer(
        prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=4, # A bit more generous for safety
            temperature=0.0,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    gens = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    details = []
    correct = 0
    for g, lab in zip(gens, labels):
        pred = parse_generated(g) or "E"  # Use 'E' (Error) for fallback
        ok = (pred == lab)
        correct += int(ok)
        details.append({"pred": pred, "label": lab, "raw": g.strip(), "is_correct": ok})
    return details, correct


def run_eval(
    model,
    tokenizer,
    rows,
    batch_size=16,
    max_length=4096,
    mode="logits",
    out_jsonl="eval_details.jsonl",
):
    total = len(rows)
    correct = 0
    all_details = []
    start = time.time()

    # Build prompts in advance (chat template)
    prompts = [to_chat_prompt(tokenizer, r["instruction"]) for r in rows]
    labels = [r["label"] for r in rows]

    # Iterate in batches
    for i in tqdm(range(0, total, batch_size), desc="Evaluating"):
        ps = prompts[i:i+batch_size]
        ls = labels[i:i+batch_size]
        details, corr = None, 0

        if mode == "logits":
            res = eval_logits_mode(model, tokenizer, ps, ls, max_length)
            if res is None:
                print("Logits mode failed (multi-token letters), falling back to generate mode.")
                mode = "generate" # Switch mode for subsequent batches

        if mode == "generate":
             res = eval_generate_mode(model, tokenizer, ps, ls, max_length)

        details, corr = res
        all_details.extend(details)
        correct += corr

    elapsed = time.time() - start
    acc = 100.0 * correct / total if total else 0.0

    # Compute confusion & distribution
    cm = defaultdict(lambda: Counter())
    dist = Counter()
    for d in all_details:
        dist[d["pred"]] += 1
        cm[d["label"]][d["pred"]] += 1

    # Save JSONL (one line per item)
    with open(out_jsonl, "w", encoding="utf-8") as f:
        for d in all_details:
            f.write(json.dumps(d, ensure_ascii=False) + "\n")

    # Pretty print summary
    print("\n=== EVAL SUMMARY ===")
    print(f"Total items: {total}")
    print(f"Correct:     {correct}")
    print(f"Accuracy:    {acc:.2f}%")
    print(f"Time:        {elapsed:.2f}s")
    print(f"Items/sec:   {total/elapsed:.2f}")
    print(f"Output saved to: {out_jsonl}")
    print("\nPredicted distribution:", dict(dist))
    print("\nConfusion matrix (rows=label, cols=pred):")
    letters = sorted(list(set(cm.keys()) | set(dist.keys())))
    header = "     " + "  ".join([f"{c:>5}" for c in letters])
    print(header)
    for lab in letters:
        if lab not in cm: continue
        row = [cm[lab][p] for p in letters]
        print(f"{lab:>3}: " + "  ".join([f"{n:>5}" for n in row]))

    return {
        "total": total,
        "correct": correct,
        "accuracy": acc,
        "time_sec": elapsed,
        "items_per_sec": total/elapsed if elapsed > 0 else 0.0,
        "pred_dist": dict(dist),
        "confusion": {lab: dict(cm[lab]) for lab in cm},
    }


# ===================================================================================
# MODIFIED FOR COLAB: Direct function call instead of argparse
# ===================================================================================

def run_colab_evaluation(
    eval_dataset_path: str,
    model_type: str = "baseline",
    base_model: str = DEFAULT_BASE,
    adapter_path: str = None,
    batch_size: int = 16,
    max_length: int = 1024,
    mode: str = "logits",
    use_4bit: bool = True,
    output_jsonl: str = "eval_details.jsonl"
):
    """
    Main function to run the entire evaluation pipeline for baseline or LoRA models.
    Call this function directly from your Colab notebook.
    """
    # Prepare data
    rows = prepare_eval_rows(eval_dataset_path)
    if not rows:
        print("No eval data found. Exiting.")
        return

    # Load model/tokenizer
    bf16 = torch.cuda.is_bf16_supported()
    model, tokenizer = load_model_and_tokenizer(
        model_type=model_type,
        base_model_name=base_model,
        adapter_path=adapter_path,
        use_4bit=use_4bit,
        bf16=bf16,
    )

    # Run eval
    run_eval(
        model=model,
        tokenizer=tokenizer,
        rows=rows,
        batch_size=batch_size,
        max_length=max_length,
        mode=mode,
        out_jsonl=output_jsonl,
    )


# ===================================================================================
# EXAMPLE USAGE IN COLAB
# ===================================================================================
if __name__ == "__main__":
    # This block will run if you execute the script directly.
    # In a Colab notebook, you can copy, paste, and modify these lines into a cell.

    # --- STEP 1: UPLOAD YOUR DATA ---
    # Make sure your evaluation JSON files are in a folder.
    # For example, create a folder named 'my_eval_data' in your Colab environment
    # and upload your files there.

    # --- STEP 2: CONFIGURE AND RUN ---
    # Choose the configuration that matches your model.

    # === EXAMPLE 1: Evaluating the baseline Qwen2.5 model ===
    print("--- Running evaluation for BASELINE model ---")
    run_colab_evaluation(
        eval_dataset_path="/content/dataset/public",  # <-- IMPORTANT: Change to your data folder
        model_type="baseline",
        base_model="Qwen/Qwen2.5-0.5B-Instruct",
        use_4bit=True, # Use 4-bit quantization to save memory
        batch_size=8,
        mode="logits", # 'logits' is faster, 'generate' is more robust
        output_jsonl="baseline_eval_details.jsonl"
    )

    # === EXAMPLE 2: Evaluating a LoRA model ===
    # print("\n\n--- Running evaluation for LoRA model ---")
    # run_colab_evaluation(
    #     eval_dataset_path="/content/dataset/public",    # <-- IMPORTANT: Change to your data folder
    #     model_type="lora",
    #     base_model="Qwen/Qwen2.5-0.5B-Instruct", # The base model used for training the LoRA
    #     adapter_path="/content/qwen_legal_lora_sft", # <-- IMPORTANT: Change to your LoRA adapter folder
    #     use_4bit=True,
    #     batch_size=8,
    #     mode="logits",
    #     output_jsonl="lora_eval_details.jsonl"
    # )

    print("\nScript loaded. To run an evaluation in a Colab notebook,")
    print("copy one of the example blocks above into a new cell,")
    print("modify the paths, and run the cell.")



--- Running evaluation for MERGED model ---
Found 200 JSON files in '/content/dataset/public'
Prepared 597 eval rows (dropped 1 docs without content).


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [8]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
import re
import glob
import json
import random
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training,
)
from trl import SFTTrainer, SFTConfig
from sklearn.model_selection import train_test_split

# -----------------------------
# Device / backend preferences
# -----------------------------
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# -----------------------------
# Configuration
# -----------------------------
class Config:
    # Base model
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"

    # I/O
    dataset_path = "/content/dataset/train"
    output_dir = "./qwen_legal_lora_sft"

    # Sequence & batching
    max_seq_length = 1024          # raise if VRAM allows; 1024 if tight
    per_device_train_batch_size = 2
    gradient_accumulation_steps = 4

    # Steps / schedule
    max_steps = 70                # quick run; raise for better quality
    warmup_ratio = 0.06
    learning_rate = 2e-4
    weight_decay = 0.01
    lr_scheduler_type = "cosine"
    logging_steps = 10
    max_grad_norm = 0.3
    seed = 3407

    # Precision
    fp16 = not torch.cuda.is_bf16_supported()
    bf16 = torch.cuda.is_bf16_supported()

    # Data split
    test_size = 0.1

    # LoRA
    lora_r = 32
    lora_alpha = 16
    lora_dropout = 0.05
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ]

    # Optimizer (bitsandbytes)
    optim = "adamw_bnb_8bit"

random.seed(Config.seed)

# -----------------------------
# Data loading & preprocessing
# -----------------------------
def load_json_files(dataset_path):
    """Load all JSON files from the dataset directory."""
    json_files = glob.glob(os.path.join(dataset_path, "*.json"))
    print(f"Found {len(json_files)} JSON files")

    all_data = []
    for file_path in json_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
                all_data.append(data)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    return all_data

def build_text_sample(tokenizer, instruction, correct_answer):
    """
    Create a single training string using the tokenizer's chat template.
    We supervise on a single letter A/B/C/D (+ EOS).
    """
    # System prompt: keep it short and strict
    system_msg = "Bạn là hệ thống trả lời trắc nghiệm. Chỉ xuất duy nhất 1 ký tự A/B/C/D."

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": instruction},
        {"role": "assistant", "content": correct_answer.strip()[:1]},  # "A"/"B"/"C"/"D"
    ]
    # Returns the whole conversation with assistant turn included.
    # For SFT, that's perfect because TRL will compute loss on assistant tokens.
    chat_str = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    # Ensure EOS at the end for stable termination
    if not chat_str.endswith(tokenizer.eos_token or ""):
        chat_str += (tokenizer.eos_token or "")
    return chat_str

def create_training_samples(data_list, tokenizer):
    """Convert JSON data to training samples, handling multiple key formats."""
    samples = []
    dropped = 0

    for data in data_list:
        content = data.get("content:") or data.get("content", "")
        title = data.get("title:") or data.get("title", "")
        questions = data.get("questions", [])

        if not content:
            dropped += 1
            print("\n--- BỎ QUA: Thiếu content ---")
            problematic_data = {k: v for k, v in data.items() if k not in ["content", "content:"]}
            print(problematic_data)
            continue

        context = f"Tiêu đề: {title}\n\nNội dung: {content}"

        for q in questions:
            question = q.get("question", "").strip()
            choices = q.get("choices", {})
            correct_answer = (q.get("correct_answer", "") or "").strip()[:1].upper()

            if correct_answer not in ["A", "B", "C", "D"]:
                continue

            # Present choices in a deterministic order A-D if present
            ordered = []
            for key in ["A", "B", "C", "D"]:
                if key in choices:
                    ordered.append(f"{key}: {choices[key]}")
            choices_text = "\n".join(ordered) if ordered else "\n".join([f"{k}: {v}" for k, v in choices.items()])

            instruction = (
                "Bạn là hệ thống trả lời trắc nghiệm. Hãy đọc văn bản và câu hỏi, "
                "chỉ chọn **một đáp án duy nhất** từ A/B/C/D, không giải thích, không thêm nội dung khác.\n\n"
                f"Văn bản:\n{context}\n\n"
                f"Câu hỏi:\n{question}\n\n"
                f"Các lựa chọn:\n{choices_text}\n\n"
                "Chỉ trả lời đúng 1 ký tự: A, B, C hoặc D."
            )

            text = build_text_sample(tokenizer, instruction, correct_answer)
            samples.append({"text": text})

    print(f"Created {len(samples)} samples (dropped {dropped} docs without content).")
    return samples

def prepare_dataset(dataset_path, tokenizer, test_size=0.1):
    """Prepare dataset: load → build → dedup → split → HuggingFace Dataset."""
    data_list = load_json_files(dataset_path)
    print(f"Loaded {len(data_list)} documents")

    samples = create_training_samples(data_list, tokenizer)
    df = pd.DataFrame(samples).drop_duplicates(subset=["text"]).reset_index(drop=True)
    train_df, val_df = train_test_split(
        df, test_size=test_size, random_state=Config.seed, shuffle=True
    )
    train_ds = Dataset.from_pandas(train_df)
    val_ds = Dataset.from_pandas(val_df)
    print(f"Train: {len(train_ds)} | Validation: {len(val_ds)}")
    return {"train": train_ds, "eval": val_ds}

# -----------------------------
# Model / Tokenizer setup
# -----------------------------
def setup_model_and_tokenizer():
    """Setup model & tokenizer with 4-bit QLoRA and fast attention."""
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if Config.bf16 else torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(Config.model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Try FlashAttention-2 if available; silently fall back otherwise
    attn_impl = "eager"   # or just remove attn_implementation argument

    model = AutoModelForCausalLM.from_pretrained(
        Config.model_name,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16 if Config.bf16 else torch.float16,
        attn_implementation=attn_impl,
        trust_remote_code=True,
    )

    # Gradient checkpointing + input grads
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
    try:
        model.enable_input_require_grads()
    except Exception:
        pass

    return model, tokenizer

def setup_lora_config():
    """LoRA configuration tuned for small Qwen2.5."""
    return LoraConfig(
        r=Config.lora_r,
        lora_alpha=Config.lora_alpha,
        target_modules=Config.target_modules,
        lora_dropout=Config.lora_dropout,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

# -----------------------------
# Training
# -----------------------------
def train_model():
    print("Loading model & tokenizer...")
    model, tokenizer = setup_model_and_tokenizer()

    print("Preparing dataset...")
    dataset = prepare_dataset(Config.dataset_path, tokenizer, Config.test_size)

    print("Setting up LoRA...")
    lora_config = setup_lora_config()
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("Configuring SFT...")
    sft_config = SFTConfig(
        output_dir=Config.output_dir,
        per_device_train_batch_size=Config.per_device_train_batch_size,
        gradient_accumulation_steps=Config.gradient_accumulation_steps,
        max_steps=Config.max_steps,
        learning_rate=Config.learning_rate,
        fp16=Config.fp16,
        bf16=Config.bf16,
        logging_steps=Config.logging_steps,
        optim=Config.optim,                   # adamw_bnb_8bit
        weight_decay=Config.weight_decay,
        lr_scheduler_type=Config.lr_scheduler_type,  # cosine
        warmup_ratio=Config.warmup_ratio,
        max_grad_norm=Config.max_grad_norm,
        seed=Config.seed,
        dataset_text_field="text",
        packing=True,                         # big speedup for short turns
        save_safetensors=True,
        eval_strategy="steps",
        eval_steps=50,                        # quick feedback during short runs
        save_steps=100,
        logging_first_step=True,
    )

    print("Initializing trainer...")
    trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=dataset["train"],
        eval_dataset=dataset["eval"],
        processing_class=tokenizer,
    )

    print("Starting training...")
    trainer.train()

    print("Saving model & tokenizer...")
    trainer.save_model()
    tokenizer.save_pretrained(Config.output_dir)

    return tokenizer, model

# -----------------------------
# Main
# -----------------------------
if __name__ == "__main__":
    print("Ensure your dataset is uploaded to /content/dataset/train")
    tokenizer, model = train_model()


CUDA available: True
GPU: Tesla T4
Memory: 15.8 GB
Ensure your dataset is uploaded to /content/dataset/train
Loading model & tokenizer...
Preparing dataset...
Found 1500 JSON files
Loaded 1500 documents

--- BỎ QUA: Thiếu content ---
{'url': 'https://kenh14.vn/loai-cay-ty-do-giup-viet-nam-tro-thanh-ong-trum-xuat-khau-dung-thu-2-the-gioi-nang-suat-dan-dau-toan-cau-tinh-nao-trong-nhieu-nhat-215250130130240016.chn', 'title': '', 'metadata': [['https://kenh14cdn.com/203336854389633024/2025/1/30/cafe-thumb-1738216786511-17382167873341433530892.jpg', "Loại cây tỷ đô giúp Việt Nam trở thành 'ông trùm' xuất khẩu đứng thứ 2 thế giới: Năng suất dẫn đầu toàn cầu, tỉnh nào trồng nhiều nhất?- Ảnh 1."], ['https://kenh14cdn.com/203336854389633024/2025/1/30/cafe-1-1738216788125-17382167882951621571710.jpg', "Loại cây tỷ đô giúp Việt Nam trở thành 'ông trùm' xuất khẩu đứng thứ 2 thế giới: Năng suất dẫn đầu toàn cầu, tỉnh nào trồng nhiều nhất?- Ảnh 2."]]}

--- BỎ QUA: Thiếu content ---
{'url': 'https://

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


trainable params: 17,596,416 || all params: 511,629,184 || trainable%: 3.4393
Configuring SFT...
Initializing trainer...


Adding EOS to train dataset:   0%|          | 0/4041 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/4041 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/4041 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Starting training...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
50,2.5783,2.299219,2.194419,384856.0,0.54454


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Saving model & tokenizer...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Caching is incompatible with gradient checkpointing in Qwen2DecoderLayer. Setting `past_key_values=None`.



Testing the fine-tuned model (greedy single-letter)...


  return fn(*args, **kwargs)


RuntimeError: expected scalar type Float but found BFloat16

In [22]:
from google.colab import drive
drive.mount('/content/drive')

# !mkdir -p /content/drive/MyDrive/qwen_runs
# !cp -r ./qwen_legal_lora_sft /content/drive/MyDrive/qwen_runs/
!cp /content/lora_eval_details.jsonl /content/drive/MyDrive/qwen_runs/


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cp: cannot stat './qwen_legal_lora_sft_merged': No such file or directory
