## Training ALLaM using LoRA + Tag, 15EPOCH

In [None]:
import os, sys, json, time, random, numpy as np, torch, transformers
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from transformers import set_seed

# -----------------------------
# ENV + Reproducibility
# -----------------------------
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
SEED = 42
set_seed(SEED)
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
torch.cuda.empty_cache()
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print("="*60)
print("ENVIRONMENT CHECK")
print(f"Python executable: {sys.executable}")
print(f"Python version   : {sys.version}")
print(f"Torch version    : {torch.__version__}")
print(f"Transformers ver : {transformers.__version__}")
print(f"CUDA available   : {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"  Device {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No CUDA device detected!")
try:
    import datasets; print(f"Datasets version : {datasets.__version__}")
except Exception: print("Datasets not installed?")
try:
    import peft; print(f"PEFT version     : {peft.__version__}")
except Exception: print("PEFT not installed?")
print("="*60)

# -----------------------------
# PATHS / MODEL
# -----------------------------
TRAIN_PATH = Path("data_splits/train.jsonl")
DEV_PATH   = Path("data_splits/dev.jsonl")
OUTPUT_DIR = Path("outputs/allam7b-lora-token-15EPOCH")
BASE_MODEL = "ALLaM-AI/ALLaM-7B-Instruct-preview"
MAX_SEQ_LEN = 2048 

# -----------------------------
# LoRA config
# -----------------------------
lora_config = LoraConfig(
    r=32,  
    lora_alpha=64,  
    lora_dropout=0.1,  
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

# -----------------------------
# Training args (TRL SFTConfig; TRL==0.9.6)
# ----------------------------

train_args = SFTConfig(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=15,  
    per_device_train_batch_size=2, 
    gradient_accumulation_steps=8,  
    learning_rate=5e-5,  
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.1,  
    bf16=True,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    gradient_checkpointing=True,
    report_to="tensorboard",
    save_total_limit=3,  # Keep more checkpoints
    seed=SEED,
    packing=True,
    dataset_text_field="text",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Add these for better logging
    logging_dir=str(OUTPUT_DIR / "logs"),
    logging_first_step=True,
    remove_unused_columns=False,
)
# -----------------------------
# Data IO
# -----------------------------
def load_jsonl(path: Path) -> Dataset:
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip(): continue
            obj = json.loads(line)
            instr = (obj.get("instruction") or "").strip()
            resp  = (obj.get("response") or "").strip()
            if instr and resp:
                rows.append({"instruction": instr, "response": resp})
    if not rows:
        raise ValueError(f"No valid rows found in {path}")
    return Dataset.from_list(rows)

train_ds = load_jsonl(TRAIN_PATH)
dev_ds   = load_jsonl(DEV_PATH)

EOS = "</s>"
def fmt(example):
    instr = example["instruction"] 
    resp  = example["response"]
    text = f"### Instruction:\n{instr}\n\n### Response:\n{resp}{EOS}"
    return {"text": text}

train_text = train_ds.map(fmt, remove_columns=[c for c in train_ds.column_names if c!="text"])
dev_text   = dev_ds.map(fmt,   remove_columns=[c for c in dev_ds.column_names if c!="text"])

print(f"Train examples: {len(train_text):,} | Dev examples: {len(dev_text):,}")

# -----------------------------
# Tokenizer & Base model
# -----------------------------
tok = AutoTokenizer.from_pretrained(BASE_MODEL)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.config.use_cache = False

# -----------------------------
# Step logging callback
# -----------------------------
class LoggingCallback(TrainerCallback):
    def __init__(self):
        self.t0 = time.time()
        self.tlast = self.t0

    def on_step_end(self, args, state, control, logs=None, **kwargs):
        if not state.is_local_process_zero:
            return
        logs = logs or {}
        now = time.time()
        train_loss = logs.get("loss")
        if train_loss is not None:
            print(
                f"[Step {state.global_step}] epoch={state.epoch:.2f} "
                f"train_loss={train_loss:.4f} "
                f"lr={logs.get('learning_rate', 'N/A'):.2e} "
                f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
                flush=True
            )
            self.tlast = now

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if not state.is_local_process_zero:
            return
        metrics = metrics or {}
        now = time.time()
        
        eval_loss = metrics.get("eval_loss")
        if eval_loss is not None:
            print(
                f"[EVAL] epoch={state.epoch:.2f} "
                f"eval_loss={eval_loss:.4f} "
                f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
                flush=True
            )
            self.tlast = now

    def on_log(self, args, state, control, logs=None, **kwargs):
        if not state.is_local_process_zero:
            return
        logs = logs or {}
        now = time.time()
        
        # Log any metrics that weren't caught above
        if "loss" in logs or "eval_loss" in logs:
            train_loss = logs.get("loss", "N/A")
            eval_loss = logs.get("eval_loss", "N/A")
            lr = logs.get("learning_rate", "N/A")
            
            print(
                f"[LOG] epoch={state.epoch:.2f} "
                f"train_loss={train_loss} "
                f"eval_loss={eval_loss} "
                f"lr={lr} "
                f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
                flush=True
            )
            self.tlast = now

# -----------------------------
# Trainer (TRL + LoRA)
# -----------------------------
trainer = SFTTrainer(
    model=model,
    tokenizer=tok,  
    train_dataset=train_text,
    eval_dataset=dev_text,
    peft_config=lora_config,
    args=train_args,
    max_seq_length=MAX_SEQ_LEN,  
)


try:
    trainer.model.print_trainable_parameters()
except Exception:
    pass

trainer.add_callback(LoggingCallback())

print("Starting training…", flush=True)
trainer.train()

# -----------------------------
# Save adapters & tokenizer
# -----------------------------
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
trainer.save_model()
tok.save_pretrained(OUTPUT_DIR)
print("✅ Done. LoRA adapters saved to:", OUTPUT_DIR)

# -----------------------------
# Final dev evaluation + perplexity
# -----------------------------
metrics = trainer.evaluate()
print("Final evaluation metrics:", metrics)
if "eval_loss" in metrics and metrics["eval_loss"] is not None:
    try:
        ppl = float(np.exp(metrics["eval_loss"]))
        print(f"Perplexity: {ppl:.3f}")
    except Exception:
        pass

# -----------------------------
# Save run config snapshot
# -----------------------------
with open(OUTPUT_DIR / "run_config.txt", "w", encoding="utf-8") as f:
    f.write(f"Seed: {SEED}\n")
    f.write(f"Base model: {BASE_MODEL}\n")
    f.write(f"Train path: {TRAIN_PATH}\n")
    f.write(f"Dev path  : {DEV_PATH}\n")
    f.write(f"Epochs: {train_args.num_train_epochs}\n")
    f.write(f"LR: {train_args.learning_rate}\n")
    f.write(f"Per-device batch: {train_args.per_device_train_batch_size}\n")
    f.write(f"Grad accum: {train_args.gradient_accumulation_steps}\n")
    f.write(f"Max seq len: {MAX_SEQ_LEN}\n")
    f.write(f"BF16: {train_args.bf16}\n")
    f.write(f"LORA r/alpha/drop: {lora_config.r}/{lora_config.lora_alpha}/{lora_config.lora_dropout}\n")
    f.write(f"Transformers: {transformers.__version__}\n")
    f.write(f"Torch: {torch.__version__}\n")
print("Saved run_config.txt")


ENVIRONMENT CHECK
Python executable: /usr/bin/python
Python version   : 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
Torch version    : 2.2.1+cu121
Transformers ver : 4.42.3
CUDA available   : True
CUDA device count: 1
  Device 0: NVIDIA RTX 6000 Ada Generation
Datasets version : 2.19.1
PEFT version     : 0.11.1


Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Train examples: 5,122 | Dev examples: 588


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


trainable params: 79,953,920 || all params: 7,080,513,536 || trainable%: 1.1292
Starting training…




Epoch,Training Loss,Validation Loss
0,3.1216,2.714053
1,2.5936,2.235334
2,2.0492,2.025661
3,1.9103,1.952053
4,1.8315,1.909792
5,1.6996,1.895119
6,1.6455,1.892531
8,1.48,1.921903
9,1.4442,1.95204
10,1.3872,1.96545


[LOG] epoch=0.07 train_loss=3.3063 eval_loss=N/A lr=2.5e-06 dt=13.34s elapsed=0.22m
[LOG] epoch=0.72 train_loss=3.1216 eval_loss=N/A lr=2.5e-05 dt=119.87s elapsed=2.22m
[LOG] epoch=0.94 train_loss=N/A eval_loss=2.714052677154541 lr=N/A dt=58.16s elapsed=3.19m
[EVAL] epoch=0.94 eval_loss=2.7141 dt=0.00s elapsed=3.19m




[LOG] epoch=1.44 train_loss=2.5936 eval_loss=N/A lr=5e-05 dt=84.19s elapsed=4.59m
[LOG] epoch=1.95 train_loss=N/A eval_loss=2.2353343963623047 lr=N/A dt=110.34s elapsed=6.43m
[EVAL] epoch=1.95 eval_loss=2.2353 dt=0.00s elapsed=6.43m




[LOG] epoch=2.16 train_loss=2.2427 eval_loss=N/A lr=4.959823971496574e-05 dt=32.39s elapsed=6.97m
[LOG] epoch=2.88 train_loss=2.0492 eval_loss=N/A lr=4.8405871765993433e-05 dt=134.30s elapsed=9.21m
[LOG] epoch=2.95 train_loss=N/A eval_loss=2.025660991668701 lr=N/A dt=27.86s elapsed=9.67m
[EVAL] epoch=2.95 eval_loss=2.0257 dt=0.00s elapsed=9.67m




[LOG] epoch=3.60 train_loss=1.9103 eval_loss=N/A lr=4.6461219840046654e-05 dt=114.26s elapsed=11.58m
[LOG] epoch=3.96 train_loss=N/A eval_loss=1.9520529508590698 lr=N/A dt=80.01s elapsed=12.91m
[EVAL] epoch=3.96 eval_loss=1.9521 dt=0.00s elapsed=12.91m




[LOG] epoch=4.32 train_loss=1.8315 eval_loss=N/A lr=4.382678665009028e-05 dt=63.26s elapsed=13.97m
[LOG] epoch=4.97 train_loss=N/A eval_loss=1.9097920656204224 lr=N/A dt=131.91s elapsed=16.16m
[EVAL] epoch=4.97 eval_loss=1.9098 dt=0.00s elapsed=16.17m




[LOG] epoch=5.05 train_loss=1.7707 eval_loss=N/A lr=4.058724504646834e-05 dt=9.92s elapsed=16.33m
[LOG] epoch=5.77 train_loss=1.6996 eval_loss=N/A lr=3.6846716561824965e-05 dt=134.61s elapsed=18.57m
[LOG] epoch=5.98 train_loss=N/A eval_loss=1.8951194286346436 lr=N/A dt=49.86s elapsed=19.40m
[EVAL] epoch=5.98 eval_loss=1.8951 dt=0.00s elapsed=19.41m




[LOG] epoch=6.49 train_loss=1.6455 eval_loss=N/A lr=3.272542485937369e-05 dt=94.37s elapsed=20.98m
[LOG] epoch=6.99 train_loss=N/A eval_loss=1.892530918121338 lr=N/A dt=101.95s elapsed=22.68m
[EVAL] epoch=6.99 eval_loss=1.8925 dt=0.00s elapsed=22.68m




[LOG] epoch=7.21 train_loss=1.5687 eval_loss=N/A lr=2.8355831645441388e-05 dt=40.42s elapsed=23.35m
[LOG] epoch=7.93 train_loss=1.5355 eval_loss=N/A lr=2.3878379241237136e-05 dt=134.45s elapsed=25.59m
[LOG] epoch=8.00 train_loss=N/A eval_loss=1.9024438858032227 lr=N/A dt=19.46s elapsed=25.92m
[EVAL] epoch=8.00 eval_loss=1.9024 dt=0.00s elapsed=25.92m




[LOG] epoch=8.65 train_loss=1.48 eval_loss=N/A lr=1.9436976651092144e-05 dt=122.94s elapsed=27.96m
[LOG] epoch=8.94 train_loss=N/A eval_loss=1.9219032526016235 lr=N/A dt=71.56s elapsed=29.16m
[EVAL] epoch=8.94 eval_loss=1.9219 dt=0.00s elapsed=29.16m




[LOG] epoch=9.37 train_loss=1.4442 eval_loss=N/A lr=1.5174374208651912e-05 dt=70.75s elapsed=30.34m
[LOG] epoch=9.95 train_loss=N/A eval_loss=1.9520400762557983 lr=N/A dt=123.78s elapsed=32.40m
[EVAL] epoch=9.95 eval_loss=1.9520 dt=0.00s elapsed=32.40m




[LOG] epoch=10.09 train_loss=1.4222 eval_loss=N/A lr=1.122757546369744e-05 dt=18.77s elapsed=32.71m
[LOG] epoch=10.81 train_loss=1.3872 eval_loss=N/A lr=7.723433775328384e-06 dt=134.60s elapsed=34.96m
[LOG] epoch=10.95 train_loss=N/A eval_loss=1.9654499292373657 lr=N/A dt=41.36s elapsed=35.65m
[EVAL] epoch=10.95 eval_loss=1.9654 dt=0.00s elapsed=35.65m




[LOG] epoch=11.53 train_loss=1.3596 eval_loss=N/A lr=4.7745751406263165e-06 dt=100.96s elapsed=37.33m
[LOG] epoch=11.96 train_loss=N/A eval_loss=1.9782055616378784 lr=N/A dt=93.45s elapsed=38.89m
[EVAL] epoch=11.96 eval_loss=1.9782 dt=0.00s elapsed=38.89m




[LOG] epoch=12.25 train_loss=1.3666 eval_loss=N/A lr=2.475778302439524e-06 dt=48.66s elapsed=39.70m
[LOG] epoch=12.97 train_loss=1.3554 eval_loss=N/A lr=9.009284826036691e-07 dt=134.55s elapsed=41.94m
[LOG] epoch=12.97 train_loss=N/A eval_loss=1.9790353775024414 lr=N/A dt=11.06s elapsed=42.12m
[EVAL] epoch=12.97 eval_loss=1.9790 dt=0.00s elapsed=42.12m




[LOG] epoch=13.69 train_loss=1.3529 eval_loss=N/A lr=1.006426501190233e-07 dt=131.02s elapsed=44.31m
[LOG] epoch=13.98 train_loss=N/A eval_loss=1.9792847633361816 lr=N/A dt=63.19s elapsed=45.36m
[EVAL] epoch=13.98 eval_loss=1.9793 dt=0.00s elapsed=45.36m




[LOG] epoch=14.05 train_loss=N/A eval_loss=1.9791737794876099 lr=N/A dt=20.92s elapsed=45.71m
[EVAL] epoch=14.05 eval_loss=1.9792 dt=0.00s elapsed=45.71m
✅ Done. LoRA adapters saved to: outputs/allam7b-lora-token-15EPOCH


[LOG] epoch=14.05 train_loss=N/A eval_loss=1.892530918121338 lr=N/A dt=9.95s elapsed=45.87m
[EVAL] epoch=14.05 eval_loss=1.8925 dt=0.00s elapsed=45.87m
Final evaluation metrics: {'eval_loss': 1.892530918121338, 'eval_runtime': 6.8253, 'eval_samples_per_second': 3.663, 'eval_steps_per_second': 0.586, 'epoch': 14.054054054054054}
Perplexity: 6.636
Saved run_config.txt


In [2]:
print("Best eval_loss:", trainer.state.best_metric)
print("Best checkpoint path:", trainer.state.best_model_checkpoint)


Best eval_loss: 1.892530918121338
Best checkpoint path: outputs/allam7b-lora-token-15EPOCH/checkpoint-97


## Training ALLaM using LoRA + No Tag, 15EPOCH 

In [None]:
    # -----------------------------
    # PATHS / MODEL
    # -----------------------------
    TRAIN_PATH = Path("data_splits/train.jsonl")
    DEV_PATH   = Path("data_splits/dev.jsonl")
    OUTPUT_DIR = Path("outputs/allam7b-lora-no-token-15EPOCH")
    BASE_MODEL = "ALLaM-AI/ALLaM-7B-Instruct-preview"
    MAX_SEQ_LEN = 2048

    # -----------------------------
    # LoRA config
    # -----------------------------
    lora_config = LoraConfig(
        r=32,  
        lora_alpha=64,  
        lora_dropout=0.1,  
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
    )

    # -----------------------------
    # Training args (TRL SFTConfig; TRL==0.9.6)
    # -----------------------------

    train_args = SFTConfig(
        output_dir=str(OUTPUT_DIR),
        num_train_epochs=15,  
        per_device_train_batch_size=2,  
        gradient_accumulation_steps=8,  
        learning_rate=5e-5, 
        lr_scheduler_type="cosine_with_restarts",
        warmup_ratio=0.1,  
        bf16=True,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="epoch",
        gradient_checkpointing=True,
        report_to="tensorboard",
        save_total_limit=3,  
        seed=SEED,
        packing=True,
        dataset_text_field="text",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        logging_dir=str(OUTPUT_DIR / "logs"),
        logging_first_step=True,
        remove_unused_columns=False,
    )
    # -----------------------------
    # Data IO
    # -----------------------------
    def load_jsonl(path: Path) -> Dataset:
        rows = []
        with path.open("r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                obj = json.loads(line)
                instr = (obj.get("instruction") or "").strip()
                resp  = (obj.get("response") or "").strip()
                if instr and resp:
                    rows.append({"instruction": instr, "response": resp})
        if not rows:
            raise ValueError(f"No valid rows found in {path}")
        return Dataset.from_list(rows)

    train_ds = load_jsonl(TRAIN_PATH)
    dev_ds   = load_jsonl(DEV_PATH)
    TAG_RE = re.compile(r'^\s*<\s*DIALECT\s*=\s*(HIJAZI|NAJDI)\s*>\s*', re.IGNORECASE)
    EOS = "</s>"

    removed_ct = 0
    total_ct   = 0
    def fmt_no_tag(example):
        global removed_ct, total_ct
        instr = example["instruction"]
        total_ct += 1
        stripped = TAG_RE.sub("", instr).lstrip()
        if stripped != instr:
            removed_ct += 1
        resp = example["response"]
        text = f"### Instruction:\n{stripped}\n\n### Response:\n{resp}{EOS}"
        return {"text": text}

    train_text = train_ds.map(fmt_no_tag, remove_columns=[c for c in train_ds.column_names if c!="text"])
    dev_text   = dev_ds.map(fmt_no_tag,   remove_columns=[c for c in dev_ds.column_names if c!="text"])

    print(f"Train examples: {len(train_text):,} | Dev examples: {len(dev_text):,}")
    print(f"Leading <DIALECT=…> tags removed in formatting: {removed_ct}/{total_ct} "
        f"({(removed_ct/max(1,total_ct))*100:.2f}%)")

    # -----------------------------
    # Tokenizer & Base model
    # -----------------------------
    tok = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.config.use_cache = False

    # -----------------------------
    # Step logging
    # -----------------------------
    class LoggingCallback(TrainerCallback):
        def __init__(self):
            self.t0 = time.time()
            self.tlast = self.t0

        def on_step_end(self, args, state, control, logs=None, **kwargs):
            """Log every training step"""
            if not state.is_local_process_zero:
                return
            logs = logs or {}
            now = time.time()
            train_loss = logs.get("loss")
            if train_loss is not None:
                print(
                    f"[Step {state.global_step}] epoch={state.epoch:.2f} "
                    f"train_loss={train_loss:.4f} "
                    f"lr={logs.get('learning_rate', 'N/A'):.2e} "
                    f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
                    flush=True
                )
                self.tlast = now

        def on_evaluate(self, args, state, control, metrics=None, **kwargs):
            """Log evaluation results"""
            if not state.is_local_process_zero:
                return
            metrics = metrics or {}
            now = time.time()
            
            eval_loss = metrics.get("eval_loss")
            if eval_loss is not None:
                print(
                    f"[EVAL] epoch={state.epoch:.2f} "
                    f"eval_loss={eval_loss:.4f} "
                    f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
                    flush=True
                )
                self.tlast = now

        def on_log(self, args, state, control, logs=None, **kwargs):
            """Fallback logging"""
            if not state.is_local_process_zero:
                return
            logs = logs or {}
            now = time.time()
            if "loss" in logs or "eval_loss" in logs:
                train_loss = logs.get("loss", "N/A")
                eval_loss = logs.get("eval_loss", "N/A")
                lr = logs.get("learning_rate", "N/A")
                
                print(
                    f"[LOG] epoch={state.epoch:.2f} "
                    f"train_loss={train_loss} "
                    f"eval_loss={eval_loss} "
                    f"lr={lr} "
                    f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
                    flush=True
                )
                self.tlast = now



    # -----------------------------
    # Trainer (TRL + LoRA)
    # -----------------------------
    trainer = SFTTrainer(
        model=model,
        tokenizer=tok,  
        train_dataset=train_text,
        eval_dataset=dev_text,
        peft_config=lora_config,
        args=train_args,
        max_seq_length=MAX_SEQ_LEN,  
    )


    try:
        trainer.model.print_trainable_parameters()
    except Exception:
        pass

    trainer.add_callback(LoggingCallback())
    print("Starting NO-TAG training…", flush=True)
    trainer.train()

    # -----------------------------
    # Save adapters & tokenizer
    # -----------------------------
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    trainer.save_model()
    tok.save_pretrained(OUTPUT_DIR)
    print("✅ Done. LoRA (NO TAG) adapters saved to:", OUTPUT_DIR)

    # -----------------------------
    # Final dev evaluation + perplexity
    # -----------------------------
    metrics = trainer.evaluate()
    print("Final evaluation metrics:", metrics)
    if "eval_loss" in metrics and metrics["eval_loss"] is not None:
        try:
            ppl = float(np.exp(metrics["eval_loss"]))
            print(f"Perplexity: {ppl:.3f}")
        except Exception:
            pass

    # -----------------------------
    # Save run config snapshot
    # -----------------------------
    with open(OUTPUT_DIR / "run_config.txt", "w", encoding="utf-8") as f:
        f.write(f"Seed: {SEED}\n")
        f.write(f"Base model: {BASE_MODEL}\n")
        f.write(f"Train path: {TRAIN_PATH}\n")
        f.write(f"Dev path  : {DEV_PATH}\n")
        f.write(f"Epochs: {train_args.num_train_epochs}\n")
        f.write(f"LR: {train_args.learning_rate}\n")
        f.write(f"Per-device batch: {train_args.per_device_train_batch_size}\n")
        f.write(f"Grad accum: {train_args.gradient_accumulation_steps}\n")
        f.write(f"Max seq len: {MAX_SEQ_LEN}\n")
        f.write(f"BF16: {train_args.bf16}\n")
        f.write(f"LORA r/alpha/drop: {lora_config.r}/{lora_config.lora_alpha}/{lora_config.lora_dropout}\n")
        f.write(f"Transformers: {transformers.__version__}\n")
        f.write(f"Torch: {torch.__version__}\n")
    print("Saved run_config.txt")


NO-TAG TRAIN — ENVIRONMENT CHECK
Python executable: /usr/bin/python
Python version   : 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
Torch version    : 2.2.1+cu121
Transformers ver : 4.42.3
CUDA available   : True
CUDA device count: 1
  Device 0: NVIDIA RTX 6000 Ada Generation
Datasets version : 2.19.1
PEFT version     : 0.11.1


Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Train examples: 5,122 | Dev examples: 588
Leading <DIALECT=…> tags removed in formatting: 5710/5710 (100.00%)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


trainable params: 79,953,920 || all params: 7,080,513,536 || trainable%: 1.1292
Starting NO-TAG training…




Epoch,Training Loss,Validation Loss
0,3.4369,3.009303
1,2.837,2.551401
2,2.5063,2.285813
3,2.2573,2.190359
4,2.032,2.138714
5,1.9387,2.108704
6,1.8826,2.100609
8,1.6982,2.122641
9,1.6491,2.148299
10,1.5963,2.162692


[LOG] epoch=0.08 train_loss=3.6483 eval_loss=N/A lr=2.777777777777778e-06 dt=13.33s elapsed=0.22m
[LOG] epoch=0.81 train_loss=3.4369 eval_loss=N/A lr=2.777777777777778e-05 dt=119.62s elapsed=2.22m
[LOG] epoch=0.97 train_loss=N/A eval_loss=3.0093026161193848 lr=N/A dt=38.37s elapsed=2.86m
[EVAL] epoch=0.97 eval_loss=3.0093 dt=0.00s elapsed=2.86m




[LOG] epoch=1.62 train_loss=2.837 eval_loss=N/A lr=4.998119881260576e-05 dt=104.55s elapsed=4.60m
[LOG] epoch=1.94 train_loss=N/A eval_loss=2.551401138305664 lr=N/A dt=70.45s elapsed=5.77m
[EVAL] epoch=1.94 eval_loss=2.5514 dt=0.00s elapsed=5.77m




[LOG] epoch=2.42 train_loss=2.5063 eval_loss=N/A lr=4.9326121764495596e-05 dt=75.09s elapsed=7.02m
[LOG] epoch=2.99 train_loss=N/A eval_loss=2.285813093185425 lr=N/A dt=102.32s elapsed=8.73m
[EVAL] epoch=2.99 eval_loss=2.2858 dt=0.00s elapsed=8.73m




[LOG] epoch=3.23 train_loss=2.2573 eval_loss=N/A lr=4.775907352415367e-05 dt=40.26s elapsed=9.40m
[LOG] epoch=3.96 train_loss=N/A eval_loss=2.190358877182007 lr=N/A dt=134.23s elapsed=11.64m
[EVAL] epoch=3.96 eval_loss=2.1904 dt=0.00s elapsed=11.64m




[LOG] epoch=4.04 train_loss=2.1158 eval_loss=N/A lr=4.533880175657419e-05 dt=9.00s elapsed=11.79m
[LOG] epoch=4.85 train_loss=2.032 eval_loss=N/A lr=4.215604094671835e-05 dt=134.39s elapsed=14.03m
[LOG] epoch=4.93 train_loss=N/A eval_loss=2.1387135982513428 lr=N/A dt=31.59s elapsed=14.55m
[EVAL] epoch=4.93 eval_loss=2.1387 dt=0.00s elapsed=14.55m




[LOG] epoch=5.66 train_loss=1.9387 eval_loss=N/A lr=3.8330110820042285e-05 dt=111.91s elapsed=16.42m
[LOG] epoch=5.98 train_loss=N/A eval_loss=2.10870361328125 lr=N/A dt=63.63s elapsed=17.48m
[EVAL] epoch=5.98 eval_loss=2.1087 dt=0.00s elapsed=17.48m




[LOG] epoch=6.46 train_loss=1.8826 eval_loss=N/A lr=3.400444312011776e-05 dt=79.24s elapsed=18.80m
[LOG] epoch=6.95 train_loss=N/A eval_loss=2.1006088256835938 lr=N/A dt=95.51s elapsed=20.39m
[EVAL] epoch=6.95 eval_loss=2.1006 dt=0.00s elapsed=20.39m




[LOG] epoch=7.27 train_loss=1.7975 eval_loss=N/A lr=2.9341204441673266e-05 dt=47.36s elapsed=21.18m
[LOG] epoch=8.00 train_loss=N/A eval_loss=2.110795736312866 lr=N/A dt=127.56s elapsed=23.31m
[EVAL] epoch=8.00 eval_loss=2.1108 dt=0.00s elapsed=23.31m




[LOG] epoch=8.08 train_loss=1.7476 eval_loss=N/A lr=2.4515216705704395e-05 dt=15.33s elapsed=23.56m
[LOG] epoch=8.89 train_loss=1.6982 eval_loss=N/A lr=1.970740319426474e-05 dt=134.44s elapsed=25.80m
[LOG] epoch=8.97 train_loss=N/A eval_loss=2.122641086578369 lr=N/A dt=24.91s elapsed=26.22m
[EVAL] epoch=8.97 eval_loss=2.1226 dt=0.00s elapsed=26.22m




[LOG] epoch=9.70 train_loss=1.6491 eval_loss=N/A lr=1.509800584902108e-05 dt=117.93s elapsed=28.18m
[LOG] epoch=9.94 train_loss=N/A eval_loss=2.148298740386963 lr=N/A dt=56.90s elapsed=29.13m
[EVAL] epoch=9.94 eval_loss=2.1483 dt=0.00s elapsed=29.13m




[LOG] epoch=10.51 train_loss=1.5963 eval_loss=N/A lr=1.085982811283654e-05 dt=85.72s elapsed=30.56m
[LOG] epoch=10.99 train_loss=N/A eval_loss=2.1626923084259033 lr=N/A dt=88.95s elapsed=32.04m
[EVAL] epoch=10.99 eval_loss=2.1627 dt=0.00s elapsed=32.04m




[LOG] epoch=11.31 train_loss=1.6065 eval_loss=N/A lr=7.1517566360525284e-06 dt=53.96s elapsed=32.94m
[LOG] epoch=11.96 train_loss=N/A eval_loss=2.1739745140075684 lr=N/A dt=120.98s elapsed=34.96m
[EVAL] epoch=11.96 eval_loss=2.1740 dt=0.00s elapsed=34.96m




[LOG] epoch=12.12 train_loss=1.555 eval_loss=N/A lr=4.112804714676594e-06 dt=21.83s elapsed=35.32m
[LOG] epoch=12.93 train_loss=1.5634 eval_loss=N/A lr=1.8569007682777417e-06 dt=134.67s elapsed=37.57m
[LOG] epoch=12.93 train_loss=N/A eval_loss=2.1778132915496826 lr=N/A dt=18.21s elapsed=37.87m
[EVAL] epoch=12.93 eval_loss=2.1778 dt=0.00s elapsed=37.87m




[LOG] epoch=13.74 train_loss=1.5617 eval_loss=N/A lr=4.6861723431538276e-07 dt=124.48s elapsed=39.95m
[LOG] epoch=13.98 train_loss=N/A eval_loss=2.1778223514556885 lr=N/A dt=50.17s elapsed=40.78m
[EVAL] epoch=13.98 eval_loss=2.1778 dt=0.00s elapsed=40.78m




[LOG] epoch=14.55 train_loss=1.5522 eval_loss=N/A lr=0.0 dt=92.55s elapsed=42.32m
[LOG] epoch=14.55 train_loss=N/A eval_loss=2.177546501159668 lr=N/A dt=8.13s elapsed=42.46m
[EVAL] epoch=14.55 eval_loss=2.1775 dt=0.00s elapsed=42.46m
✅ Done. LoRA (NO TAG) adapters saved to: outputs/allam7b-lora-no-token-15EPOCH


[LOG] epoch=14.55 train_loss=N/A eval_loss=2.1006088256835938 lr=N/A dt=9.82s elapsed=42.62m
[EVAL] epoch=14.55 eval_loss=2.1006 dt=0.00s elapsed=42.62m
Final evaluation metrics: {'eval_loss': 2.1006088256835938, 'eval_runtime': 6.3647, 'eval_samples_per_second': 3.614, 'eval_steps_per_second': 0.471, 'epoch': 14.545454545454545}
Perplexity: 8.171
Saved run_config.txt


In [5]:
print("Best eval_loss:", trainer.state.best_metric)
print("Best checkpoint path:", trainer.state.best_model_checkpoint)


Best eval_loss: 2.1006088256835938
Best checkpoint path: outputs/allam7b-lora-no-token-15EPOCH/checkpoint-86
