## Load the Data and Create Train/Dev/Test Splits

In [None]:

import json, random, math
from pathlib import Path
from collections import defaultdict, Counter

# --- Split configuration ---------------------------------------------------
# Source corpus, one JSON object per line.
IN_PATH = Path("saudi_dataset_all.jsonl")
# Directory that receives train.jsonl / dev.jsonl / test.jsonl.
OUT_DIR = Path("data_splits")
# Seed for the shuffle that precedes the split.
SPLIT_SEED = 42
# Fractions in the order (train, dev, test).
SPLITS = (0.80, 0.10, 0.10)

# Make sure the output directory exists before anything is written to it.
OUT_DIR.mkdir(exist_ok=True, parents=True)

def read_jsonl(p):
    """Lazily yield one parsed JSON value per non-blank line of ``p``.

    ``p`` is opened through its own ``open`` method (``pathlib.Path`` style)
    as UTF-8 text. Lines that are empty after stripping are skipped; any
    other line is parsed with ``json.loads``.
    """
    with p.open("r", encoding="utf-8") as handle:
        for raw in handle:
            payload = raw.strip()
            if not payload:
                continue
            yield json.loads(payload)

def get_dialect(obj):
    """Return the dialect label of a record as a stripped string.

    The top-level ``"dialect"`` field wins when it holds a non-blank value;
    otherwise ``obj["meta"]["dialect"]`` is used. Returns ``""`` when no
    label is available.

    A missing or ``None`` label yields ``""`` rather than the literal text
    ``"None"``, and a ``meta`` value that is not a dict is treated as absent.
    """
    top = obj.get("dialect")
    if top:
        label = str(top).strip()
        if label:
            return label
    meta = obj.get("meta")
    if not isinstance(meta, dict):
        return ""
    nested = meta.get("dialect")
    return "" if nested is None else str(nested).strip()

def get_topic(obj):
    """Return ``obj["meta"]["topic"]`` as a stripped string, or ``"_NA_"``.

    ``"_NA_"`` is returned whenever there is no usable topic: ``meta`` is
    missing or not a dict, the key is absent, or the value is ``None`` or
    blank. This keeps every "no topic" record under one placeholder instead
    of scattering them over ``"None"``, ``""`` and ``"_NA_"``.
    """
    meta = obj.get("meta")
    if not isinstance(meta, dict):
        return "_NA_"
    topic = meta.get("topic")
    if topic is None:
        return "_NA_"
    return str(topic).strip() or "_NA_"

def get_length(obj):
    """Return ``obj["meta"]["length"]`` as a stripped string, or ``"_NA_"``.

    ``"_NA_"`` is returned whenever there is no usable value: ``meta`` is
    missing or not a dict, the key is absent, or the value is ``None`` or
    blank. Numeric values (including ``0``) are kept and stringified.
    """
    meta = obj.get("meta")
    if not isinstance(meta, dict):
        return "_NA_"
    length = meta.get("length")
    if length is None:
        return "_NA_"
    return str(length).strip() or "_NA_"

# Materialise the whole corpus as a list so it can be counted and grouped.
rows = [*read_jsonl(IN_PATH)]
print(f"Loaded {len(rows):,} items")

def bucket_of(ex):
    """Return the stratification key ``(dialect, topic, length)`` for a record.

    The dialect component is case-folded so that spelling variants of the
    same label (e.g. "HIJAZI" and "Hijazi") fall into one stratum instead of
    fragmenting the data into extra, very small buckets. Only the grouping
    key is normalised; the record itself is left untouched.
    """
    d = get_dialect(ex).casefold()
    t = get_topic(ex)
    L = get_length(ex)
    return (d, t, L)

# Group records by stratum so every bucket is split on its own.
groups = defaultdict(list)
for ex in rows:
    groups[bucket_of(ex)].append(ex)

random.seed(SPLIT_SEED)
train, dev, test = [], [], []
seen = 0  # number of records distributed so far, across all buckets
for lst in groups.values():
    random.shuffle(lst)
    n = len(lst)
    seen += n
    # Allocate against *cumulative* targets instead of flooring each bucket
    # independently. Per-bucket flooring hands every rounding remainder to
    # the test split (a one-item bucket goes entirely to test), which
    # inflates test well beyond its nominal share when buckets are small.
    # Here each bucket still gets floor/ceil of its own share, but the
    # rounding error carries over to the next bucket, so the overall sizes
    # follow SPLITS.
    want_tr = math.floor(SPLITS[0] * seen + 0.5) - len(train)
    n_tr = max(0, min(n, want_tr))
    want_de = math.floor(SPLITS[1] * seen + 0.5) - len(dev)
    n_de = max(0, min(n - n_tr, want_de))
    train += lst[:n_tr]
    dev   += lst[n_tr:n_tr+n_de]
    test  += lst[n_tr+n_de:]

def dump(path, data):
    """Write ``data`` to ``path`` as UTF-8 JSON Lines.

    Each output line keeps only three fields: ``instruction`` and
    ``response`` (empty string when missing) and ``dialect`` as resolved by
    ``get_dialect``. Non-ASCII text is written verbatim, not escaped.
    """
    with open(path, "w", encoding="utf-8") as sink:
        for record in data:
            slim = {
                "instruction": record.get("instruction", ""),
                "response": record.get("response", ""),
                "dialect": get_dialect(record),
            }
            sink.write(json.dumps(slim, ensure_ascii=False) + "\n")

# Persist the three splits next to each other under OUT_DIR.
for _name, _rows in (("train", train), ("dev", dev), ("test", test)):
    dump(OUT_DIR / f"{_name}.jsonl", _rows)

def show_counts(name, rows_):
    """Print the size of a split followed by its per-dialect breakdown.

    Labels are listed in first-seen order, each with its count and its
    percentage of the split.
    """
    tally = Counter()
    for row in rows_:
        tally[get_dialect(row)] += 1
    size = len(rows_)
    denom = size if size else 1  # keeps the percentage defined for an empty split
    print(f"{name}: {size:,}")
    for label, count in tally.items():
        share = 100 * count / denom
        print(f"  {label:6s}: {count:5d}  ({share:5.2f}%)")

# Summarise the dialect balance of every split, then confirm the output location.
for _label, _split in (("Train", train), ("Dev", dev), ("Test", test)):
    show_counts(_label, _split)
print("Saved splits to", OUT_DIR)


Loaded 6,546 items
Train: 5,122
  Hijazi:  2650  (51.74%)
  Najdi :  2461  (48.05%)
  HIJAZI:     5  ( 0.10%)
  NAJDI :     6  ( 0.12%)
Dev: 588
  Hijazi:   304  (51.70%)
  Najdi :   284  (48.30%)
Test: 836
  Hijazi:   413  (49.40%)
  Najdi :   385  (46.05%)
  HIJAZI:    20  ( 2.39%)
  NAJDI :    18  ( 2.15%)
Saved splits to data_splits


## Training ALLaM with LoRA (dialect tag kept), 10 epochs

In [None]:
import os, sys, json, time, random, numpy as np, torch, transformers
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from transformers import set_seed


# Default to GPU 0, but respect a CUDA_VISIBLE_DEVICES value that is already set.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
# One seed is shared by every random number generator used in this run.
SEED = 42
set_seed(SEED)
# NOTE(review): set_seed presumably already seeds these generators; the
# explicit calls below look redundant but harmless — confirm before removing.
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
# Release cached GPU memory left over from earlier cells before loading the model.
torch.cuda.empty_cache()
# Allow TF32 for CUDA matmuls and cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Print interpreter, library and GPU details so the run can be reproduced.
print("=" * 60)
print("ENVIRONMENT CHECK")
print(f"Python executable: {sys.executable}")
print(f"Python version   : {sys.version}")
print(f"Torch version    : {torch.__version__}")
print(f"Transformers ver : {transformers.__version__}")
print(f"CUDA available   : {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if not torch.cuda.is_available():
    print("No CUDA device detected!")
else:
    for i in range(torch.cuda.device_count()):
        print(f"  Device {i}: {torch.cuda.get_device_name(i)}")

# The version lookups below are best effort: any failure is reported as
# "not installed?" instead of aborting the cell.
try:
    import datasets
    print(f"Datasets version : {datasets.__version__}")
except Exception:
    print("Datasets not installed?")

try:
    import peft
    print(f"PEFT version     : {peft.__version__}")
except Exception:
    print("PEFT not installed?")
print("=" * 60)

# Inputs written by the split cell, and where this run stores its artefacts.
TRAIN_PATH = Path("data_splits/train.jsonl")
DEV_PATH   = Path("data_splits/dev.jsonl")
OUTPUT_DIR = Path("outputs/allam7b-lora-token-10EPOCH")
BASE_MODEL = "ALLaM-AI/ALLaM-7B-Instruct-preview"
# Sequence length handed to the trainer further down.
MAX_SEQ_LEN = 2048  


# LoRA adapter settings.
lora_config = LoraConfig(
    r=32,  # adapter rank
    lora_alpha=64,  # LoRA alpha (twice the rank here)
    lora_dropout=0.1,  # dropout applied inside the adapter layers
    bias="none",
    task_type="CAUSAL_LM",
    # Modules that receive adapters; judging by the names these are the
    # attention and MLP projection layers — confirm against the base model.
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)


# Training hyper-parameters for the supervised fine-tuning run.
train_args = SFTConfig(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=10,  
    per_device_train_batch_size=2,  
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step on one device
    learning_rate=5e-5,  
    # NOTE(review): no restart count is configured, so this presumably
    # behaves like a single cosine decay — confirm.
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.1,  
    bf16=True,
    logging_steps=10,
    # Checkpoint and evaluate once per epoch.
    save_strategy="epoch",
    eval_strategy="epoch",
    gradient_checkpointing=True,
    report_to="tensorboard",
    save_total_limit=3,  
    seed=SEED,
    # Packing is enabled and the formatted prompt is read from the "text" column.
    packing=True,
    dataset_text_field="text",
)

def load_jsonl(path: Path) -> Dataset:
    """Build a ``Dataset`` of instruction/response pairs from a JSONL file.

    Blank lines are ignored. Both fields are stripped, and a record is kept
    only when both are non-empty afterwards; every other field is dropped.

    Raises:
        ValueError: if the file yields no usable record.
    """
    pairs = []
    with path.open("r", encoding="utf-8") as handle:
        for raw in handle:
            if raw.strip():
                record = json.loads(raw)
                prompt = (record.get("instruction") or "").strip()
                answer = (record.get("response") or "").strip()
                if prompt and answer:
                    pairs.append({"instruction": prompt, "response": answer})
    if len(pairs) == 0:
        raise ValueError(f"No valid rows found in {path}")
    return Dataset.from_list(pairs)

# Read both splits with the same loader, train first and dev second.
train_ds, dev_ds = (load_jsonl(split_path) for split_path in (TRAIN_PATH, DEV_PATH))

# End-of-sequence marker appended to every training text.
# NOTE(review): hard-coded — presumably matches the tokenizer's EOS token; confirm.
EOS = "</s>"

def fmt(example):
    """Render one example as a single prompt/response training string.

    The instruction is used verbatim, so a dialect tag embedded in it is
    kept. Returns ``{"text": ...}`` for the trainer's text column.
    """
    prompt_part = f"### Instruction:\n{example['instruction']}\n\n"
    answer_part = f"### Response:\n{example['response']}{EOS}"
    return {"text": prompt_part + answer_part}

# Replace the instruction/response columns with the single formatted "text" column.
train_text = train_ds.map(
    fmt,
    remove_columns=[col for col in train_ds.column_names if col != "text"],
)
dev_text = dev_ds.map(
    fmt,
    remove_columns=[col for col in dev_ds.column_names if col != "text"],
)

print(f"Train examples: {len(train_text):,} | Dev examples: {len(dev_text):,}")


# Tokenizer of the base model; fall back to the EOS token for padding when
# the tokenizer defines no pad token of its own.
tok = AutoTokenizer.from_pretrained(BASE_MODEL)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Load the base model in bfloat16 and let device_map="auto" decide placement.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# NOTE(review): the KV cache is switched off, presumably because gradient
# checkpointing is enabled in the training config — confirm.
model.config.use_cache = False


class LoggingCallback(TrainerCallback):
    """Print a one-line progress summary for every relevant Trainer log event.

    Each line reports the global step, epoch, train/eval loss (``N/A`` when
    the event does not carry it), learning rate, seconds since the previous
    printed line and minutes since the callback was created.
    """

    def __init__(self):
        # Both timers start when the callback is constructed.
        self.t0 = time.time()
        self.tlast = self.t0

    def on_log(self, args, state, control, logs=None, **kw):
        """Print a summary when ``logs`` carries a loss or a learning rate."""
        # Only the local main process prints, to avoid duplicated lines.
        if not state.is_local_process_zero:
            return
        logs = logs or {}
        if not ("loss" in logs or "eval_loss" in logs or "learning_rate" in logs):
            return
        now = time.time()
        tl = logs.get("loss")
        el = logs.get("eval_loss")
        # state.epoch can be None (e.g. when evaluating before any training
        # step); formatting None with ":.2f" would raise TypeError.
        epoch_txt = f"{state.epoch:.2f}" if state.epoch is not None else "N/A"
        print(
            f"[Step {state.global_step}] epoch={epoch_txt} "
            f"train_loss={tl if tl is not None else 'N/A'} "
            f"eval_loss={el if el is not None else 'N/A'} "
            f"lr={logs.get('learning_rate','?')} "
            f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
            flush=True
        )
        self.tlast = now


# Hand the sequence length over through the training config rather than the
# deprecated SFTTrainer keyword, which triggers the "Deprecated positional
# argument(s) used in SFTTrainer" warning. The trainer reads the value from
# its args object, so this carries the same setting.
train_args.max_seq_length = MAX_SEQ_LEN

trainer = SFTTrainer(
    model=model,
    tokenizer=tok,
    train_dataset=train_text,
    eval_dataset=dev_text,
    peft_config=lora_config,
    args=train_args,
)


# Report the trainable-parameter count when the wrapped model offers it.
# A missing method is skipped quietly; a genuine failure inside it is no
# longer swallowed.
_report_params = getattr(trainer.model, "print_trainable_parameters", None)
if callable(_report_params):
    _report_params()

trainer.add_callback(LoggingCallback())

print("Starting training…", flush=True)
trainer.train()


# Persist the trained adapters together with the tokenizer.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
trainer.save_model()
tok.save_pretrained(OUTPUT_DIR)
print("✅ Done. LoRA adapters saved to:", OUTPUT_DIR)


# Final evaluation on the dev set; perplexity is exp(eval_loss).
metrics = trainer.evaluate()
print("Final evaluation metrics:", metrics)
eval_loss = metrics.get("eval_loss")
if eval_loss is not None:
    # Still best effort (the run config is written after this step), but a
    # failure is now reported instead of being silently discarded.
    try:
        ppl = float(np.exp(eval_loss))
    except (TypeError, ValueError, OverflowError) as err:
        print(f"Perplexity unavailable: {err}")
    else:
        print(f"Perplexity: {ppl:.3f}")


# Record the settings of this run next to the saved adapters.
_config_lines = [
    f"Seed: {SEED}\n",
    f"Base model: {BASE_MODEL}\n",
    f"Train path: {TRAIN_PATH}\n",
    f"Dev path  : {DEV_PATH}\n",
    f"Epochs: {train_args.num_train_epochs}\n",
    f"LR: {train_args.learning_rate}\n",
    f"Per-device batch: {train_args.per_device_train_batch_size}\n",
    f"Grad accum: {train_args.gradient_accumulation_steps}\n",
    f"Max seq len: {MAX_SEQ_LEN}\n",
    f"BF16: {train_args.bf16}\n",
    f"LORA r/alpha/drop: {lora_config.r}/{lora_config.lora_alpha}/{lora_config.lora_dropout}\n",
    f"Transformers: {transformers.__version__}\n",
    f"Torch: {torch.__version__}\n",
]
with open(OUTPUT_DIR / "run_config.txt", "w", encoding="utf-8") as f:
    f.writelines(_config_lines)
print("Saved run_config.txt")


ENVIRONMENT CHECK
Python executable: /usr/bin/python
Python version   : 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
Torch version    : 2.3.1+cu121
Transformers ver : 4.42.3
CUDA available   : True
CUDA device count: 1
  Device 0: NVIDIA RTX 6000 Ada Generation
Datasets version : 2.19.1
PEFT version     : 0.11.1


Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Train examples: 5,122 | Dev examples: 588


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


trainable params: 79,953,920 || all params: 7,080,513,536 || trainable%: 1.1292
Starting training…




Epoch,Training Loss,Validation Loss
0,3.0835,2.590824
1,2.4732,2.168015
2,1.9813,2.007209
3,1.8898,1.946524
4,1.8199,1.914637
5,1.725,1.898293
6,1.697,1.893849
8,1.6447,1.89251
9,1.6387,1.892369


[Step 10] Epoch 0.72 | Train: N/A | Eval: N/A | Step: 133.36s | Elapsed: 2.22m




[Step 20] Epoch 1.44 | Train: N/A | Eval: N/A | Step: 141.90s | Elapsed: 4.59m




[Step 30] Epoch 2.16 | Train: N/A | Eval: N/A | Step: 142.33s | Elapsed: 6.96m
[Step 40] Epoch 2.88 | Train: N/A | Eval: N/A | Step: 134.56s | Elapsed: 9.20m




[Step 50] Epoch 3.60 | Train: N/A | Eval: N/A | Step: 142.07s | Elapsed: 11.57m




[Step 60] Epoch 4.32 | Train: N/A | Eval: N/A | Step: 142.56s | Elapsed: 13.95m




[Step 70] Epoch 5.05 | Train: N/A | Eval: N/A | Step: 142.22s | Elapsed: 16.32m
[Step 80] Epoch 5.77 | Train: N/A | Eval: N/A | Step: 134.47s | Elapsed: 18.56m




[Step 90] Epoch 6.49 | Train: N/A | Eval: N/A | Step: 141.72s | Elapsed: 20.92m




[Step 100] Epoch 7.21 | Train: N/A | Eval: N/A | Step: 142.53s | Elapsed: 23.30m
[Step 110] Epoch 7.93 | Train: N/A | Eval: N/A | Step: 134.48s | Elapsed: 25.54m




[Step 120] Epoch 8.65 | Train: N/A | Eval: N/A | Step: 142.23s | Elapsed: 27.91m




[Step 130] Epoch 9.37 | Train: N/A | Eval: N/A | Step: 142.69s | Elapsed: 30.29m
✅ Done. LoRA adapters saved to: outputs/allam7b-lora-token-10EPOCH


Final evaluation metrics: {'eval_loss': 1.892369270324707, 'eval_runtime': 6.8343, 'eval_samples_per_second': 3.658, 'eval_steps_per_second': 0.585, 'epoch': 9.36936936936937}
Perplexity: 6.635
Saved run_config.txt


## Training ALLaM with LoRA (dialect tag removed), 10 epochs

In [None]:
# Same data as the tagged run; only the output directory differs.
TRAIN_PATH = Path("data_splits/train.jsonl")
DEV_PATH   = Path("data_splits/dev.jsonl")
OUTPUT_DIR = Path("outputs/allam7b-lora-no-token-10EPOCH")
BASE_MODEL = "ALLaM-AI/ALLaM-7B-Instruct-preview"
# Sequence length handed to the trainer further down.
MAX_SEQ_LEN = 2048


# LoRA adapter settings, identical to the tagged run.
lora_config = LoraConfig(
    r=32,  # adapter rank
    lora_alpha=64,  # LoRA alpha (twice the rank here)
    lora_dropout=0.1,  # dropout applied inside the adapter layers
    bias="none",
    task_type="CAUSAL_LM",
    # Modules that receive adapters; judging by the names these are the
    # attention and MLP projection layers — confirm against the base model.
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)


# Training hyper-parameters, identical to the tagged run.
train_args = SFTConfig(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=10, 
    per_device_train_batch_size=2,  
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step on one device
    learning_rate=5e-5, 
    # NOTE(review): no restart count is configured, so this presumably
    # behaves like a single cosine decay — confirm.
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.1, 
    bf16=True,
    logging_steps=10,
    # Checkpoint and evaluate once per epoch.
    save_strategy="epoch",
    eval_strategy="epoch",
    gradient_checkpointing=True,
    report_to="tensorboard",
    save_total_limit=3, 
    seed=SEED,
    # Packing is enabled and the formatted prompt is read from the "text" column.
    packing=True,
    dataset_text_field="text",
)


def load_jsonl(path: Path) -> Dataset:
    """Read instruction/response pairs from a JSONL file into a ``Dataset``.

    Blank lines are skipped. A record is kept only when both its stripped
    ``instruction`` and stripped ``response`` are non-empty; other fields
    are discarded.

    Raises:
        ValueError: if no record survives the filter.
    """
    kept = []
    with path.open("r", encoding="utf-8") as src:
        for raw_line in src:
            if not raw_line.strip():
                continue
            item = json.loads(raw_line)
            question = (item.get("instruction") or "").strip()
            reply = (item.get("response") or "").strip()
            if not (question and reply):
                continue
            kept.append({"instruction": question, "response": reply})
    if not kept:
        raise ValueError(f"No valid rows found in {path}")
    return Dataset.from_list(kept)

# Read both splits with the same loader, train first and dev second.
train_ds, dev_ds = (load_jsonl(split_path) for split_path in (TRAIN_PATH, DEV_PATH))

# The pattern below needs `re`, which no import line visible in this notebook
# provides; importing it here keeps the cell runnable on a fresh kernel.
import re

# Leading "<DIALECT=HIJAZI>" / "<DIALECT=NAJDI>" tag: case-insensitive, with
# optional whitespace around every part, anchored at the start of the string.
TAG_RE = re.compile(r'^\s*<\s*DIALECT\s*=\s*(HIJAZI|NAJDI)\s*>\s*', re.IGNORECASE)
# End-of-sequence marker appended to every training text.
# NOTE(review): hard-coded — presumably matches the tokenizer's EOS token; confirm.
EOS = "</s>"

# Module-level counters updated by fmt_no_tag.
# NOTE(review): they only reflect calls made in this process; if the mapping
# step is served from a cache or run in worker processes they will under-count.
removed_ct = 0
total_ct   = 0
def fmt_no_tag(example):
    """Render one example as training text with the leading dialect tag removed.

    Increments ``total_ct`` on every call and ``removed_ct`` only when the
    tag pattern actually matched. The match count is taken from the regex
    itself: comparing the stripped text with the input would also count
    instructions that merely had leading whitespace trimmed.

    Returns ``{"text": ...}`` for the trainer's text column.
    """
    global removed_ct, total_ct
    instr = example["instruction"]
    total_ct += 1
    without_tag, hits = TAG_RE.subn("", instr, count=1)
    stripped = without_tag.lstrip()
    if hits:
        removed_ct += 1
    resp = example["response"]
    text = f"### Instruction:\n{stripped}\n\n### Response:\n{resp}{EOS}"
    return {"text": text}

# Replace the instruction/response columns with the tag-free "text" column.
train_text = train_ds.map(
    fmt_no_tag,
    remove_columns=[col for col in train_ds.column_names if col != "text"],
)
dev_text = dev_ds.map(
    fmt_no_tag,
    remove_columns=[col for col in dev_ds.column_names if col != "text"],
)

print(f"Train examples: {len(train_text):,} | Dev examples: {len(dev_text):,}")
# The counters cover the train and dev passes together.
print(
    f"Leading <DIALECT=…> tags removed in formatting: {removed_ct}/{total_ct} "
    f"({(removed_ct/max(1,total_ct))*100:.2f}%)"
)


# Tokenizer of the base model; fall back to the EOS token for padding when
# the tokenizer defines no pad token of its own.
tok = AutoTokenizer.from_pretrained(BASE_MODEL)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Load a fresh copy of the base model in bfloat16 and let device_map="auto"
# decide placement.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# NOTE(review): the KV cache is switched off, presumably because gradient
# checkpointing is enabled in the training config — confirm.
model.config.use_cache = False

class LoggingCallback(TrainerCallback):
    """Print a one-line progress summary for every relevant Trainer log event.

    Each line reports the global step, epoch, train/eval loss (``N/A`` when
    the event does not carry it), learning rate, seconds since the previous
    printed line and minutes since the callback was created.
    """

    def __init__(self):
        # Both timers start when the callback is constructed.
        self.t0 = time.time()
        self.tlast = self.t0

    def on_log(self, args, state, control, logs=None, **kw):
        """Print a summary when ``logs`` carries a loss or a learning rate."""
        # Only the local main process prints, to avoid duplicated lines.
        if not state.is_local_process_zero:
            return
        logs = logs or {}
        if not ("loss" in logs or "eval_loss" in logs or "learning_rate" in logs):
            return
        now = time.time()
        tl = logs.get("loss")
        el = logs.get("eval_loss")
        # state.epoch can be None (e.g. when evaluating before any training
        # step); formatting None with ":.2f" would raise TypeError.
        epoch_txt = f"{state.epoch:.2f}" if state.epoch is not None else "N/A"
        print(
            f"[Step {state.global_step}] epoch={epoch_txt} "
            f"train_loss={tl if tl is not None else 'N/A'} "
            f"eval_loss={el if el is not None else 'N/A'} "
            f"lr={logs.get('learning_rate','?')} "
            f"dt={now-self.tlast:.2f}s elapsed={(now-self.t0)/60:.2f}m",
            flush=True
        )
        self.tlast = now



# Hand the sequence length over through the training config rather than the
# deprecated SFTTrainer keyword, which triggers the "Deprecated positional
# argument(s) used in SFTTrainer" warning. The trainer reads the value from
# its args object, so this carries the same setting.
train_args.max_seq_length = MAX_SEQ_LEN

trainer = SFTTrainer(
    model=model,
    tokenizer=tok,
    train_dataset=train_text,
    eval_dataset=dev_text,
    peft_config=lora_config,
    args=train_args,
)


# Report the trainable-parameter count when the wrapped model offers it.
# A missing method is skipped quietly; a genuine failure inside it is no
# longer swallowed.
_report_params = getattr(trainer.model, "print_trainable_parameters", None)
if callable(_report_params):
    _report_params()

trainer.add_callback(LoggingCallback())
print("Starting NO-TAG training…", flush=True)
trainer.train()


# Persist the trained adapters together with the tokenizer.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
trainer.save_model()
tok.save_pretrained(OUTPUT_DIR)
print("✅ Done. LoRA (NO TAG) adapters saved to:", OUTPUT_DIR)

# Final evaluation on the dev set; perplexity is exp(eval_loss).
metrics = trainer.evaluate()
print("Final evaluation metrics:", metrics)
eval_loss = metrics.get("eval_loss")
if eval_loss is not None:
    # Still best effort (the run config is written after this step), but a
    # failure is now reported instead of being silently discarded.
    try:
        ppl = float(np.exp(eval_loss))
    except (TypeError, ValueError, OverflowError) as err:
        print(f"Perplexity unavailable: {err}")
    else:
        print(f"Perplexity: {ppl:.3f}")

# Record the settings of this run next to the saved adapters.
_config_lines = [
    f"Seed: {SEED}\n",
    f"Base model: {BASE_MODEL}\n",
    f"Train path: {TRAIN_PATH}\n",
    f"Dev path  : {DEV_PATH}\n",
    f"Epochs: {train_args.num_train_epochs}\n",
    f"LR: {train_args.learning_rate}\n",
    f"Per-device batch: {train_args.per_device_train_batch_size}\n",
    f"Grad accum: {train_args.gradient_accumulation_steps}\n",
    f"Max seq len: {MAX_SEQ_LEN}\n",
    f"BF16: {train_args.bf16}\n",
    f"LORA r/alpha/drop: {lora_config.r}/{lora_config.lora_alpha}/{lora_config.lora_dropout}\n",
    f"Transformers: {transformers.__version__}\n",
    f"Torch: {torch.__version__}\n",
]
with open(OUTPUT_DIR / "run_config.txt", "w", encoding="utf-8") as f:
    f.writelines(_config_lines)
print("Saved run_config.txt")


NO-TAG TRAIN — ENVIRONMENT CHECK
Python executable: /usr/bin/python
Python version   : 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
Torch version    : 2.3.1+cu121
Transformers ver : 4.42.3
CUDA available   : True
CUDA device count: 1
  Device 0: NVIDIA RTX 6000 Ada Generation
Datasets version : 2.19.1
PEFT version     : 0.11.1


Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Train examples: 5,122 | Dev examples: 588
Leading <DIALECT=…> tags removed in formatting: 5710/5710 (100.00%)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


trainable params: 79,953,920 || all params: 7,080,513,536 || trainable%: 1.1292
Starting NO-TAG training…




Epoch,Training Loss,Validation Loss
0,3.3985,2.890326
1,2.7313,2.469382
2,2.4491,2.253285
3,2.2087,2.182941
4,2.026,2.142709
5,1.9568,2.123169
6,1.9292,2.115768
8,1.8648,2.111516
9,1.8557,2.111593


[Step 10] epoch=0.81 train_loss=3.3985 eval_loss=N/A lr=4.166666666666667e-05 dt=133.32s elapsed=2.22m
[Step 12] epoch=0.97 train_loss=N/A eval_loss=2.890326499938965 lr=? dt=38.37s elapsed=2.86m




[Step 20] epoch=1.62 train_loss=2.7313 eval_loss=N/A lr=4.9326121764495596e-05 dt=104.38s elapsed=4.60m
[Step 24] epoch=1.94 train_loss=N/A eval_loss=2.4693820476531982 lr=? dt=70.34s elapsed=5.77m




[Step 30] epoch=2.42 train_loss=2.4491 eval_loss=N/A lr=4.665063509461097e-05 dt=72.41s elapsed=6.98m
[Step 37] epoch=2.99 train_loss=N/A eval_loss=2.2532846927642822 lr=? dt=102.31s elapsed=8.69m




[Step 40] epoch=3.23 train_loss=2.2087 eval_loss=N/A lr=4.215604094671835e-05 dt=40.23s elapsed=9.36m
[Step 49] epoch=3.96 train_loss=N/A eval_loss=2.182940721511841 lr=? dt=134.32s elapsed=11.59m




[Step 50] epoch=4.04 train_loss=2.0941 eval_loss=N/A lr=3.621997950501156e-05 dt=8.38s elapsed=11.73m
[Step 60] epoch=4.85 train_loss=2.026 eval_loss=N/A lr=2.9341204441673266e-05 dt=134.76s elapsed=13.98m
[Step 61] epoch=4.93 train_loss=N/A eval_loss=2.142709493637085 lr=? dt=31.66s elapsed=14.51m




[Step 70] epoch=5.66 train_loss=1.9568 eval_loss=N/A lr=2.2097677146869242e-05 dt=110.83s elapsed=16.36m
[Step 74] epoch=5.98 train_loss=N/A eval_loss=2.123169183731079 lr=? dt=63.75s elapsed=17.42m




[Step 80] epoch=6.46 train_loss=1.9292 eval_loss=N/A lr=1.509800584902108e-05 dt=79.14s elapsed=18.74m
[Step 86] epoch=6.95 train_loss=N/A eval_loss=2.1157681941986084 lr=? dt=95.68s elapsed=20.33m




[Step 90] epoch=7.27 train_loss=1.8762 eval_loss=N/A lr=8.930309757836517e-06 dt=46.77s elapsed=21.11m
[Step 99] epoch=8.00 train_loss=N/A eval_loss=2.111159086227417 lr=? dt=127.52s elapsed=23.24m




[Step 100] epoch=8.08 train_loss=1.8661 eval_loss=N/A lr=4.112804714676594e-06 dt=15.01s elapsed=23.49m
[Step 110] epoch=8.89 train_loss=1.8648 eval_loss=N/A lr=1.0502621921127776e-06 dt=134.57s elapsed=25.73m
[Step 111] epoch=8.97 train_loss=N/A eval_loss=2.111515760421753 lr=? dt=24.93s elapsed=26.14m




[Step 120] epoch=9.70 train_loss=1.8557 eval_loss=N/A lr=0.0 dt=117.92s elapsed=28.11m
[Step 120] epoch=9.70 train_loss=N/A eval_loss=2.11159348487854 lr=? dt=8.08s elapsed=28.24m
✅ Done. LoRA (NO TAG) adapters saved to: outputs/allam7b-lora-no-token-10EPOCH


[Step 120] epoch=9.70 train_loss=N/A eval_loss=2.11159348487854 lr=? dt=8.90s elapsed=28.39m
Final evaluation metrics: {'eval_loss': 2.11159348487854, 'eval_runtime': 6.34, 'eval_samples_per_second': 3.628, 'eval_steps_per_second': 0.473, 'epoch': 9.696969696969697}
Perplexity: 8.261
Saved run_config.txt
