## Importing

In [None]:
from tqdm.auto import tqdm
import re, json, csv, numpy as np
from pathlib import Path
from collections import Counter

import torch, sacrebleu
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from bert_score import score as bertscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm.auto import tqdm
import os, re, json, random, csv
import numpy as np
from pathlib import Path

import torch, sacrebleu
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
)
from peft import PeftModel
from bert_score import score as bertscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Logging to HuggingFace

In [None]:
login("Token")


## Quick-run

In [None]:
MAX_EVAL = 0           # 0 = use all test examples; set to e.g. 20/100 for a small run
RANDOM_SAMPLE = True   
BATCH_SIZE_GEN = 1     
BATCH_SIZE_CLF = 64    # CAMeLBERT inference batch size
SEED = 42

## Paths

In [None]:

BASE_MODEL_ID = "ALLaM-AI/ALLaM-7B-Instruct-preview"
TEST_PATH = Path("data_splits/test.jsonl")   # test split (836 items)
OUT_DIR = Path("eval_saudi_only"); OUT_DIR.mkdir(parents=True, exist_ok=True)
PRED_DIR = OUT_DIR / "preds"; PRED_DIR.mkdir(parents=True, exist_ok=True)

MODEL_SPECS = [
    ("allam_base",             None,                                  False),
    ("lora-no-token-15EPOCH",  "outputs/allam7b-lora-no-token-15EPOCH", True),
    ("lora-token-15EPOCH",     "outputs/allam7b-lora-token-15EPOCH",  False),
]

## Data I/O

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

def read_jsonl(p: Path):
    rows=[]
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            if line.strip(): rows.append(json.loads(line))
    return rows

test_rows = read_jsonl(TEST_PATH)
print(f"Loaded test set: {len(test_rows):,}")

if MAX_EVAL and MAX_EVAL > 0 and MAX_EVAL < len(test_rows):
    if RANDOM_SAMPLE:
        rng = random.Random(SEED)
        idxs = sorted(rng.sample(range(len(test_rows)), MAX_EVAL))
        rows_to_use = [test_rows[i] for i in idxs]
    else:
        rows_to_use = test_rows[:MAX_EVAL]
    print(f"Using subset for eval: {len(rows_to_use)} / {len(test_rows)}")
else:
    rows_to_use = test_rows


## Generation function

In [None]:
def build_prompt(instr: str) -> str:
    return f"### Instruction:\n{instr}\n\n### Response:\n"

TAG_RE = re.compile(r'^\s*<\s*DIALECT\s*=\s*(HIJAZI|NAJDI)\s*>\s*', re.IGNORECASE)

def generate_for_model(name: str, adapter_dir: str | None, strip_tag: bool) -> Path:
    print(f"\n==> Generating with: {name}")
    tok = AutoTokenizer.from_pretrained(adapter_dir or BASE_MODEL_ID)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token  # silence pad->eos warnings

    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
    )
    model = base
    if adapter_dir:
        try:
            model = PeftModel.from_pretrained(base, adapter_dir)
        except Exception as e:
            print(f"WARNING: failed to load adapter {adapter_dir}; using base. {e}")

    model.eval()
    preds=[]
    gen_kwargs = dict(
        max_new_tokens=256, do_sample=True, top_p=0.95, top_k=50, temperature=0.6,
        eos_token_id=tok.eos_token_id
    )

    def chunks(lst, n):
        for i in range(0, len(lst), n):
            yield lst[i:i+n]

    with torch.no_grad():
        for batch in tqdm(list(chunks(rows_to_use, BATCH_SIZE_GEN)), desc=f"Generating ({name})", leave=False):
            for ex in batch:
                instr = ex.get("instruction","")
                if strip_tag:
                    instr = TAG_RE.sub("", instr).lstrip()
                inputs = tok(build_prompt(instr), return_tensors="pt").to(model.device)
                out = model.generate(**inputs, **gen_kwargs)
                text = tok.decode(out[0], skip_special_tokens=True)
                resp = text.split("### Response:", 1)[-1].strip()
                preds.append({
                    "prompt": instr,
                    "gold": ex.get("response",""),
                    "pred": resp,
                    "target_dialect": (ex.get("dialect") or ex.get("meta",{}).get("dialect") or "").strip()
                })

    out_path = PRED_DIR / f"{name}.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for r in preds:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"Saved -> {out_path}")
    return out_path

## Evaluation

In [None]:
pred_files=[]
for spec in tqdm(MODEL_SPECS, desc="Models", position=0):
    pred_files.append(generate_for_model(*spec))
    
# =========================
# EVALUATION (full test set)
# Dialect judge: IbrahimAmin/marbertv2-arabic-written-dialect-classifier
# =========================

OUT_DIR = Path("eval_saudi_only")
PRED_DIR = OUT_DIR / "preds"
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- load 5-way written dialect classifier ----------
# Classes: typically {MAGH, LEV, MSA, GLF, EGY}
DID_ID   = "IbrahimAmin/marbertv2-arabic-written-dialect-classifier"
did_tok  = AutoTokenizer.from_pretrained(DID_ID, use_fast=True)
did_model= AutoModelForSequenceClassification.from_pretrained(DID_ID).to(device).eval()
id2label = did_model.config.id2label
label2id = { (v if isinstance(v,str) else v.get("name","")).upper(): int(k) for k,v in id2label.items() }

def read_jsonl(p: Path):
    rows=[]
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            if line.strip(): rows.append(json.loads(line))
    return rows

def camel_probs_batched(texts, bs=64):
    all_probs=[]
    for i in tqdm(range(0, len(texts), bs), desc="Dialect scoring (MARBERTv2)", leave=False):
        chunk = texts[i:i+bs]
        with torch.no_grad():
            batch = did_tok(chunk, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
            logits = did_model(**batch).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            all_probs.append(probs)
    return np.vstack(all_probs) if all_probs else np.zeros((0, len(id2label)))

def labels_from_probs(probs):
    ids = probs.argmax(axis=1)
    out=[]
    for i in ids:
        key = str(i)
        lab = id2label[key] if key in id2label else id2label[i]
        lab = lab if isinstance(lab, str) else lab.get("name","")
        out.append(lab.upper().strip())
    return out

def is_saudi(lbl: str) -> bool:
    return "GLF" in lbl

MSA_IDX = None
for raw, idx in label2id.items():
    if "MSA" in raw:
        MSA_IDX = idx
        break

# ---------- helpers ----------
def tag_echo_rate(texts):
    patt = re.compile(r'<\s*DIALECT\s*=\s*[^>]+>', re.IGNORECASE)
    return 100.0 * float(np.mean([bool(patt.search(t)) for t in texts]))

def diversity_metrics(preds):
    def ngrams(tokens, n): return list(zip(*[tokens[i:] for i in range(n)]))
    total_bi=total_tri=0; uniq_bi=set(); uniq_tri=set()
    for p in preds:
        t=p.split()
        b=ngrams(t,2); g=ngrams(t,3)
        total_bi += max(1,len(b)); total_tri += max(1,len(g))
        uniq_bi.update(b); uniq_tri.update(g)
    d2 = len(uniq_bi)/total_bi if total_bi else 0.0
    d3 = len(uniq_tri)/total_tri if total_tri else 0.0
    if len(preds) < 2:
        sbleu = 0.0
    else:
        scores=[]
        for i in range(len(preds)):
            hyp=[preds[i]]
            refs=[[p for j,p in enumerate(preds) if j!=i]]
            scores.append(sacrebleu.corpus_bleu(hyp, refs).score)
        sbleu = float(np.mean(scores))
    return d2, d3, sbleu

# --- MORE SENSITIVE near-duplicate detector---
_ar_punct = r"[^\w\s\u0600-\u06FF]"
_ar_tatweel = "\u0640"
_ar_diacritics = re.compile(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]")

def normalize_ar(text: str) -> str:
    t = text
    t = t.replace(_ar_tatweel, "")                 
    t = _ar_diacritics.sub("", t)                  
    t = re.sub(r"\s+", " ", t)                     
    t = re.sub(_ar_punct, " ", t)                  
    return t.strip()

def near_duplicate_rate(preds, thr=0.90):
    if len(preds) < 2:
        return 0.0
    norm = [normalize_ar(p) for p in preds]
    vec = TfidfVectorizer(analyzer="word", ngram_range=(1,3), min_df=1)
    X = vec.fit_transform(norm)
    sims = cosine_similarity(X)
    n = X.shape[0]; cnt = 0; denom = n*(n-1)/2
    for i in range(n):
        for j in range(i+1, n):
            if sims[i, j] >= thr:
                cnt += 1
    return 100.0 * cnt / max(1, denom)


## Summarize

In [None]:
summary=[]
for pf in pred_files:  
    name  = Path(pf).stem
    rows  = read_jsonl(pf)
    preds = [r["pred"] for r in rows]
    golds = [r["gold"] for r in rows]

    probs = camel_probs_batched(preds, bs=64)
    labs  = labels_from_probs(probs)
    conf  = probs.max(axis=1)

    print(f"{name} label counts:", Counter(labs))  

    msa_leak   = float(np.mean(probs[:, MSA_IDX]))*100.0 if MSA_IDX is not None else 0.0
    saudi_rate = 100.0*float(np.mean([is_saudi(x) for x in labs]))
    low_conf   = 100.0*float(np.mean(conf < 0.55))
    echo       = tag_echo_rate(preds)

    chrf = sacrebleu.corpus_chrf(preds, [golds]).score
    P,R,F = bertscore(preds, golds, lang="ar", rescale_with_baseline=True)
    bert_f1 = float(F.mean().item())

    d2, d3, sbleu = diversity_metrics(preds)
    near_dup = near_duplicate_rate(preds, thr=0.90)

    with (OUT_DIR / f"{name}_report.json").open("w", encoding="utf-8") as f:
        json.dump({
            "model": name, "n": len(rows),
            "saudi_rate_pct": saudi_rate,
            "msa_leak_pct": msa_leak,
            "low_conf_pct": low_conf,
            "tag_echo_pct": echo,
            "chrF++": chrf,
            "BERTScore_F1": bert_f1,
            "distinct2": d2, "distinct3": d3, "selfBLEU": sbleu,
            "near_duplicate_pct": near_dup
        }, f, ensure_ascii=False, indent=2)

    summary.append([name, round(saudi_rate,2), round(msa_leak,2), round(low_conf,2),
                    round(echo,2), round(chrf,2), round(bert_f1,4),
                    round(d2,4), round(d3,4), round(sbleu,2), round(near_dup,2)])


csv_path = OUT_DIR / "summary_saudi_only.csv"
with csv_path.open("w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["Model","Saudi% (GLF) ↑","MSA leak% ↓","Low-conf% ↓",
                "Tag-echo% ↓","chrF++ ↑","BERTScore F1 ↑",
                "distinct-2 ↑","distinct-3 ↑","Self-BLEU ↓","Near-dup% ↓"])
    for row in summary: w.writerow(row)

print("\n== Summary ==")
for row in summary: print(row)
print(f"\nSaved: {csv_path}")


  from .autonotebook import tqdm as notebook_tqdm


Loaded test set: 836


Models:   0%|          | 0/3 [00:00<?, ?it/s]


==> Generating with: allam_base



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][A
Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:01,  1.15it/s][A
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:01<00:00,  1.20it/s][A
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.26it/s][A

Generating (allam_base):   0%|          | 0/836 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (allam_base):   0%|          | 1/836 [00:07<1:42:18,  7.35s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (allam_base):   0%|          | 2/836 [00:13<1:32:39,  6.67s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (allam_base):   0%|          | 3/836 [00:17<1:13:35,  5.30s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (allam_base):   0%|          | 4/836 [00:21<1:05:39,  4.73s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open

Saved -> eval_saudi_only/preds/allam_base.jsonl

==> Generating with: lora-no-token-15EPOCH



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][A
Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.20s/it][A
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.01s/it][A
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s][A

Generating (lora-no-token-15EPOCH):   0%|          | 0/836 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-no-token-15EPOCH):   0%|          | 1/836 [00:03<42:13,  3.03s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-no-token-15EPOCH):   0%|          | 2/836 [00:05<33:54,  2.44s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-no-token-15EPOCH):   0%|          | 3/836 [00:06<27:57,  2.01s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-no-token-15EPOCH):   0%|          | 4/836 [00:09<30:40,  2.21s/it][ASett

Saved -> eval_saudi_only/preds/lora-no-token-15EPOCH.jsonl

==> Generating with: lora-token-15EPOCH



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s][A
Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.14s/it][A
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:01<00:00,  1.05it/s][A
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.11it/s][A

Generating (lora-token-15EPOCH):   0%|          | 0/836 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-token-15EPOCH):   0%|          | 1/836 [00:03<45:37,  3.28s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-token-15EPOCH):   0%|          | 2/836 [00:06<45:53,  3.30s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-token-15EPOCH):   0%|          | 3/836 [00:08<36:24,  2.62s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.

Generating (lora-token-15EPOCH):   0%|          | 4/836 [00:10<31:16,  2.26s/it][ASetting `pad_token_

Saved -> eval_saudi_only/preds/lora-token-15EPOCH.jsonl


                                                                            

allam_base label counts: Counter({'GLF': 401, 'MSA': 277, 'LEV': 124, 'EGY': 20, 'MAGHREB': 14})


                                                                            

lora-no-token-15EPOCH label counts: Counter({'GLF': 673, 'MSA': 74, 'LEV': 63, 'EGY': 16, 'MAGHREB': 10})


                                                                            

lora-token-15EPOCH label counts: Counter({'GLF': 704, 'LEV': 64, 'MSA': 43, 'EGY': 17, 'MAGHREB': 8})





== Summary ==
['allam_base', 47.97, 32.63, 7.18, 3.11, 21.27, 0.6796, 0.7616, 0.9142, 1.7, 0.0]
['lora-no-token-15EPOCH', 80.5, 9.26, 4.55, 0.0, 23.7, 0.7377, 0.9038, 0.9881, 0.6, 0.0]
['lora-token-15EPOCH', 84.21, 6.21, 4.9, 0.0, 24.8, 0.7386, 0.8875, 0.9838, 0.66, 0.0]

Saved: eval_saudi_only/summary_saudi_only.csv


In [2]:
import pandas as pd
df1 = pd.read_csv("eval_saudi_only/summary_saudi_only.csv")
df1

Unnamed: 0,Model,Saudi% (GLF) ↑,MSA leak% ↓,Low-conf% ↓,Tag-echo% ↓,chrF++ ↑,BERTScore F1 ↑,distinct-2 ↑,distinct-3 ↑,Self-BLEU ↓,Near-dup% ↓
0,allam_base,47.97,32.63,7.18,3.11,21.27,0.6796,0.7616,0.9142,1.7,0.0
1,lora-no-token-15EPOCH,80.5,9.26,4.55,0.0,23.7,0.7377,0.9038,0.9881,0.6,0.0
2,lora-token-15EPOCH,84.21,6.21,4.9,0.0,24.8,0.7386,0.8875,0.9838,0.66,0.0
