## Imports

In [None]:
from tqdm.auto import tqdm
import re, json, csv, numpy as np
from pathlib import Path
from collections import Counter

import torch, sacrebleu
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from bert_score import score as bertscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm.auto import tqdm
import os, re, json, random, csv
import numpy as np
from pathlib import Path

import torch, sacrebleu
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
)
from peft import PeftModel
from bert_score import score as bertscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Logging in to Hugging Face


In [None]:
# Authenticate with the Hugging Face Hub without embedding a credential in the
# notebook: use the HF_TOKEN environment variable when set, otherwise prompt
# for the token interactively so it never lands in the saved file.
import os
from getpass import getpass

from huggingface_hub import login

login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Config

In [None]:
# Model under evaluation.
MODEL_ID = "FreedomIntelligence/AceGPT-v2-8B-Chat"

# Output folders: reports go in OUT_DIR, raw generations in PRED_DIR.
OUT_DIR = Path("eval_external_acegpt_v2_8b")
OUT_DIR.mkdir(parents=True, exist_ok=True)
PRED_DIR = OUT_DIR / "preds"
PRED_DIR.mkdir(parents=True, exist_ok=True)

# Held-out split that is read with read_jsonl.
TEST_PATH = Path("data_splits/test.jsonl")

# Keyword arguments forwarded to model.generate (sampling is enabled).
GEN_KW = {
    "max_new_tokens": 256,
    "do_sample": True,
    "top_p": 0.95,
    "top_k": 50,
    "temperature": 0.6,
}

# When True, load_model requests 4-bit quantized weights.
USE_4BIT = False

# Seed applied to the Python, NumPy and torch RNGs.
SEED = 42

## Loading Data

In [None]:
# Seed every RNG used downstream so sampled generations are repeatable.
random.seed(SEED)
np.random.seed(SEED)
# torch.manual_seed covers the default (CPU) generator, which the CUDA-only
# call below leaves unseeded; without it a CPU run samples non-reproducibly.
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Device string used for the auxiliary models in this notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"

def read_jsonl(p):
    """Parse a JSON Lines file into a list of records.

    Args:
        p: Object exposing ``open(mode, encoding=...)`` (a ``pathlib.Path``).

    Returns:
        list: One ``json.loads`` result per line that is not blank after
        stripping whitespace, in file order.
    """
    with p.open("r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

# Load the evaluation split from TEST_PATH; the bare expression below makes the
# cell display how many records were read.
test_rows = read_jsonl(TEST_PATH)
len(test_rows)


836

## Generation 

In [None]:
def apply_chat_template_safe(tok, user_text):
    """Render ``user_text`` as a single-turn prompt string.

    Uses the tokenizer's own chat template when one is configured; otherwise
    falls back to a plain ``### Instruction`` / ``### Response`` layout.

    Args:
        tok: Tokenizer-like object. ``chat_template`` and
            ``apply_chat_template`` are both optional.
        user_text: Content of the single user message.

    Returns:
        str: The untokenized prompt, ending where the model's answer starts.
    """
    # getattr (rather than direct attribute access) keeps the fallback reachable
    # for tokenizers that expose apply_chat_template but no chat_template
    # attribute, which previously raised AttributeError.
    template = getattr(tok, "chat_template", None)
    if template and hasattr(tok, "apply_chat_template"):
        msgs = [{"role": "user", "content": user_text}]
        return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    return f"### Instruction:\n{user_text}\n\n### Response:\n"

def load_model(model_id):
    """Load the tokenizer and causal LM for ``model_id`` in eval mode.

    Args:
        model_id: Hub identifier or local path passed to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)``. The tokenizer is guaranteed to have a
        pad token (the EOS token is reused when none is defined).
    """
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    kwargs = dict(
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
    )
    if USE_4BIT:
        # Imported here because the notebook's import cell does not bring
        # BitsAndBytesConfig into scope.
        from transformers import BitsAndBytesConfig

        # Only quantization_config is passed: supplying load_in_4bit=True as a
        # separate kwarg alongside it is rejected by from_pretrained.
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True
        )
    model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs).eval()
    return tok, model

tok, model = load_model(MODEL_ID)
model_name = MODEL_ID.split('/')[-1]

# Generate one response per test record and keep it next to the gold answer.
preds = []
for ex in tqdm(test_rows, desc=f"Generating: {model_name}"):
    instr = ex.get("instruction", "")
    prompt = apply_chat_template_safe(tok, instr)

    # A rendered chat template normally already starts with the BOS token;
    # tokenizing it with special tokens enabled would prepend a second one.
    has_bos = bool(tok.bos_token) and prompt.startswith(tok.bos_token)
    inputs = tok(prompt, return_tensors="pt", add_special_tokens=not has_bos).to(model.device)

    with torch.no_grad():
        # pad_token_id is passed explicitly so generate() does not log the
        # "Setting `pad_token_id` to `eos_token_id`" notice on every example.
        # NOTE(review): eos_token_id is kept as in the original call; confirm
        # it matches the end-of-turn token this chat model actually emits.
        out = model.generate(
            **inputs, **GEN_KW,
            eos_token_id=tok.eos_token_id,
            pad_token_id=tok.pad_token_id,
        )

    # generate() returns prompt + continuation for causal LMs. Decode only the
    # new tokens; otherwise the instruction (including any dialect tag it
    # carries) is copied into the prediction whenever a chat template is used,
    # because the "### Response:" marker is then absent from the text.
    prompt_len = inputs["input_ids"].shape[1]
    resp = tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

    meta = ex.get("meta") or {}  # tolerate a missing or null "meta" field
    preds.append({
        "prompt": instr, "gold": ex.get("response", ""), "pred": resp,
        "target_dialect": (ex.get("dialect") or meta.get("dialect") or "").strip()
    })

# Persist predictions as JSON Lines, one record per test example.
out_path = PRED_DIR / f"{model_name}.jsonl"
with out_path.open("w", encoding="utf-8") as f:
    for r in preds:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")
out_path


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading shards: 100%|██████████| 4/4 [02:35<00:00, 38.96s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.66s/it]
Generating: AceGPT-v2-8B-Chat:   0%|          | 0/836 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating: AceGPT-v2-8B-Chat:   0%|          | 1/836 [00:08<1:57:32,  8.45s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating: AceGPT-v2-8B-Chat:   0%|          | 2/836 [00:14<1:34:55,  6.83s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating: AceGPT-v2-8B-Chat:   0%|          | 3/836 [00:16<1:08:19,  4.92s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Generating: AceGPT-v2-8B-Chat:   0%|          | 4/836 [00:23<1:19:16,  5.72s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end 

PosixPath('eval_external_acegpt_v2_8b/preds/AceGPT-v2-8B-Chat.jsonl')

## Evaluation

In [None]:
# Output locations, re-declared in this cell and created if missing.
OUT_DIR = Path("eval_external_acegpt_v2_8b")
PRED_DIR = OUT_DIR / "preds"
OUT_DIR.mkdir(parents=True, exist_ok=True)
PRED_DIR.mkdir(parents=True, exist_ok=True)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ---------- load dialect classifier ----------
# Labels seen in this notebook's recorded output: GLF, MSA, LEV, MAGHREB, EGY.
DID_ID = "IbrahimAmin/marbertv2-arabic-written-dialect-classifier"
did_tok = AutoTokenizer.from_pretrained(DID_ID, use_fast=True)
did_model = AutoModelForSequenceClassification.from_pretrained(DID_ID)
did_model = did_model.to(device).eval()

# id2label comes from the model config; label2id is its upper-cased inverse.
# Entries that are not plain strings are expected to be mappings with a "name".
id2label = did_model.config.id2label
label2id = dict(
    ((entry if isinstance(entry, str) else entry.get("name", "")).upper(), int(idx))
    for idx, entry in id2label.items()
)

def read_jsonl(p: Path):
    """Read a UTF-8 JSON Lines file.

    Args:
        p: Path of the file to read.

    Returns:
        list: Parsed JSON value for each line that is not whitespace-only,
        in the order the lines appear.
    """
    records = []
    with p.open("r", encoding="utf-8") as fh:
        records.extend(json.loads(ln) for ln in fh if ln.strip())
    return records

def camel_probs_batched(texts, bs=64):
    """Score texts with the dialect classifier and return class probabilities.

    Texts are tokenized with ``did_tok`` (padded, truncated to 128 tokens) and
    passed through ``did_model`` in slices of ``bs``; logits are softmaxed over
    the last axis.

    Args:
        texts: Sliceable sequence of strings.
        bs: Number of texts per forward pass.

    Returns:
        numpy.ndarray: Row-stacked probabilities, one row per text. For empty
        input, an array of shape ``(0, len(id2label))``.
    """
    batch_starts = range(0, len(texts), bs)
    collected = []
    with torch.no_grad():
        for start in tqdm(batch_starts, desc="Dialect scoring (MARBERTv2)", leave=False):
            encoded = did_tok(
                texts[start:start + bs],
                padding=True, truncation=True, max_length=128, return_tensors="pt",
            ).to(device)
            logits = did_model(**encoded).logits
            collected.append(torch.softmax(logits, dim=-1).cpu().numpy())
    if not collected:
        return np.zeros((0, len(id2label)))
    return np.vstack(collected)

def labels_from_probs(probs):
    """Map each probability row to the upper-cased name of its top class.

    Args:
        probs: 2-D array; ``argmax(axis=1)`` picks the class index per row.

    Returns:
        list[str]: Label per row, upper-cased and stripped.
    """
    def _label_for(idx):
        # id2label may be keyed by the string form of the index or by the
        # index itself; try the string form first.
        as_text = str(idx)
        entry = id2label[as_text] if as_text in id2label else id2label[idx]
        if not isinstance(entry, str):
            entry = entry.get("name", "")
        return entry.upper().strip()

    return [_label_for(idx) for idx in probs.argmax(axis=1)]

def is_saudi(lbl: str) -> bool:
    """Return True when the label contains the substring ``GLF``.

    NOTE(review): GLF is presumably the classifier's Gulf class, used here as a
    stand-in for Saudi dialect; confirm that this proxy is intended.
    """
    contains_gulf = "GLF" in lbl
    return contains_gulf

# Index of the first classifier label whose name contains "MSA"; None when the
# label set has no such class.
MSA_IDX = next((idx for raw, idx in label2id.items() if "MSA" in raw), None)

# ---------- helpers ----------
def tag_echo_rate(texts):
    """Percentage of texts that contain a ``<DIALECT=...>`` style tag.

    The match is case-insensitive and tolerates whitespace around ``DIALECT``
    and ``=``.

    Args:
        texts: Sequence of strings to scan.

    Returns:
        float: Value in ``[0, 100]``; ``0.0`` for empty input (previously NaN,
        since the mean of an empty list is undefined).
    """
    if len(texts) == 0:
        return 0.0
    patt = re.compile(r'<\s*DIALECT\s*=\s*[^>]+>', re.IGNORECASE)
    return 100.0 * float(np.mean([bool(patt.search(t)) for t in texts]))

def diversity_metrics(preds):
    """Compute distinct-2, distinct-3 and Self-BLEU over a set of outputs.

    Args:
        preds: Sequence of generated strings (split on whitespace for n-grams).

    Returns:
        tuple: ``(distinct2, distinct3, self_bleu)``. The distinct-n values are
        unique n-grams divided by the n-gram total, where every text
        contributes at least 1 to the total even if it is too short to hold an
        n-gram. Self-BLEU is the mean BLEU of each output against all other
        outputs as references, and is ``0.0`` when fewer than two outputs exist.
    """
    def ngrams(tokens, n):
        return list(zip(*[tokens[i:] for i in range(n)]))

    total_bi = total_tri = 0
    uniq_bi, uniq_tri = set(), set()
    for p in preds:
        toks = p.split()
        bi, tri = ngrams(toks, 2), ngrams(toks, 3)
        total_bi += max(1, len(bi))
        total_tri += max(1, len(tri))
        uniq_bi.update(bi)
        uniq_tri.update(tri)
    d2 = len(uniq_bi) / total_bi if total_bi else 0.0
    d3 = len(uniq_tri) / total_tri if total_tri else 0.0

    if len(preds) < 2:
        return d2, d3, 0.0

    scores = []
    for i, hyp in enumerate(preds):
        # sacreBLEU expects a list of reference *streams*, each parallel to the
        # hypothesis list. With a single hypothesis, every other output must be
        # its own one-element stream; passing them as one long stream would
        # pair the hypothesis with only the first of them.
        # This is quadratic in len(preds) by construction.
        refs = [[other] for j, other in enumerate(preds) if j != i]
        scores.append(sacrebleu.corpus_bleu([hyp], refs).score)
    return d2, d3, float(np.mean(scores))

# Anything that is not a word character, whitespace, or in U+0600-U+06FF.
# Note that this range also keeps Arabic punctuation such as U+060C.
_ar_punct = r"[^\w\s\u0600-\u06FF]"
# Tatweel (kashida) elongation character.
_ar_tatweel = "\u0640"
# Combining marks in the listed Arabic ranges (harakat and Quranic annotations).
_ar_diacritics = re.compile(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]")

def normalize_ar(text: str) -> str:
    """Normalize Arabic text for surface-level comparison.

    Removes tatweel and diacritics, replaces characters matched by
    ``_ar_punct`` with a space, collapses whitespace runs to a single space
    and strips the ends.

    Args:
        text: Input string.

    Returns:
        str: The normalized string.
    """
    t = text.replace(_ar_tatweel, "")
    t = _ar_diacritics.sub("", t)
    t = re.sub(_ar_punct, " ", t)
    # Collapse whitespace last: punctuation removal introduces spaces, so doing
    # it earlier left runs of spaces in the result.
    t = re.sub(r"\s+", " ", t)
    return t.strip()

def near_duplicate_rate(preds, thr=0.90):
    """Percentage of output pairs whose TF-IDF cosine similarity is >= ``thr``.

    Each text is normalized with ``normalize_ar`` and vectorized with word
    1- to 3-gram TF-IDF; every unordered pair is then compared.

    Args:
        preds: Sequence of generated strings.
        thr: Similarity threshold at or above which a pair is a near-duplicate.

    Returns:
        float: ``100 * near_duplicate_pairs / total_pairs``; ``0.0`` when fewer
        than two texts are given.
    """
    if len(preds) < 2:
        return 0.0
    norm = [normalize_ar(p) for p in preds]
    vec = TfidfVectorizer(analyzer="word", ngram_range=(1,3), min_df=1)
    X = vec.fit_transform(norm)
    sims = cosine_similarity(X)
    n = X.shape[0]
    # Count qualifying pairs on the strict upper triangle in one vectorised
    # step instead of a Python double loop; the count is identical.
    upper = np.triu_indices(n, k=1)
    cnt = int(np.count_nonzero(sims[upper] >= thr))
    denom = n*(n-1)/2
    return 100.0 * cnt / max(1, denom)

# Always rediscover prediction files from disk. Reusing a `pred_files` variable
# that happens to be left in the kernel made the evaluated set depend on hidden
# state from earlier runs instead of on what is actually in PRED_DIR.
pred_files = sorted(PRED_DIR.glob("*.jsonl"))

if not pred_files:
    raise FileNotFoundError(f"No prediction files found in {PRED_DIR}. Make sure generation saved *.jsonl there.")


## Summarize

In [None]:

# Score every prediction file, write a per-model JSON report and collect one
# rounded row per model for the CSV summary.
summary = []
for pf in pred_files:
    pf = Path(pf)
    name = pf.stem
    rows = read_jsonl(pf)
    if not rows:
        # Nothing to score; the metrics below would be NaN or fail on empty input.
        print(f"{name}: no predictions found, skipping.")
        continue
    preds = [r["pred"] for r in rows]
    golds = [r["gold"] for r in rows]

    # Dialect classifier outputs: per-class probabilities, top label, top probability.
    probs = camel_probs_batched(preds, bs=64)
    labs = labels_from_probs(probs)
    conf = probs.max(axis=1)

    print(f"{name} label counts:", Counter(labs))

    # msa_leak is the mean probability mass on the MSA class (as a percentage),
    # not the share of outputs labelled MSA.
    msa_leak = float(np.mean(probs[:, MSA_IDX]))*100.0 if (MSA_IDX is not None and probs.shape[1] > MSA_IDX) else 0.0
    saudi_rate = 100.0*float(np.mean([is_saudi(x) for x in labs]))
    low_conf = 100.0*float(np.mean(conf < 0.55))  # top-class probability below 0.55
    echo = tag_echo_rate(preds)

    # word_order=2 selects chrF++ (character n-grams plus word 1-2-grams); the
    # default word_order=0 is plain chrF, which did not match the "chrF++"
    # label used in the report and CSV.
    chrf = sacrebleu.corpus_chrf(preds, [golds], word_order=2).score
    P, R, F = bertscore(preds, golds, lang="ar", rescale_with_baseline=True, device=device)
    bert_f1 = float(F.mean().item())

    d2, d3, sbleu = diversity_metrics(preds)
    near_dup = near_duplicate_rate(preds, thr=0.90)

    with (OUT_DIR / f"{name}_report.json").open("w", encoding="utf-8") as f:
        json.dump({
            "model": name, "n": len(rows),
            "saudi_rate_pct": saudi_rate,
            "msa_leak_pct": msa_leak,
            "low_conf_pct": low_conf,
            "tag_echo_pct": echo,
            "chrF++": chrf,
            "BERTScore_F1": bert_f1,
            "distinct2": d2, "distinct3": d3, "selfBLEU": sbleu,
            "near_duplicate_pct": near_dup
        }, f, ensure_ascii=False, indent=2)

    summary.append([
        name, round(saudi_rate,2), round(msa_leak,2), round(low_conf,2),
        round(echo,2), round(chrf,2), round(bert_f1,4),
        round(d2,4), round(d3,4), round(sbleu,2), round(near_dup,2)
    ])

# Write the collected rows to a CSV (header first), then echo them to the cell
# output together with the file location.
csv_path = OUT_DIR / "summary_saudi_only.csv"
header = [
    "Model","Saudi% (GLF) ↑","MSA leak% ↓","Low-conf% ↓",
    "Tag-echo% ↓","chrF++ ↑","BERTScore F1 ↑",
    "distinct-2 ↑","distinct-3 ↑","Self-BLEU ↓","Near-dup% ↓"
]
with csv_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(summary)

print("\n== Summary ==")
for record in summary:
    print(record)
print(f"\nSaved: {csv_path}")


                                                                            

AceGPT-v2-8B-Chat label counts: Counter({'GLF': 568, 'MSA': 183, 'LEV': 79, 'MAGHREB': 3, 'EGY': 3})





== Summary ==
['AceGPT-v2-8B-Chat', 67.94, 22.02, 6.94, 83.97, 21.59, 0.6688, 0.7902, 0.9409, 0.31, 0.0]

Saved: eval_external_acegpt_v2_8b/summary_saudi_only.csv
