In [None]:
!pip install -U transformers datasets accelerate --quiet



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# parse_quran_morph.py
import re, json
from pathlib import Path
from collections import defaultdict

# Input morphology file read by this cell, and the JSONL file it writes.
INPUT, OUT = "quranic-corpus-morphology-0.4.txt", "quran_sentences.jsonl"

# A token row looks like "(location) FORM<TAB>TAG<TAB>FEATURES".
# The three captured groups are the form, the tag and the raw feature string.
_TOKEN_ROW_PATTERN = r'^\([^)]+\)\s+([^\t\s]+)\t([^\t]+)\t(.+)$'
LINE_RE = re.compile(_TOKEN_ROW_PATTERN)

def extract_lem_from_features(feat):
    """Return the value of the ``LEM:`` feature inside ``feat``, or ``None``.

    The value is the run of characters that follows the first usable ``LEM:``
    marker, ending before the next ``|``, whitespace character or ``)``.
    ``None`` is returned when no such non-empty value exists.
    """
    found = re.search(r'LEM:([^|\s)]+)', feat)
    return found.group(1) if found else None

# Location patterns. Entries start with a location such as (1:1:1:1);
# tokens are grouped by the leading chapter:verse pair of that location.
LOC_RE = re.compile(r'^\((\d+:\d+):')  # chapter:verse as one group
LOC_FULL_RE = re.compile(r'^\((\d+):(\d+):')  # chapter and verse as two groups

# (chapter, verse) -> [(form, lemma), ...] in the order the rows appear in INPUT
sent_map = defaultdict(list)

with open(INPUT, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Blank lines and '#' comment lines carry no token.
        if line == "" or line.startswith("#"):
            continue
        m = LINE_RE.match(line)
        if m is None:
            continue
        # The form is stored exactly as it appears in the file.
        form, tag, feats = m.groups()
        lem = extract_lem_from_features(feats)
        # Rows whose location does not parse are bucketed under ("0", "0").
        loc_m = LOC_FULL_RE.match(line)
        chap, verse = loc_m.groups() if loc_m else ("0", "0")
        # Rows without a LEM: feature are dropped.
        if lem is None:
            continue
        sent_map[(chap, verse)].append((form, lem))

# One JSON object per verse, ordered numerically by (chapter, verse).
out = Path(OUT)
with out.open("w", encoding="utf-8") as fo:
    for k in sorted(sent_map, key=lambda cv: tuple(map(int, cv))):
        pairs = sent_map[k]
        toks = [pair[0] for pair in pairs]
        lems = [pair[1] for pair in pairs]
        record = {"tokens": toks, "labels": lems}
        fo.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Wrote", OUT, "with", len(sent_map), "sentences")


Wrote quran_sentences.jsonl with 6216 sentences


In [None]:
# merge_aux_csv.py
import csv, json
from pathlib import Path

CSV = "quran_part3_unique.csv"
Q_SENT = "quran_sentences.jsonl"
OUT_LABEL = "merged_label2id.json"

# Start from every lemma label already present in the sentence file.
labels_set = set()
with open(Q_SENT, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        labels_set.update(json.loads(line)['labels'])

# Add the lemmas listed in the auxiliary CSV.
# newline='' is what the csv module requires of files handed to csv.reader,
# so that newlines embedded in quoted fields are interpreted correctly.
with open(CSV, encoding='utf-8', newline='') as f:
    sample = f.read(2048); f.seek(0)
    try:
        dialect = csv.Sniffer().sniff(sample)
    except csv.Error:
        # The sniffer raises when it cannot determine a delimiter (e.g. a
        # single-column file). Fall back to the default dialect so the
        # single-column branch below can still handle such rows.
        dialect = csv.excel
    reader = csv.reader(f, dialect)
    # NOTE(review): if the CSV has a header row, its lemma column name is added
    # as a label as well — confirm against the actual file.
    for row in reader:
        if not row:
            continue
        if len(row) >= 2:
            # two or more columns: the second one is the lemma
            lemma = row[1].strip()
        else:
            # single column: the last whitespace-separated token is the lemma
            parts = row[0].strip().split()
            if len(parts) < 2:
                continue
            lemma = parts[-1]
        if lemma:
            labels_set.add(lemma)

# Sorting makes the label -> id assignment deterministic.
labels = sorted(labels_set)
label2id = {l: i for i, l in enumerate(labels)}

with open(OUT_LABEL, "w", encoding='utf-8') as f:
    json.dump(label2id, f, ensure_ascii=False, indent=2)

print("Merged labels:", len(labels))


Merged labels: 8102


In [None]:
# split_create.py
import json, random
from pathlib import Path

Q_SENT = "quran_sentences.jsonl"
OUT_DIR = Path("data_combined")
OUT_DIR.mkdir(exist_ok=True)
random.seed(42)  # fixed seed so the shuffle (and therefore the splits) is repeatable

# Read inside a context manager so the file handle is closed deterministically;
# whitespace-only lines are skipped rather than passed to json.loads.
with open(Q_SENT, encoding='utf-8') as f:
    lines = [json.loads(l) for l in f if l.strip()]
random.shuffle(lines)

# 80% train / 10% validation / remainder test.
n = len(lines)
train_end, val_end = int(0.8*n), int(0.9*n)
train = lines[:train_end]
val   = lines[train_end:val_end]
test  = lines[val_end:]

for name,chunk in [("train",train),("val",val),("test",test)]:
    with open(OUT_DIR/f"{name}.jsonl","w",encoding='utf-8') as fo:
        for s in chunk:
            fo.write(json.dumps(s, ensure_ascii=False) + "\n")
print("Created splits in", OUT_DIR)


Created splits in data_combined


In [None]:
# ==============================================
#  Arabert
# ==============================================

!pip install -U transformers datasets accelerate --quiet

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import json
import numpy as np

# -----------------------------
# Load label map
# -----------------------------
with open("merged_label2id.json", encoding="utf-8") as f:
    label2id = json.load(f)
# Inverse lookup: integer id -> label string.
id2label = dict(zip(label2id.values(), label2id.keys()))

print("Loaded label map:", len(label2id))

# -----------------------------
# Load dataset (train/val/test)
# -----------------------------
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        # Whitespace-only lines (e.g. a stray trailing blank line) are skipped
        # instead of being handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Wrap each split file in a datasets.Dataset, in train / val / test order.
train, val, test = (
    Dataset.from_list(load_jsonl(split_path))
    for split_path in (
        "data_combined/train.jsonl",
        "data_combined/val.jsonl",
        "data_combined/test.jsonl",
    )
)

# -----------------------------
# Choose model
# -----------------------------
model_name = "aubmindlab/bert-base-arabertv02"

# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# -----------------------------
# Tokenize + Align Labels
# -----------------------------
def tokenize_and_align(batch):
    """Tokenize a batch of pre-split sentences and align label ids to sub-tokens.

    ``batch["tokens"]`` holds one list of words per example and
    ``batch["labels"]`` the matching list of label strings. Every example is
    truncated/padded to 128 positions. Only the first sub-token of each word
    receives that word's id from ``label2id``; special/padding positions and
    the remaining sub-tokens of a word receive -100.

    Returns the tokenizer encoding with a ``"labels"`` entry added.
    """
    enc = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    def first_piece_label_ids(word_ids, word_labels):
        # Walk the sub-token -> word index mapping of a single example.
        ids = []
        last_wid = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_wid
            ids.append(label2id[word_labels[wid]] if starts_new_word else -100)
            last_wid = wid
        return ids

    enc["labels"] = [
        first_piece_label_ids(enc.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(batch["labels"])
    ]
    return enc

# Tokenize every split; the raw text columns are dropped once encoded.
train_tok, val_tok, test_tok = (
    split.map(tokenize_and_align, batched=True, remove_columns=["tokens", "labels"])
    for split in (train, val, test)
)


# -----------------------------
# Load Model
# -----------------------------
# The classification head gets one output per entry of the label map.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# -----------------------------
# TrainingArguments
# -----------------------------
# Evaluate and checkpoint once per epoch, keeping only the latest checkpoint.
# (The original note marks `eval_strategy` as the valid name in transformers 4.57.3.)
training_config = dict(
    output_dir="lemmatizer_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100,
    save_total_limit=1,
)
args = TrainingArguments(**training_config)

# -----------------------------
# Metrics
# -----------------------------
def compute_metrics(eval_pred):
    """Compute accuracy over the positions that carry a real label.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the class dimension last; ``labels`` holds
            integer label ids, with -100 marking positions to skip.

    Returns:
        ``{"accuracy": float}`` — the share of non-(-100) positions whose
        arg-max class equals the label, or 0.0 when there is no such position
        (instead of a NaN plus a RuntimeWarning).
    """
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=-1)
    labels = np.asarray(labels)

    # A boolean mask replaces the per-element Python loop.
    scored = labels != -100
    n_scored = int(scored.sum())
    if n_scored == 0:
        return {"accuracy": 0.0}

    n_correct = int((preds[scored] == labels[scored]).sum())
    return {"accuracy": n_correct / n_scored}


# -----------------------------
# Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    # `tokenizer=` is deprecated on Trainer.__init__ (it triggers a warning and
    # is slated for removal); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# -----------------------------
# Train
# -----------------------------
trainer.train()

# Save the model weights and the tokenizer side by side in one directory.
trainer.save_model("lemmatizer_final")
tokenizer.save_pretrained("lemmatizer_final")

print("\nTraining complete! Model saved to lemmatizer_final/")

# -----------------------------
# Held-out evaluation
# -----------------------------
# The test split is tokenized above but was never scored; report it once,
# after training, under a "test_" metric prefix.
test_metrics = trainer.evaluate(test_tok, metric_key_prefix="test")
print("Test metrics:", test_metrics)


Loaded label map: 8102


Map:   0%|          | 0/4972 [00:00<?, ? examples/s]

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,3.0081,2.559235,0.671769
2,1.8833,1.71696,0.779511
3,1.3544,1.373241,0.830291
4,1.0759,1.196966,0.854136
5,0.8922,1.103389,0.864145
6,0.809,1.052548,0.871357
7,0.7483,1.03904,0.873271



Training complete! Model saved to lemmatizer_final/
