In [None]:
from core.helpers import display_verses_with_codepoints
# Pass two code points, given as hex strings, to the project helper.
# NOTE(review): presumably this lists the verses that contain U+06D7 / U+06DA —
# confirm against core.helpers.
display_verses_with_codepoints(["0x06D7", "0x06DA"])

In [None]:
from core.phonemizer import Phonemizer

# Phonemize the same reference twice with different `stops` settings so the
# two phoneme strings can be compared by eye.
pm = Phonemizer()
ref = "44:43 - 44:44"

# Run 1: stops=["verse"] — print the text, then its phoneme string.
res = pm.phonemize(ref, stops=["verse"])
print(
    res.text,
    res.phonemes_str(phoneme_sep="", word_sep=" ", verse_sep=""),
    sep="\n",
)

# Run 2: no stops — phoneme string only.
res = pm.phonemize(ref, stops=[])
print(res.phonemes_str(phoneme_sep="", word_sep=" ", verse_sep=""))

In [None]:
# --- CONFIG -----------------------------------------------------------
# Path of the JSON file this cell loads; its values are per-word records
# (the scan below reads their "word_index", "text" and "location" keys).
QURAN_JSON = "data/Quran.json"        # ↙ update if the file lives elsewhere
# ---------------------------------------------------------------------

import json, re, unicodedata
from pathlib import Path
import pandas as pd

# Matches opening and closing <rule …> tags (case-insensitive).
TAG_RE = re.compile(r"</?rule[^>]*?>", flags=re.IGNORECASE)
# Character class spliced into the per-pair regex built in the scan loop below.
HARAKAT_RNG = '[\u064B-\u0650]'          # fatḥatan–kasra  (U+064B..U+0650)
SHADDA = '\u0651'
LAM    = '\u0644'    # defined here but not used by the code in this cell
MEEM   = '\u0645'    # word-final meem is excluded from the pair scan
NUN    = '\u0646'    # word-final nun is excluded from the pair scan

import unicodedata, re

COMBINING = re.compile(r'[\u064B-\u0652\u0651]')      # tanwīn + harakāt + sukūn + shadda

def last_bare_consonant(word: str) -> str | None:
    """
    Return the last consonant **only if** it is not followed by
    ANY Arabic combining mark (tanwīn, short vowel, sukūn, shadda).
    Otherwise return None.
    """
    # walk backwards through the string
    for i in range(len(word) - 1, -1, -1):
        ch = word[i]
        if '\u0621' <= ch <= '\u064A':            # Arabic base letter
            if i == len(word) - 1 or not COMBINING.match(word[i + 1]):
                return ch                         # bare consonant found
            return None                           # consonant has a mark → reject
    return None                                   # no consonant at all


def strip_tags(text: str) -> str:
    """Return ``text`` with every <rule …> / </rule> tag removed.

    ``text`` may also be a list of string segments, which is concatenated
    before the tags are stripped.
    """
    joined = "".join(text) if isinstance(text, list) else text
    return TAG_RE.sub("", joined)

def base_letters(word: str) -> str:
    """Return ``word`` without any character whose Unicode combining class is non-zero."""
    kept = []
    for char in word:
        if unicodedata.combining(char) == 0:
            kept.append(char)
    return "".join(kept)

def last_base_letter(word: str) -> str | None:
    """Return the right-most letter in U+0621–U+064A once marks are removed, else None."""
    for char in reversed(base_letters(word)):
        if "\u0621" <= char <= "\u064A":
            return char
    return None

# ---------------------------------------------------------------------
# Load the word records and sort them by running index so that neighbours
# in the list are neighbouring words in the text.
data = json.loads(
    Path(QURAN_JSON).read_text(encoding="utf-8")
)
words = sorted(data.values(), key=lambda rec: rec["word_index"])

pairs: list[tuple[str, str, str, str]] = []

for w1, w2 in zip(words[:-1], words[1:]):
    txt1 = strip_tags(w1["text"])
    txt2 = strip_tags(w2["text"])
    last = last_bare_consonant(txt1)
    # No bare final letter, or a final meem / nun: not a candidate.
    if last in (None, MEEM, NUN):
        continue
    # Candidate only if the next word opens with: same letter + shadda + vowel.
    if re.match(re.escape(last) + SHADDA + HARAKAT_RNG, txt2):
        pairs.append((w1["location"], txt1, w2["location"], txt2))

# --- Display ----------------------------------------------------------
df = pd.DataFrame(pairs, columns=["Ref-A", "Word-A", "Ref-B", "Word-B"])
print(f"Found {len(df)} cross-word Idghām Mutamāthilayn hits")

# --- Dump to a plain-text file --------------------------------------
# One line per hit: refA<TAB>wordA<TAB>→<TAB>refB<TAB>wordB
outfile = "data/mutamathilayn.txt"

# Create the target directory first so open() cannot fail on a missing folder.
Path(outfile).parent.mkdir(parents=True, exist_ok=True)
with open(outfile, "w", encoding="utf-8") as fh:
    # `pairs` holds exactly the rows of `df`, so write from it directly.
    for ref_a, word_a, ref_b, word_b in pairs:
        fh.write(f"{ref_a}\t{word_a}\t→\t{ref_b}\t{word_b}\n")

print(f"Wrote {len(pairs)} pairs to {outfile}")

# Must be the last expression of the cell for Jupyter to render the table.
df


In [None]:
# === Cell : tag *first* consonant of cross-word Idghām Mutamāthilayn Ṣaghīr ======

"""
• Reads  : data/Quran.json
• Needs  : data/mutamathilayn.txt   (refA<TAB>wordA<TAB>→<TAB>refB<TAB>wordB)
• Writes : data/Quran_v22.json
   – Adds <rule class=idgham_mutamathilayn>…</rule> around the final consonant
     of word-A only (pairs ending in meem or nun were excluded when the list was built).
"""

import json, re, unicodedata
from copy import deepcopy
from pathlib import Path

QURAN_IN   = "data/Quran.json"          # word records to copy and tag
PAIRS_IN   = "data/mutamathilayn.txt"   # tab-separated pair list, five fields per line
QURAN_OUT  = "data/Quran_v22.json"      # tagged copy written at the end of this cell

# Matches opening and closing <rule …> tags; not referenced again in this cell.
TAG_RE = re.compile(r"</?rule[^>]*?>", flags=re.I)

# ------------------------------------------------------------------ helpers
def last_bare_idx(text: str) -> int | None:
    """index of last consonant with **no** following mark; honours rule-tags."""
    i = len(text) - 1
    while i >= 0:
        if text[i] == ">":                 # walk back over a tag
            i = text.rfind("<", 0, i) - 1
            continue
        ch = text[i]
        if "\u0621" <= ch <= "\u064A":     # Arabic base letter
            j = i + 1
            while j < len(text) and unicodedata.combining(text[j]):
                j += 1
            if j >= len(text) or text[j] == "<":     # nothing after → bare
                return i
        i -= 1
    return None

def tag_word_a(rec: dict) -> None:
    """
    Wrap one letter of ``rec["text"]`` in ``<rule class=idgham_mutamathilayn>``.

    The letter is the one located by ``last_bare_idx``; when that returns
    None the record is left untouched.  ``rec`` is modified in place.
    """
    original = rec["text"]
    pos = last_bare_idx(original)
    if pos is None:
        return
    head, letter, tail = original[:pos], original[pos], original[pos + 1:]
    rec["text"] = (
        head + "<rule class=idgham_mutamathilayn>" + letter + "</rule>" + tail
    )

# ------------------------------------------------------------------ main
quran = json.loads(Path(QURAN_IN).read_text(encoding="utf-8"))

# Collect (ref of word-A, ref of word-B) from every non-blank line of the list.
pairs: list[tuple[str, str]] = []
with open(PAIRS_IN, encoding="utf-8") as fh:
    for ln in fh:
        if not ln.strip():
            continue
        # Exactly five tab-separated fields are expected per line.
        ref_a, _, _, ref_b, _ = ln.rstrip("\n").split("\t")
        pairs.append((ref_a, ref_b))

# Tag a deep copy so the loaded records stay as read; only word-A is modified.
quran_out = deepcopy(quran)
for ref_a, _ in pairs:
    tag_word_a(quran_out[ref_a])

with open(QURAN_OUT, "w", encoding="utf-8") as fh:
    fh.write(json.dumps(quran_out, ensure_ascii=False, indent=2))

print(f"✓ Tagged {len(pairs)} first-consonants → {QURAN_OUT}")


In [None]:
# === Cell : detect Lam Shamsiyyah (silent article-lam) ==========================

"""
Criteria:
  • LAM (ل) with *no* sukūn / vowel mark
  • immediately followed by a SUN-LETTER (ت ث د ذ ر ز س ش ص ض ط ظ ل ن);
    in vocalised text that letter carries a SHADDA, but the scan below
    does not test for the shadda itself
This cell:
  1. loads data/Quran.json
  2. finds every word that meets the pattern
  3. writes a tab-separated list (reference<TAB>word) to data/lam_shams.txt
  4. shows the first 20 rows as a DataFrame
"""

import json, re, unicodedata, pandas as pd
from pathlib import Path

# ── paths ───────────────────────────────────────────────────────────────────
IN_JSON = "data/Quran.json"      # word records scanned by this cell
OUT_TXT = "data/lam_shams.txt"   # output list, one "reference<TAB>word" line per hit

# ── glyphs & helpers ────────────────────────────────────────────────────────
LAM        = "\u0644"
SHADDA     = "\u0651"            # defined here but not used by the code in this cell
SUN        = set("تثدذرزسشصضطظللن")  # sun letters; the doubled ل collapses inside the set

# COMB_RE is defined here but not used by the code in this cell.
COMB_RE = re.compile(r"[\u064B-\u0652\u0651]")       # all harakāt + sukūn + shadda
# Matches opening and closing <rule …> tags (case-insensitive).
TAG_RE  = re.compile(r"</?rule[^>]*?>", flags=re.I)

def strip_tags(x: str) -> str:
    """Return ``x`` without <rule …> tags; a list of segments is joined first."""
    if isinstance(x, list):
        x = "".join(x)
    return TAG_RE.sub("", x)

def has_lam_shamsi(word: str) -> bool:
    """
    Report whether ``word`` contains a lam that is directly followed by a
    sun letter.

    ``word`` is first passed through ``strip_tags``.  Because the character
    right after the lam must itself be a base letter from ``SUN``, the lam
    necessarily carries no combining mark.  The shadda on the sun letter is
    not checked.

    Any ``<…>`` span that survives tag stripping is skipped.  An unterminated
    ``<`` ends the scan: a failed ``find()`` returns -1, which would otherwise
    reset the index to 0 and loop forever.
    """
    w = strip_tags(word)
    n = len(w)
    i = 0
    while i < n:
        ch = w[i]
        if ch == "<":
            close = w.find(">", i)
            if close == -1:
                break                      # malformed tail: nothing left to scan
            i = close + 1
            continue
        if ch == LAM and i + 1 < n:
            nxt = w[i + 1]
            if not unicodedata.combining(nxt) and nxt in SUN:
                return True
        i += 1
    return False

# ── scan every word record ─────────────────────────────────────────────────
data = json.loads(Path(IN_JSON).read_text(encoding="utf-8"))
hits = [
    (ref, strip_tags(rec["text"]))
    for ref, rec in data.items()
    if has_lam_shamsi(rec["text"])
]

print(f"Found {len(hits)} lam-shamsiyyah words")

# write file: one "reference<TAB>word" line per hit
Path("data").mkdir(exist_ok=True)
with open(OUT_TXT, "w", encoding="utf-8") as f:
    f.writelines(f"{ref}\t{word}\n" for ref, word in hits)
print(f"Saved list → {OUT_TXT}")

# preview (last expression of the cell, so it is rendered)
pd.DataFrame(hits, columns=["Reference", "Word"]).head(20)


In [None]:
# === Cell : wrap article-lam with <rule class=laam_shamsiyah> ====================

"""
• Reads  : data/Quran_v2.json  +  data/lam_shams.txt (list from earlier cell)
• Writes : data/Quran_v2_lam_shams.json
   – Inserts <rule class=laam_shamsiyah>…</rule> around the **lam**
   – Leaves words that already carry that tag unchanged (they still count as tagged)
"""

import json, re, unicodedata
from copy import deepcopy
from pathlib import Path

# — paths —
IN_JSON  = "data/Quran_v2.json"             # word records to copy and tag
LIST_TXT = "data/lam_shams.txt"             # refs to tag: first tab-separated field of each line
OUT_JSON = "data/Quran_v2_lam_shams.json"   # tagged copy written at the end of this cell

# — glyphs —
# SHADDA is not used by any live code in this cell.
LAM, SHADDA = "\u0644", "\u0651"
# Sun letters; the doubled ل collapses inside the set.
SUN = set("تثدذرزسشصضطظللن")

# Matches opening and closing <rule …> tags (case-insensitive).
TAG_RE = re.compile(r"</?rule[^>]*?>", flags=re.I)

def strip_tags(t: str) -> str:
    """Return ``t`` without <rule …> tags; a list of segments is joined first."""
    if isinstance(t, list):
        t = "".join(t)
    return TAG_RE.sub("", t)

def add_tag(text: str) -> tuple[str, bool]:
    """
    Wrap the first silent article-lam of ``text`` in
    ``<rule class=laam_shamsiyah>…</rule>``.

    A lam qualifies when the very next character is a sun letter (so the lam
    carries no mark and nothing — tag, tatweel or mark — sits in between).
    The shadda on the sun letter is not checked.  Existing ``<…>`` tags are
    stepped over.  An unterminated ``<`` ends the scan: a failed ``find()``
    returns -1, which would otherwise reset the index to 0 and loop forever.

    Returns:
        ``(new_text, True)`` when a lam was wrapped, ``(text, True)`` when
        ``text`` already contains ``class=laam_shamsiyah``, and
        ``(text, False)`` when no qualifying lam was found.
    """
    if "class=laam_shamsiyah" in text:
        return text, True

    n = len(text)
    i = 0
    while i < n:
        ch = text[i]
        if ch == "<":
            close = text.find(">", i)
            if close == -1:
                break                      # malformed tail: stop, never tag inside it
            i = close + 1
            continue

        if ch == LAM and i + 1 < n:
            nxt = text[i + 1]
            if not unicodedata.combining(nxt) and nxt in SUN:
                return f"{text[:i]}<rule class=laam_shamsiyah>{LAM}</rule>{text[i+1:]}", True
        i += 1
    return text, False

# — load data & list —
quran = json.loads(Path(IN_JSON).read_text(encoding="utf-8"))
# Keep only the reference (first tab-separated field) of each non-blank line.
refs = {
    ln.split("\t")[0]
    for ln in Path(LIST_TXT).read_text(encoding="utf-8").splitlines()
    if ln.strip()
}

# Tag a deep copy so the loaded records stay as read.
quran_new = deepcopy(quran)
num_tagged = 0
skipped = 0
for ref in sorted(refs):        # sorted → the "skipped" log is stable between runs
    text, tagged = add_tag(quran_new[ref]["text"])
    if tagged:
        quran_new[ref]["text"] = text
        num_tagged += 1
    else:
        skipped += 1
        print(f"skipped {ref}")

# — write —
# Create whatever directory OUT_JSON points at, not a hard-coded "data".
Path(OUT_JSON).parent.mkdir(parents=True, exist_ok=True)
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(quran_new, f, ensure_ascii=False, indent=2)

print(f"✓ Skipped {skipped} words")
print(f"✓ Tagged {num_tagged} words")
# len(refs) counts every ref that was looked at, tagged or not, so it is
# reported as "processed" rather than "tagged".
print(f"✓ Processed {len(refs)} lam-shamsiyyah refs → {OUT_JSON}")

In [None]:
# === Cell : final Lam-Shamsiyyah tagging (handles inner tags & tatwīl) ============

import json, re, unicodedata
from pathlib import Path
from copy import deepcopy

# Input records, list of refs to tag, and output path for the tagged copy.
IN_JSON   = "data/Quran_v2.json"
LIST_TXT  = "data/lam_shams.txt"          # 5 283 refs
OUT_JSON  = "data/Quran_v2_lam_shams.json"

LAM       = "\u0644"
SHADDA    = "\u0651"    # defined here but not used by the code in this cell
TATWEEL   = "\u0640"    # skipped, like combining marks, when looking for the sun letter
# Sun letters; the doubled ل collapses inside the set.
SUN       = set("تثدذرزسشصضطظللن")
# Matches opening and closing <rule …> tags (case-insensitive).
TAG_RE    = re.compile(r"</?rule[^>]*?>", flags=re.I)

def strip_tags(x: str) -> str:
    """Return ``x`` without <rule …> tags; a list of segments is joined first."""
    if isinstance(x, list):
        x = "".join(x)
    return TAG_RE.sub("", x)

def tag_lam(word: str) -> tuple[str, bool]:
    """
    Wrap the last silent article-lam of ``word`` in
    ``<rule class=laam_shamsiyah>…</rule>``.

    A lam qualifies when the character right after it is not a combining mark
    and the next base character is a sun letter.  Tags, tatweel and combining
    marks between the two are skipped.  The shadda on the sun letter is not
    checked.

    An unterminated ``<`` is treated as running to the end of the string: a
    failed ``find()`` returns -1, which would otherwise reset the scan index
    to 0 and cause an endless loop or a test against the wrong character.

    Returns:
        ``(new_word, True)`` when a lam was wrapped, ``(word, True)`` when
        ``word`` already contains ``class=laam_shamsiyah``, and
        ``(word, False)`` when no qualifying lam was found.
    """
    if "class=laam_shamsiyah" in word:
        return word, True

    n = len(word)

    def after_tag(pos: int) -> int:
        """Index just past the tag that opens at ``pos``; ``n`` if it never closes."""
        close = word.find(">", pos)
        return n if close == -1 else close + 1

    cand = None
    i = 0
    while i < n:
        ch = word[i]
        if ch == "<":                          # skip tag
            i = after_tag(i)
            continue

        if ch == LAM and (i + 1 == n or not unicodedata.combining(word[i + 1])):
            # Walk forward to the next base character.
            j = i + 1
            while j < n:
                cj = word[j]
                if cj == "<":
                    j = after_tag(j)
                elif cj == TATWEEL or unicodedata.combining(cj):
                    j += 1
                else:
                    break
            if j < n and word[j] in SUN:
                cand = i          # keep the _last_ matching lam
        i += 1

    if cand is None:
        return word, False
    return (
        f"{word[:cand]}<rule class=laam_shamsiyah>{LAM}</rule>{word[cand + 1:]}",
        True,
    )

# ── run tagging ─────────────────────────────────────────────────────────
quran = json.loads(Path(IN_JSON).read_text(encoding="utf-8"))
# The list holds Arabic text written as UTF-8, so decode it explicitly instead
# of relying on the platform's default encoding.
refs = {
    ln.split("\t")[0]
    for ln in Path(LIST_TXT).read_text(encoding="utf-8").splitlines()
    if ln.strip()
}

# Tag a deep copy so the loaded records stay as read.
quran_new, tagged, skipped = deepcopy(quran), 0, 0
for ref in refs:
    txt, ok = tag_lam(quran_new[ref]["text"])
    quran_new[ref]["text"] = txt
    # `ok` is a bool, so it adds 1 to exactly one of the two counters.
    tagged, skipped = tagged + ok, skipped + (not ok)

# ── save & report ───────────────────────────────────────────────────────
# Create whatever directory OUT_JSON points at, not a hard-coded "data".
Path(OUT_JSON).parent.mkdir(parents=True, exist_ok=True)
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(quran_new, f, ensure_ascii=False, indent=2)

print(f"Tagged : {tagged}")
print(f"Skipped: {skipped}  (should be 0)")
print(f"→ {OUT_JSON}")


In [None]:
from core.phonemizer import Phonemizer

# Phonemize one reference with stops=["verse"], then call show_table() on the result.
# NOTE(review): "7:145" looks like a chapter:verse reference — confirm the format
# accepted by Phonemizer.phonemize.
ref = "7:145"
res = Phonemizer().phonemize(ref, stops=["verse"])
res.show_table()

In [1]:
from core.helpers import phonemize_and_save

# Reference range handed to phonemize_and_save.
s = "1-114"
phonemize_and_save(
    s,
    stops=[
        "verse",
        "preferred_stop",
        "compulsory_stop",
        # "optional_stop", "preferred_continue"
    ],
    output_dir="out/phonemized_refactor1_vqm",
)

Phonemized output saved to: out/phonemized_refactor1_vqm/1-114.txt


In [2]:
from core.helpers import compare_files

# Compare the v1 output with the refactored output, ignoring whitespace.
# Kept as the last expression so the returned value is shown by the cell.
compare_files(
    "out/phonemized_v1/1-114.txt",
    "out/phonemized_refactor1_vqm/1-114.txt",
    ignore_whitespace=True,
)

Files differ: 1-114.txt vs 1-114.txt
--- out/phonemized_v1/1-114.txt
+++ out/phonemized_refactor1_vqm/1-114.txt
@@ -1,17 +1,18 @@
+1:1
 بِسۡمِ                 ['b', 'i', 's', 'm', 'i']
 ٱللَّهِ                ['ll', 'a', 'h', 'i']
-ٱلرَّحۡمَـٰنِ             ['rr', 'a', 'ħ', 'm', 'a:', 'n', 'i']
+ٱلرَّحۡمَـٰنِ             ['rr', 'a', 'ħ', 'm', 'a', 'n', 'i']
 ٱلرَّحِيمِ              ['rr', 'a', 'ħ', 'i:', 'm']
 1:2
 ٱلۡحَمۡدُ               ['ʔ', 'a', 'l', 'ħ', 'a', 'm', 'd', 'u']
 لِلَّهِ                 ['l', 'i', 'll', 'a', 'h', 'i']
 رَبِّ                  ['r', 'a', 'bb', 'i']
-ٱلۡعَـٰلَمِينَ            ['l', 'ʕ', 'a:', 'l', 'a', 'm', 'i:', 'n']
+ٱلۡعَـٰلَمِينَ            ['l', 'ʕ', 'a', 'l', 'a', 'm', 'i:', 'n']
 1:3
-ٱلرَّحۡمَـٰنِ             ['ʔ', 'a', 'rr', 'a', 'ħ', 'm', 'a:', 'n', 'i']
+ٱلرَّحۡمَـٰنِ             ['ʔ', 'a', 'rr', 'a', 'ħ', 'm', 'a', 'n', 'i']
 ٱلرَّحِيمِ              ['rr', 'a', 'ħ', 'i:', 'm']
 1:4
-مَـٰلِكِ                ['m', 'a:', 'l', 'i', 'k', 'i']
+مَـٰ

False