In [8]:
# step1_ingest_preprocess.py
# Python 3.9+
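# Install dependencies (if needed):
# pip install langchain[all]
# Note: the cells below only use the standard-library `re` module, so the third-party `regex` package is not required.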

# Install langchain if not already installed
# %pip install -U langchain


import json
import re
from pathlib import Path
from typing import List, Dict
from langchain.schema import Document


In [9]:
# ---------- Arabic normalization utilities ----------
def normalize_arabic(text: str, normalize_taa_marbuta: bool = False) -> str:
    """
    Light Arabic normalization for retrieval.

    Steps applied, in order:
      1. Unify hamza/madda alef variants to bare alef.
      2. Map alef maqsura to yah, and the hamza carriers waw-hamza / yah-hamza
         to plain waw / yah.
      3. Optionally map taa marbuta to haa (off by default).
      4. Remove tatweel (kashida).
      5. Strip diacritics and annotation marks.
      6. Collapse every whitespace run (including newlines) to one space and trim.

    Punctuation and digits are kept (they can be useful in legal references).

    Args:
        text: Text to normalize. Any falsy value (empty string, None) yields "".
        normalize_taa_marbuta: When True, also replace taa marbuta with haa.
            This is the more aggressive normalization; the default (False)
            leaves taa marbuta untouched.

    Returns:
        The normalized, single-line string.
    """
    if not text:
        return ""

    # Alef with hamza above/below and alef with madda all become bare alef.
    text = re.sub(r"[إأآا]", "ا", text)
    # Alef maqsura -> yah.
    text = re.sub(r"ى", "ي", text)
    # Hamza carriers lose their hamza.
    text = re.sub(r"ؤ", "و", text)
    text = re.sub(r"ئ", "ي", text)
    # Aggressive option: fold taa marbuta into haa only when explicitly requested.
    if normalize_taa_marbuta:
        text = re.sub(r"ة", "ه", text)
    # Remove tatweel
    text = text.replace("ـ", "")
    # Remove diacritics (harakat) and annotation marks.
    # Code point ranges stripped: 0610-061A, 064B-065F, 0670, 06D6-06ED
    text = re.sub(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]", "", text)
    # Normalize multiple spaces and newlines
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [10]:
# ---------- Flattening loader ----------
def _safe_id(s: str) -> str:
    """Build one source-id segment: swap "/" for " - ", trim, and collapse whitespace runs."""
    s = s.replace("/", " - ").strip()
    return re.sub(r"\s+", " ", s)


def flatten_laws(data: Dict) -> List[Dict]:
    """
    Flatten a nested category -> subcategory -> law -> [articles] mapping.

    Expected input shape:
      {
        "<category>": {
           "<subcategory>": {
               "<law name>": [
                   {"Article_Title": "...", "Article_Text": "..."},
                   ...
               ]
           }
        }
      }

    Each article dict is read through the keys "Article_Title" / "Article_Text",
    falling back to "title" / "text". A missing or empty title is replaced by a
    generated one based on the article's 1-based position; missing text becomes "".

    Any level that does not have the expected type (dict, dict, list, and a dict
    for each article) is skipped instead of raising.

    Args:
        data: The parsed JSON mapping described above.

    Returns:
        One dict per article with the keys: category, subcategory, law_name,
        article_title, article_text_raw, article_text (normalized via
        normalize_arabic), source_id ("category/subcategory/law/title" with each
        segment cleaned by _safe_id) and article_index (0-based position in the
        law's article list, counting skipped items).
    """
    entries = []
    for category, subcats in data.items():
        # If subcats is not a dict, skip (defensive)
        if not isinstance(subcats, dict):
            continue
        for subcat, laws in subcats.items():
            if not isinstance(laws, dict):
                continue
            for law_name, articles in laws.items():
                if not isinstance(articles, list):
                    continue

                # The first three id segments are identical for every article of
                # this law, so they are computed once instead of per article.
                id_prefix = f"{_safe_id(category)}/{_safe_id(subcat)}/{_safe_id(law_name)}"

                for idx, article in enumerate(articles):
                    # Same defensive policy as the outer levels: a stray
                    # non-dict item must not abort the whole ingest.
                    if not isinstance(article, dict):
                        continue

                    title = article.get("Article_Title") or article.get("title") or f"مادة_{idx+1}"
                    text_raw = article.get("Article_Text") or article.get("text") or ""

                    entries.append({
                        "category": category,
                        "subcategory": subcat,
                        "law_name": law_name,
                        "article_title": title,
                        "article_text_raw": text_raw,
                        "article_text": normalize_arabic(text_raw),
                        "source_id": f"{id_prefix}/{_safe_id(title)}",
                        "article_index": idx,
                    })
    return entries


In [11]:
# ---------- Convert to LangChain Documents ----------
def entries_to_documents(entries: List[Dict]) -> List[Document]:
    """
    Wrap flattened article entries in LangChain Document objects.

    For each entry, the normalized text ("article_text") becomes page_content.
    The metadata carries category, subcategory, law_name, article_title,
    source_id and article_index, plus the untouched original text under
    "raw_text" so it can be shown with its original formatting later.

    Args:
        entries: Article dicts containing the keys listed above together with
            "article_text" and "article_text_raw".

    Returns:
        One Document per entry, in input order.
    """
    # Fields copied verbatim from the entry into the metadata, in this order.
    # Add effective_date, version, jurisdiction, etc. here if they become available.
    metadata_fields = (
        "category",
        "subcategory",
        "law_name",
        "article_title",
        "source_id",
        "article_index",
    )

    def to_document(entry: Dict) -> Document:
        meta = {field: entry[field] for field in metadata_fields}
        content = entry["article_text"]
        meta["raw_text"] = entry["article_text_raw"]
        return Document(page_content=content, metadata=meta)

    return [to_document(entry) for entry in entries]


In [14]:
# ---------- Main runner ----------
def main(input_json_path: str = "laws.json", output_docs_path: str = "docs.json"):
    """
    Run the ingest step: read the laws JSON, flatten it, and dump the documents.

    The output file is a plain JSON list of {"page_content", "metadata"} objects
    meant for inspection; it is not a serialized LangChain Document.

    Args:
        input_json_path: Path of the nested laws JSON file (read as UTF-8).
        output_docs_path: Path the JSON dump is written to (UTF-8, non-ASCII
            characters kept as-is, 2-space indent).

    Raises:
        FileNotFoundError: If input_json_path does not exist.
    """
    source = Path(input_json_path)
    if not source.exists():
        raise FileNotFoundError(f"Input file not found: {source.resolve()}")

    raw_json = source.read_text(encoding="utf-8")
    articles = flatten_laws(json.loads(raw_json))
    print(f"Flattened into {len(articles)} articles.")

    # Reduce each Document to plain dicts so the result is JSON-serializable.
    records = [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in entries_to_documents(articles)
    ]

    payload = json.dumps(records, ensure_ascii=False, indent=2)
    Path(output_docs_path).write_text(payload, encoding="utf-8")
    print(f"Saved {len(records)} docs to {output_docs_path}")

if __name__ == "__main__":
    # Both paths are resolved against the current working directory;
    # edit them to match where the input lives and where the dump should go.
    main(input_json_path="../laws.json", output_docs_path="laws_docs.json")

Flattened into 16526 articles.
Saved 16526 docs to laws_docs.json
