## Primary_Source_Import_Helper.ipynb

This notebook is designed to help import all source files and populate the source-file folder, so that FACTR_02_KB_Ingest_lang.ipynb can ingest the content into the FAISS KB archive.

## 0) Mount Drive (if not already)

In [1]:
# Colab helper module that exposes Google Drive to this runtime.
from google.colab import drive
# Mount at /content/drive; the conversion cell further down writes its output
# under /content/drive/MyDrive, so this must run first.
drive.mount('/content/drive')


Mounted at /content/drive


## 1) Clone the repo locally (fast)

(You can also download the zip and unzip; cloning is simpler.)

In [None]:
!rm -rf /content/quranjson
!git clone -q https://github.com/semarketir/quranjson.git /content/quranjson
!ls -R /content/quranjson/source | head -n 50


/content/quranjson/source:
audio
juz.json
surah
surah.json
tajweed
translation

/content/quranjson/source/audio:
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041


You should see folders like:



In [None]:
# source/
#   surah/                # Arabic per-sūrah JSON
#   translation/
#     en/                 # English per-sūrah JSON
#   surah.json            # index/metadata per sūrah

## 2) Convert AR + EN to your JSONL schema
This script:

reads Arabic from source/surah/surah_<1..114>.json

reads English from source/translation/en/en_translation_<1..114>.json

uses source/surah.json to grab friendly sūrah names (book)

writes two JSONL files into your Drive under /FATCR/data/raw/kb/Islam/Quran/

In [None]:
# --- CONFIG ---
# Project root on the mounted Google Drive; every output path is derived from it.
ROOT = "/content/drive/MyDrive/FATCR"
BASE = "/content/quranjson/source"   # semarketir/quranjson cloned here
# Destination folder for the generated JSONL files.
OUT_DIR = f"{ROOT}/data/raw/kb/Islam/Quran"
AR_OUT = f"{OUT_DIR}/quran_ar_semarketir.jsonl"  # Arabic output, one JSON object per line
EN_OUT = f"{OUT_DIR}/quran_en_semarketir.jsonl"  # English output, one JSON object per line

import os, json, re
# Create the output folder (and missing parents); no error if it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

def _clean(s):
    return re.sub(r"\s+", " ", str(s)).strip()

# In this repo, surah index is a LIST with English name in "title"
# Read through a context manager so the file handle is closed right after parsing.
with open(f"{BASE}/surah.json", encoding="utf-8") as _index_file:
    SURAH_INDEX = json.load(_index_file)

def surah_name_en(n: int) -> str:
    """Return a display name for surah number ``n`` (1-based).

    The name is looked up by position in ``SURAH_INDEX`` (entry ``n - 1``),
    preferring the ``"title"`` field, then ``"english_name"``, then ``"name"``.
    Falls back to ``"Surah <n>"`` when the index is not a list, ``n`` is out of
    range, the entry is not a dict, or none of those fields holds a value.
    """
    fallback = f"Surah {n}"
    i = n - 1
    # Guard the container type too: positional lookup only makes sense on a list.
    if isinstance(SURAH_INDEX, list) and 0 <= i < len(SURAH_INDEX):
        r = SURAH_INDEX[i]
        if isinstance(r, dict):
            return r.get("title") or r.get("english_name") or r.get("name") or fallback
    return fallback

def extract_verses(obj):
    """
    Return list[(verse_number:int, text:str)] from two common shapes:
      A) {"verse": {"verse_1": "...", "verse_2": "...", ...}}
      B) {"verse": [{"number": 1, "verse": "..."} ...]}  (or top-level list)

    Text is normalised with ``_clean`` and entries whose text is empty are
    dropped. ``None`` and unrecognised shapes yield an empty list.

    Shape A: the verse number is the trailing digits of each key; keys without
    trailing digits are sorted last and numbered by their position.

    Shape B: each dict item takes its number from the first of ``"number"``,
    ``"index"``, ``"aya"`` that converts to ``int`` (so a genuine ``0`` is
    kept), falling back to the item's 1-based position. Non-dict items are
    treated as plain text and numbered by position.
    """
    if obj is None:
        return []

    # A) dict of "verse_x"
    if isinstance(obj, dict) and isinstance(obj.get("verse"), dict):
        items = []
        for k, v in obj["verse"].items():
            # grab the trailing number from keys like "verse_7"
            m = re.search(r"(\d+)$", str(k))
            num = int(m.group(1)) if m else None
            txt = _clean(v)
            if txt:
                items.append((num, txt))
        # sort by verse number; unnumbered entries go last (stable sort keeps their order)
        items.sort(key=lambda x: (999999 if x[0] is None else x[0]))
        # fill missing numbers by position
        return [(i + 1 if n is None else n, t) for i, (n, t) in enumerate(items)]

    # B) array-like (list of dicts or strings)
    def pick_array(o):
        """Return the first non-empty list found under a known key, or ``o`` itself if it is a list."""
        if isinstance(o, dict):
            for k in ["verses", "ayahs", "verse", "data", "list", "result", "aya"]:
                v = o.get(k)
                if isinstance(v, list) and v:
                    return v
        if isinstance(o, list) and o:
            return o
        return []

    def verse_number(item, position):
        """Return the first usable integer verse number in ``item``, else ``position``."""
        for key in ("number", "index", "aya"):
            raw = item.get(key)
            if raw is None:
                continue
            try:
                return int(raw)
            except (TypeError, ValueError):
                # Not numeric (e.g. "" or a nested list): try the next candidate key.
                continue
        return position

    out = []
    for i, it in enumerate(pick_array(obj), start=1):
        if isinstance(it, dict):
            vnum = verse_number(it, i)
            txt = it.get("verse") or it.get("text") or it.get("translation") or it.get("content") or ""
        else:
            # Plain string (or other scalar) entry: number it by position.
            vnum, txt = i, it
        txt = _clean(txt)
        if txt:
            out.append((vnum, txt))
    return out

# Re-convert AR + EN -> JSONL
def _load_json(path):
    """Parse a UTF-8 JSON file, closing the handle as soon as it is read."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)

def _quran_record(source, lang, book, chapter, verse, text):
    """Build one output record; key order is kept stable so the JSONL layout does not change."""
    return {
        "tradition":"Islam","genre":"scripture","source":source,"collection":"Quran",
        "book":book,"chapter":chapter,"verse":int(verse),
        "number":None,"grade":None,"lang":lang,"ref":f"Qur'an {chapter}:{int(verse)}",
        "text":text
    }

ar_written = en_written = 0

# "w" mode truncates any previous export; both outputs stay open for the whole run
# instead of being reopened in append mode once per surah.
with open(AR_OUT, "w", encoding="utf-8") as far, open(EN_OUT, "w", encoding="utf-8") as fen:
    for s in range(1, 115):
        ar_path = f"{BASE}/surah/surah_{s}.json"
        en_path = f"{BASE}/translation/en/en_translation_{s}.json"
        if not (os.path.exists(ar_path) and os.path.exists(en_path)):
            print(f"Skipping surah {s} (missing AR or EN file)")
            continue

        # Map verse number -> text for each language (a later duplicate number overwrites an earlier one).
        ar_map = {v: t for v, t in extract_verses(_load_json(ar_path))}
        en_map = {v: t for v, t in extract_verses(_load_json(en_path))}
        book = surah_name_en(s)

        # Arabic drives the iteration; surface any verse that exists in only one
        # language instead of dropping it without a trace.
        ar_only = len(set(ar_map) - set(en_map))
        en_only = len(set(en_map) - set(ar_map))
        if ar_only or en_only:
            print(f"Warning: surah {s} verse mismatch | AR without EN: {ar_only} | EN without AR: {en_only}")

        for v in sorted(ar_map.keys()):
            far.write(json.dumps(
                _quran_record("Quran (AR)", "ar", book, s, v, ar_map[v]),
                ensure_ascii=False) + "\n")
            ar_written += 1

            if v in en_map:
                fen.write(json.dumps(
                    _quran_record("Quran (EN: semarketir/quranjson)", "en", book, s, v, en_map[v]),
                    ensure_ascii=False) + "\n")
                en_written += 1

print("Wrote:", AR_OUT, "| lines:", ar_written)
print("Wrote:", EN_OUT, "| lines:", en_written)

# Quick sanity check - report each output file's presence and size, then show
# the start of its first four lines (each preview capped at 180 characters).
for out_path in (AR_OUT, EN_OUT):
    print("\n==", out_path, "==")
    found = os.path.exists(out_path)
    n_bytes = os.path.getsize(out_path)
    print("exists:", found, "| size:", n_bytes, "bytes")
    with open(out_path, encoding="utf-8") as handle:
        for _ in range(4):
            row = handle.readline()
            if not row:
                # readline() returns "" only at end of file
                break
            print(row.strip()[:180])



Wrote: /content/drive/MyDrive/FATCR/data/raw/kb/Islam/Quran/quran_ar_semarketir.jsonl | lines: 6348
Wrote: /content/drive/MyDrive/FATCR/data/raw/kb/Islam/Quran/quran_en_semarketir.jsonl | lines: 6348

== /content/drive/MyDrive/FATCR/data/raw/kb/Islam/Quran/quran_ar_semarketir.jsonl ==
exists: True | size: 2746818 bytes
{"tradition": "Islam", "genre": "scripture", "source": "Quran (AR)", "collection": "Quran", "book": "Al-Fatiha", "chapter": 1, "verse": 1, "number": null, "grade": null, "lang": "a
{"tradition": "Islam", "genre": "scripture", "source": "Quran (AR)", "collection": "Quran", "book": "Al-Fatiha", "chapter": 1, "verse": 2, "number": null, "grade": null, "lang": "a
{"tradition": "Islam", "genre": "scripture", "source": "Quran (AR)", "collection": "Quran", "book": "Al-Fatiha", "chapter": 1, "verse": 3, "number": null, "grade": null, "lang": "a
{"tradition": "Islam", "genre": "scripture", "source": "Quran (AR)", "collection": "Quran", "book": "Al-Fatiha", "chapter": 1, "verse": 

## 3) Quick sanity check (peek a few lines)

## 4) Run your ingest notebook

Now open FACTR_02_KB_Ingest_lang.ipynb and run top → bottom.
It will harvest the two JSONL files automatically from: