<a href="https://colab.research.google.com/github/MariemBjbr/portfolio-mariem/blob/main/extractPDF_To_JSOn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install pymupdf pdfminer.six pytesseract pillow
!apt -q install -y tesseract-ocr >/dev/null






In [None]:
# Colab:
!pip -q install pymupdf==1.24.9


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# === AUTO LIB : Scan dossier -> PDF->JSONL->FIXED + index ===
!pip -q install pymupdf==1.24.9
import os, re, json, hashlib, unicodedata, time, fitz
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple

# ---------- Normalisation forte ----------
# Invisible code points (zero-width space/joiners, word joiner, BOM) that are deleted.
ZW = {"\u200B", "\u200C", "\u200D", "\u2060", "\uFEFF"}
# Typographic ligatures mapped to their plain-letter spelling.
LIG = {
    "ﬀ": "ff", "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬅ": "ft", "ﬆ": "st",
}
# Curly / angle quotes mapped to ASCII quotes.
QUO = {
    "“": '"', "”": '"', "„": '"', "«": '"', "»": '"',
    "‘": "'", "’": "'", "‚": "'",
}
# Dash and minus variants mapped to the ASCII hyphen.
DASH = {
    "‒": "-", "‐": "-", "–": "-", "—": "-", "−": "-",
    "\u2010": "-", "\u2011": "-", "\u2012": "-", "\u2212": "-",
}


def norm_strong(s:str)->str:
    """Return an aggressively normalised copy of *s* for text comparison.

    Steps, in order: NFKC normalisation, removal of the ``ZW`` characters,
    non-breaking space -> space, ligature / quote / dash substitution,
    ellipsis -> ``...``, removal of whitespace before closing punctuation and
    after opening brackets, collapse of space/tab runs, final strip.
    Newlines that are not adjacent to punctuation are kept.
    """
    text = unicodedata.normalize("NFKC", s)
    for invisible in ZW:
        text = text.replace(invisible, "")
    text = text.replace("\xa0", " ")
    for table in (LIG, QUO, DASH):
        for source_char, plain in table.items():
            text = text.replace(source_char, plain)
    text = text.replace("…", "...")
    # Spacing rules are applied in this exact order.
    spacing_rules = (
        (r"\s+([,.;:!?%)\]}])", r"\1"),   # no whitespace before closing punctuation
        (r"([([{])\s+", r"\1"),           # no whitespace after an opening bracket
        (r"\s+%", "%"),                   # percent sign sticks to the number
        (r"[ \t]+", " "),                 # collapse runs of spaces / tabs
    )
    for pattern, replacement in spacing_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def _sha(s:str)->str:
    """Return the hexadecimal SHA-256 digest of *s* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

# ---------- Extraction "lossless" ----------
@dataclass
class Word:
    """One extracted word: its text, bounding box and ordering numbers.

    The fields follow the order of the 8-tuples unpacked from
    ``page.get_text("words")`` in ``get_words_lossless``.
    """
    text: str
    # Bounding box corners.
    x0: float
    y0: float
    x1: float
    y1: float
    # Block / line / word numbers used as the primary sort keys.
    block: int
    line: int
    wno: int

def get_words_lossless(page):
    """Return the page's words as ``Word`` objects, de-duplicated and ordered.

    Words with empty text are skipped. Two entries are duplicates when their
    text, block/line/word numbers and coordinates rounded to 2 decimals all
    match; the first occurrence wins. The result is sorted by
    (block, line, word number, x0, y0).
    """
    unique = {}
    for x0, y0, x1, y1, txt, blk, ln, wn in page.get_text("words"):
        if not txt:
            continue
        fingerprint = (round(x0, 2), round(y0, 2), round(x1, 2), round(y1, 2), txt, blk, ln, wn)
        if fingerprint not in unique:
            # The stored Word keeps the unrounded coordinates.
            unique[fingerprint] = Word(txt, x0, y0, x1, y1, blk, ln, wn)
    return sorted(
        unique.values(),
        key=lambda word: (word.block, word.line, word.wno, word.x0, word.y0),
    )

# Tokens that attach to the previous word without a space.
PUNCT_STICKY = {",", ".", ";", ":", "!", "?", ")", "]", "}", "%"}
# Characters after which the next word attaches without a space.
PUNCT_OPEN = {"(", "[", "{", "«", "“", "‘"}


def join_words_into_text(words: List[Word]) -> str:
    """Concatenate word texts into one string and normalise it with ``norm_strong``.

    A single space is inserted before each word, except for the first word,
    for a word that is itself a ``PUNCT_STICKY`` token, and for a word that
    follows a word ending in a ``PUNCT_OPEN`` character.
    """
    pieces = []
    previous = None
    for word in words:
        token = word.text
        glued = (
            not pieces
            or token in PUNCT_STICKY
            or bool(previous and previous[-1] in PUNCT_OPEN)
        )
        pieces.append(token if glued else " " + token)
        previous = token
    return norm_strong("".join(pieces))

def page_to_json(page_idx, page):
    """Build the JSON-serialisable record for one page.

    The record holds the page index, the page size taken from ``page.rect``,
    every word as a dict, the joined/normalised text, and small statistics
    (word count, character count, SHA-256 of the text).
    """
    words = get_words_lossless(page)
    text = join_words_into_text(words)

    record = {"page_index": page_idx}
    record["size"] = {"width": page.rect.width, "height": page.rect.height}
    record["words"] = [asdict(word) for word in words]
    record["text"] = text
    record["stats"] = {
        "n_words": len(words),
        "n_chars": len(text),
        "sha256": _sha(text),
    }
    return record

# ---------- Conversion + manifest ----------
def pdf_to_jsonl(pdf_path:str, out_jsonl:str, manifest_path:str):
    """Write one JSON line per PDF page plus a manifest comparing two extractions.

    For every page, the word-based text (``page_to_json``) is compared with the
    normalised output of ``page.get_text("text")`` (newlines turned into
    spaces). The manifest records, per page, whether both texts have the same
    SHA-256 / length and the ratio ``common-prefix length / reference length``.

    Args:
        pdf_path: PDF file to read.
        out_jsonl: destination JSONL file (one page record per line).
        manifest_path: destination JSON manifest.

    The parent directories of both output files are created if missing, and
    the PDF document is closed even when a page fails to convert.
    """
    os.makedirs(os.path.dirname(out_jsonl) or ".", exist_ok=True)
    os.makedirs(os.path.dirname(manifest_path) or ".", exist_ok=True)

    report = []
    ours_tot = ref_tot = ok = 0
    doc = fitz.open(pdf_path)
    try:
        with open(out_jsonl, "w", encoding="utf-8") as f:
            for i, page in enumerate(doc):
                pj = page_to_json(i, page)
                ours = pj["text"]
                ref = norm_strong(page.get_text("text").replace("\r", "\n").replace("\n", " "))
                same = _sha(ours) == _sha(ref)
                if same:
                    ok += 1
                ours_tot += len(ours)
                ref_tot += len(ref)
                f.write(json.dumps(pj, ensure_ascii=False) + "\n")
                # Coverage = share of the reference reproduced before the first divergence.
                pref = len(os.path.commonprefix([ours, ref]))
                cov = round(pref / max(1, len(ref)), 6)
                report.append({"page_index": i, "same_len": len(ours) == len(ref), "same_sha": same,
                               "coverage_char_ratio": cov, "n_chars_ours": len(ours), "n_chars_ref": len(ref)})
    finally:
        # Release the document handle even if extraction or writing raised.
        doc.close()

    manifest = {"pdf_path": pdf_path, "n_pages": len(report), "pages_ok_same_sha": ok,
                "global_chars_ours": ours_tot, "global_chars_ref": ref_tot,
                "global_same_charcount": ours_tot == ref_tot, "pages_report": report}
    with open(manifest_path, "w", encoding="utf-8") as mf:
        json.dump(manifest, mf, ensure_ascii=False, indent=2)

# ---------- Fix pages douteuses (fallback contrôlé) ----------
def make_authoritative_jsonl(pdf_path:str, in_jsonl:str, out_jsonl:str, cov_threshold=0.98):
    """Copy a page JSONL, replacing doubtful page texts with the reference text.

    For each record, the stored text is compared (after ``norm_strong``) with
    the normalised ``get_text("text")`` output of the same PDF page. When the
    two differ and the common-prefix coverage is below *cov_threshold*, the
    record's ``text`` is replaced by the reference and ``debug.fallback_applied``
    / ``debug.coverage_before`` are set. All other records are copied as is.

    Args:
        pdf_path: PDF the records were extracted from.
        in_jsonl: JSONL produced by ``pdf_to_jsonl``.
        out_jsonl: destination JSONL.
        cov_threshold: minimum coverage ratio below which the fallback applies.

    Blank lines in *in_jsonl* are ignored, and the PDF document is closed even
    if a record fails to process.
    """
    doc = fitz.open(pdf_path)
    try:
        with open(in_jsonl, encoding="utf-8") as fi, open(out_jsonl, "w", encoding="utf-8") as fo:
            for line in fi:
                if not line.strip():
                    continue  # tolerate blank lines instead of failing in json.loads
                o = json.loads(line)
                p = o["page_index"]
                ours = norm_strong(o["text"])
                ref = norm_strong(doc[p].get_text("text").replace("\r", "\n").replace("\n", " "))
                if _sha(ours) != _sha(ref):
                    pref = len(os.path.commonprefix([ours, ref]))
                    cov = pref / max(1, len(ref))
                    if cov < cov_threshold:
                        o["text"] = ref
                        debug = o.setdefault("debug", {})
                        debug["fallback_applied"] = True
                        debug["coverage_before"] = round(cov, 6)
                fo.write(json.dumps(o, ensure_ascii=False) + "\n")
    finally:
        doc.close()

# ---------- Utils scan ----------
def list_pdfs(input_dir:str)->List[str]:
    """Return the sorted paths of all ``.pdf`` files (any case) under *input_dir*, recursively."""
    found = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(input_dir)
        for name in names
        if name.lower().endswith(".pdf")
    ]
    found.sort()
    return found

def relpath_without_ext(path:str, base_dir:str)->str:
    """Return *path* relative to *base_dir* with its last extension removed (e.g. ``sub/a/b/c``)."""
    stem, _extension = os.path.splitext(os.path.relpath(path, base_dir))
    return stem

# ---------- Orchestrateur full-auto ----------
def process_pdf(pdf_path:str, input_dir:str, output_dir:str, cov_threshold=0.98)->Dict:
    """Run the full conversion for one PDF and return a summary dict.

    Outputs are written under *output_dir*, mirroring the PDF's sub-folder
    relative to *input_dir*: ``<base>.jsonl``, ``<base>_manifest.json`` and
    ``<base>_fixed.jsonl``.

    Args:
        pdf_path: PDF to process.
        input_dir: root folder used to compute the relative output location.
        output_dir: root folder for the outputs.
        cov_threshold: forwarded to ``make_authoritative_jsonl``.

    Returns:
        A dict with ``pdf``, ``status`` and ``outputs``; plus ``manifest`` when
        the PDF was skipped, or ``pages_total`` / ``pages_fixed`` when processed.
    """
    rel = relpath_without_ext(pdf_path, input_dir)
    out_dir = os.path.join(output_dir, os.path.dirname(rel))
    base = os.path.basename(rel)
    os.makedirs(out_dir, exist_ok=True)

    out_jsonl = os.path.join(out_dir, f"{base}.jsonl")
    out_manifest = os.path.join(out_dir, f"{base}_manifest.json")
    out_fixed = os.path.join(out_dir, f"{base}_fixed.jsonl")

    # Skip when already up to date (fixed file at least as recent as the PDF).
    if os.path.exists(out_fixed) and os.path.getmtime(out_fixed) >= os.path.getmtime(pdf_path):
        # Reuse the stored manifest when available, otherwise return an empty one.
        manifest = {}
        if os.path.exists(out_manifest):
            with open(out_manifest, encoding="utf-8") as mf:
                manifest = json.load(mf)
        return {"pdf": pdf_path, "status": "skipped(up-to-date)", "outputs": {
            "jsonl": out_jsonl, "fixed_jsonl": out_fixed, "manifest": out_manifest
        }, "manifest": manifest}

    # Step 1: pdf -> jsonl + manifest.
    pdf_to_jsonl(pdf_path, out_jsonl, out_manifest)

    # Step 2: authoritative "fixed" file. It is produced under a temporary name
    # and moved into place only on success: a partially written *_fixed.jsonl
    # would be newer than the PDF and would be skipped as "up-to-date" forever.
    tmp_fixed = out_fixed + ".tmp"
    try:
        make_authoritative_jsonl(pdf_path, out_jsonl, tmp_fixed, cov_threshold=cov_threshold)
        os.replace(tmp_fixed, out_fixed)
    finally:
        # After a successful replace the temp file no longer exists.
        if os.path.exists(tmp_fixed):
            os.remove(tmp_fixed)

    # Summarise which pages received the fallback text.
    fixed_pages = []
    total_pages = 0
    with open(out_fixed, encoding="utf-8") as f:
        for line in f:
            o = json.loads(line)
            total_pages += 1
            if o.get("debug", {}).get("fallback_applied"):
                fixed_pages.append(o["page_index"])

    return {"pdf": pdf_path, "status": "processed", "pages_total": total_pages,
            "pages_fixed": fixed_pages, "outputs": {
                "jsonl": out_jsonl, "fixed_jsonl": out_fixed, "manifest": out_manifest
            }}

def process_all(input_dir:str="/content/in", output_dir:str="/content/out", cov_threshold=0.98)->Dict:
    """Process every PDF found under *input_dir* and write ``index.json`` in *output_dir*.

    A failure on one PDF is recorded as an ``error: ...`` status and does not
    stop the batch. Returns the summary dict that is also dumped to the index.
    """
    os.makedirs(output_dir, exist_ok=True)
    pdfs = list_pdfs(input_dir)
    total = len(pdfs)
    results = []
    summary = {"input_dir": input_dir, "output_dir": output_dir, "count": total, "files": results}

    for position, pdf in enumerate(pdfs, start=1):
        print(f"[{position}/{total}] {pdf}")
        try:
            outcome = process_pdf(pdf, input_dir, output_dir, cov_threshold)
        except Exception as e:
            # Best effort: keep going and report the failure in the index.
            outcome = {"pdf": pdf, "status": f"error: {e}"}
        results.append(outcome)

    # index.json traces every processed file.
    index_path = os.path.join(output_dir, "index.json")
    with open(index_path, "w", encoding="utf-8") as index_file:
        json.dump(summary, index_file, ensure_ascii=False, indent=2)
    print("OK index:", index_path)
    return summary


In [None]:

# Input and output share the same folder here, so one makedirs call is enough.
os.makedirs("/content/", exist_ok=True)

summary = process_all(input_dir="/content/", output_dir="/content/", cov_threshold=0.98)

# Show the status and fixed-file path of the first 10 PDFs.
print("Trouvés:", summary["count"], "PDFs")
for f in summary["files"][:10]:
    print(f["status"], "->", f.get("outputs",{}).get("fixed_jsonl"))


[1/4] /content/0018_LeveragingSecurity_DK_20160111.pdf
[2/4] /content/0019_AutomatingProt_DK_20160111.pdf
[3/4] /content/511399-UEN_CSDG_670_2p2.pdf
[4/4] /content/SIP5_Security_V10.00_Manual_C081-G_en.pdf
OK index: /content/index.json
Trouvés: 4 PDFs
processed -> /content/0018_LeveragingSecurity_DK_20160111_fixed.jsonl
processed -> /content/0019_AutomatingProt_DK_20160111_fixed.jsonl
processed -> /content/511399-UEN_CSDG_670_2p2_fixed.jsonl
processed -> /content/SIP5_Security_V10.00_Manual_C081-G_en_fixed.jsonl


**Étape 1 — Chunking sûr (depuis _fixed.jsonl)**

saute les pages vides (images)

coupe par paragraphes

cible ~1000 tokens (approx), overlap 120

garde la traçabilité des pages

In [72]:
import json, re, os

def rough_token_count(s:str)->int:
    """Return a rough token estimate: 0.75 x the number of whitespace-separated words, truncated.

    NOTE(review): the usual rule of thumb is the reverse (about 0.75 words per
    token); the factor is kept as is because chunk sizes depend on it - confirm.
    """
    return int(0.75 * len(re.findall(r"\S+", s)))


def paragraph_iter(text:str):
    """Return the non-empty, stripped paragraphs of *text* (split on blank lines)."""
    paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
    if paras:
        return paras
    whole = text.strip()
    return [whole] if whole else []


def _chunk_record(units):
    """Build the output dict for a list of (page_index, paragraph, tokens) units."""
    return {"text": "\n\n".join(para for _, para, _ in units).strip(),
            "approx_tokens": sum(tok for _, _, tok in units),
            "pages": sorted({page for page, _, _ in units})}


def build_chunks_from_fixed_jsonl(fixed_jsonl:str, target_tokens=1000, overlap_tokens=120, min_tokens=200):
    """Group the paragraphs of a ``*_fixed.jsonl`` file into chunks of about *target_tokens*.

    Args:
        fixed_jsonl: JSONL file with one ``{"page_index", "text", ...}`` record per line.
        target_tokens: a chunk is closed when adding the next paragraph would exceed this.
        overlap_tokens: trailing paragraphs of a closed chunk (up to this many
            tokens) are repeated at the start of the next chunk.
        min_tokens: chunks smaller than this are merged into a neighbouring
            chunk, so a merged chunk may exceed *target_tokens* by up to this much.

    Returns:
        A list of ``{"text", "approx_tokens", "pages"}`` dicts. ``pages`` lists
        every page that contributed a paragraph, including overlap paragraphs.

    Pages whose text is blank are skipped. No paragraph is ever dropped:
    undersized chunks are merged rather than filtered out.
    """
    # 1) Flatten the file into (page_index, paragraph, tokens) units.
    units = []
    with open(fixed_jsonl, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            o = json.loads(line)
            text = o["text"]
            if not text.strip():   # skip empty pages (e.g. image-only pages)
                continue
            for para in paragraph_iter(text):
                units.append((o["page_index"], para, rough_token_count(para)))

    # 2) Greedy packing. Each raw chunk remembers how many of its leading
    #    units were carried over from the previous chunk as overlap.
    raw = []
    buf, buf_tok, carried = [], 0, 0
    for unit in units:
        tokens = unit[2]
        if buf_tok and buf_tok + tokens > target_tokens:
            raw.append((buf, carried))
            tail, tail_tok = [], 0
            for previous in reversed(buf):
                if tail_tok + previous[2] > overlap_tokens:
                    break
                tail.insert(0, previous)
                tail_tok += previous[2]
            buf, buf_tok, carried = tail, tail_tok, len(tail)
        buf.append(unit)
        buf_tok += tokens
    if buf:
        raw.append((buf, carried))

    # 3) Merge undersized chunks into their neighbour instead of discarding
    #    them. Only the non-overlap units are appended: the overlap units are
    #    already the tail of the neighbour, so nothing is duplicated.
    merged = []
    for chunk_units, n_carried in raw:
        chunk_tok = sum(u[2] for u in chunk_units)
        if merged and (chunk_tok < min_tokens or sum(u[2] for u in merged[-1]) < min_tokens):
            merged[-1].extend(chunk_units[n_carried:])
        else:
            merged.append(list(chunk_units))
    return [_chunk_record(chunk_units) for chunk_units in merged]

# === Run (adapt `base` to the document) ===
base = "/content/0018_LeveragingSecurity_DK_20160111"
fixed_jsonl = f"{base}_fixed.jsonl"
out_chunks = f"{base}_chunks.jsonl"

chunks = build_chunks_from_fixed_jsonl(fixed_jsonl, target_tokens=1000, overlap_tokens=120, min_tokens=200)
with open(out_chunks, "w", encoding="utf-8") as f:
    for c in chunks:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")

print("OK chunks:", len(chunks), "->", out_chunks)
# A document with no extractable text yields no chunk: do not index chunks[0] blindly.
if chunks:
    print("Exemple pages du 1er chunk:", chunks[0]["pages"][:10], "tokens~", chunks[0]["approx_tokens"])


OK chunks: 4 -> /content/0018_LeveragingSecurity_DK_20160111_chunks.jsonl
Exemple pages du 1er chunk: [0, 1, 2, 3] tokens~ 949


**Vérif “rien perdu” (par page + global)**

In [73]:
import json, fitz, re, unicodedata, hashlib

def _sha(s):
    """Return the hexadecimal SHA-256 digest of the UTF-8 encoding of *s*."""
    encoded = s.encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()

# Same tables and normaliser as in the extraction cell, redefined so that this
# verification cell can run on its own.
# Invisible code points that are deleted.
ZW = {"\u200B", "\u200C", "\u200D", "\u2060", "\uFEFF"}
# Ligatures -> plain letters.
LIG = {
    "ﬀ": "ff", "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬅ": "ft", "ﬆ": "st",
}
# Curly / angle quotes -> ASCII quotes.
QUO = {
    "“": '"', "”": '"', "„": '"', "«": '"', "»": '"',
    "‘": "'", "’": "'", "‚": "'",
}
# Dash and minus variants -> ASCII hyphen.
DASH = {
    "‒": "-", "‐": "-", "–": "-", "—": "-", "−": "-",
    "\u2010": "-", "\u2011": "-", "\u2012": "-", "\u2212": "-",
}


def norm_strong(s:str)->str:
    """Return a strongly normalised copy of *s* (NFKC, character folding, spacing clean-up)."""
    out = unicodedata.normalize("NFKC", s)
    for hidden in ZW:
        out = out.replace(hidden, "")
    out = out.replace("\xa0", " ")
    for mapping in (LIG, QUO, DASH):
        for old, new in mapping.items():
            out = out.replace(old, new)
    out = out.replace("…", "...")
    # Whitespace rules, applied in this order.
    out = re.sub(r"\s+([,.;:!?%)\]}])", r"\1", out)   # before closing punctuation
    out = re.sub(r"([([{])\s+", r"\1", out)           # after opening brackets
    out = re.sub(r"\s+%", "%", out)                   # before a percent sign
    out = re.sub(r"[ \t]+", " ", out)                 # runs of spaces / tabs
    return out.strip()

def verify_fixed_against_pdf(pdf_path:str, fixed_jsonl:str):
    """Compare each record of a ``*_fixed.jsonl`` file with the PDF's own text.

    Both sides are normalised with ``norm_strong``; the PDF side is the output
    of ``get_text("text")`` with newlines turned into spaces.

    Returns:
        ``{"pages_ok", "pages_total", "global_ok", "bad_pages"}`` where
        ``bad_pages`` lists, for each mismatching page, the common-prefix
        coverage ratio and both character counts, and ``global_ok`` tells
        whether the concatenation of all page texts matches.
    """
    # This cell does not import os itself; import it here so the function
    # does not depend on an earlier cell having been run.
    import os

    ok, tot = 0, 0
    diffs = []
    ours_sum = []
    ref_sum = []
    doc = fitz.open(pdf_path)
    try:
        with open(fixed_jsonl, encoding="utf-8") as f:
            for line in f:
                o = json.loads(line)
                p = o["page_index"]
                tot += 1
                ours = norm_strong(o["text"])
                ref = norm_strong(doc[p].get_text("text").replace("\r", "\n").replace("\n", " "))
                if _sha(ours) == _sha(ref):
                    ok += 1
                else:
                    pref = len(os.path.commonprefix([ours, ref]))
                    cov = round(pref / max(1, len(ref)), 6)
                    diffs.append({"page_index": p, "coverage_char_ratio": cov,
                                  "n_chars_ours": len(ours), "n_chars_ref": len(ref)})
                ours_sum.append(ours)
                ref_sum.append(ref)
    finally:
        # Close the document even when a record is malformed.
        doc.close()
    global_ok = _sha("".join(ours_sum)) == _sha("".join(ref_sum))
    return {"pages_ok": ok, "pages_total": tot, "global_ok": global_ok, "bad_pages": diffs}

# EXAMPLE: check one document's fixed JSONL against its source PDF and print the result dict.
pdf = "/content/0018_LeveragingSecurity_DK_20160111.pdf"
fixed = "/content/0018_LeveragingSecurity_DK_20160111_fixed.jsonl"
res = verify_fixed_against_pdf(pdf, fixed)
print(res)


{'pages_ok': 14, 'pages_total': 14, 'global_ok': True, 'bad_pages': []}


**Audit couverture des chunks (est-ce que les chunks couvrent tout le texte ?)**

In [74]:
import json, re

def _paragraphs(text, min_len=80):
    """Return the stripped paragraphs of *text* that are at least *min_len* characters long.

    Paragraphs are the blocks separated by one or more blank lines.
    """
    kept = []
    for raw in re.split(r"\n{2,}", text):
        para = raw.strip()
        if para and len(para) >= min_len:
            kept.append(para)
    return kept

def _union_len(spans):
    """Return the total length covered by the union of ``(start, end)`` spans.

    Overlapping or touching spans are counted once.
    """
    if not spans:
        return 0
    ordered = sorted(spans)
    run_start, run_end = ordered[0]
    total = 0
    for start, end in ordered[1:]:
        if start > run_end:
            # Gap: close the current run and open a new one.
            total += run_end - run_start
            run_start, run_end = start, end
        elif end > run_end:
            run_end = end
    return total + (run_end - run_start)

def _load_pages(fixed_jsonl):
    """Read a page JSONL file and return ``{page_index: text}``."""
    with open(fixed_jsonl, encoding="utf-8") as handle:
        return {
            record["page_index"]: record["text"]
            for record in map(json.loads, handle)
        }

def _load_chunks(chunks_jsonl):
    """Read a chunks JSONL file and return its records as a list, in file order."""
    with open(chunks_jsonl, encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle]

def coverage_from_chunks_v2(fixed_jsonl, chunks_jsonl, para_min_len=80):
    """Report, page by page, how much of the page text is found in the chunks.

    For each page, the paragraphs of at least *para_min_len* characters are
    looked up in the concatenated text of every chunk that declares the page.
    Each paragraph found contributes the span of its first occurrence in the
    page text; coverage is the union of those spans over the page length.

    Returns:
        A list of per-page dicts sorted by ``page_index`` with ``page_len``,
        ``covered_len``, ``coverage_pct``, ``paras_count`` and ``paras_matched``.
    """
    page_texts = _load_pages(fixed_jsonl)
    chunk_records = _load_chunks(chunks_jsonl)

    # page -> texts of the chunks that declare this page
    texts_by_page = {page: [] for page in page_texts}
    for record in chunk_records:
        chunk_text = record["text"]
        for page in record.get("pages", []):
            if page in texts_by_page:
                texts_by_page[page].append(chunk_text)

    rows = []
    for page, body in page_texts.items():
        haystack = "".join("\n\n" + piece for piece in texts_by_page.get(page, []))
        paragraphs = _paragraphs(body, min_len=para_min_len)
        spans = []
        for para in paragraphs:
            if not para or para not in haystack:
                continue
            start = body.find(para)
            if start != -1:
                spans.append((start, start + len(para)))
        covered = _union_len(spans)
        percent = (covered / len(body)) * 100.0 if len(body) != 0 else 0.0
        rows.append({
            "page_index": page,
            "page_len": len(body),
            "covered_len": covered,
            "coverage_pct": round(percent, 3),
            "paras_count": len(paragraphs),
            "paras_matched": len(spans)
        })
    rows.sort(key=lambda row: row["page_index"])
    return rows


In [75]:
# Inputs of the coverage audit: the per-page fixed text and the chunks built from it.
fixed  = "/content/0018_LeveragingSecurity_DK_20160111_fixed.jsonl"
chunks = "/content/0018_LeveragingSecurity_DK_20160111_chunks.jsonl"

# One report row per page; print them in page order.
rep = coverage_from_chunks_v2(fixed, chunks, para_min_len=80)
for r in rep:
    print(r)


{'page_index': 0, 'page_len': 1837, 'covered_len': 1837, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 1, 'page_len': 3207, 'covered_len': 3207, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 2, 'page_len': 3050, 'covered_len': 3050, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 3, 'page_len': 1251, 'covered_len': 1251, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 4, 'page_len': 2503, 'covered_len': 2503, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 5, 'page_len': 1917, 'covered_len': 1917, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 6, 'page_len': 2551, 'covered_len': 2551, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 7, 'page_len': 1217, 'covered_len': 1217, 'coverage_pct': 100.0, 'paras_count': 1, 'paras_matched': 1}
{'page_index': 8, 'page_len': 4128, 'covered_len': 4128,

**Étape suivante (RAG texte only) — ingestion VectorDB (FAISS)**

In [76]:
!pip -q install faiss-cpu sentence-transformers

import json, numpy as np, faiss
from sentence_transformers import SentenceTransformer

# 1) multilingual, retrieval-oriented embedding model
model = SentenceTransformer("intfloat/multilingual-e5-base")

# 2) load the chunks
chunks_jsonl = "/content/0018_LeveragingSecurity_DK_20160111_chunks.jsonl"
texts, metas = [], []
with open(chunks_jsonl, encoding="utf-8") as f:
    for i, line in enumerate(f):
        o = json.loads(line)
        txt = o["text"].strip()
        if not txt: continue
        # IMPORTANT: every indexed text gets the "passage: " prefix
        texts.append("passage: " + txt)
        # "id" is the line number in the chunks file; texts and metas stay aligned
        # because both are appended together.
        metas.append({"id": i, "pages": o.get("pages", [])})

# 3) normalized embeddings + FAISS index (cosine via Inner Product)
emb = model.encode(texts, normalize_embeddings=True)  # (N, d)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb.astype("float32"))

# Persist the index and its metadata side by side.
faiss.write_index(index, "/content/security.e5.index")
with open("/content/security.e5.meta.json","w",encoding="utf-8") as f:
    json.dump({"metas": metas}, f, ensure_ascii=False, indent=2)

print("OK index E5:", len(texts))


OK index E5: 4


**Recherche sémantique (top-k)**

In [77]:
import numpy as np, faiss, json
from sentence_transformers import SentenceTransformer

# Reload the persisted index and its metadata, and the same embedding model
# that was used at ingestion time.
index = faiss.read_index("/content/security.e5.index")
with open("/content/security.e5.meta.json", encoding="utf-8") as f:
    METAS = json.load(f)["metas"]
model = SentenceTransformer("intfloat/multilingual-e5-base")

# Sentinel used by search_e5 to discard placeholder scores.
MINF = -3.4028235e+38  # min float32

def search_e5(query: str, k: int = 8, threshold: float = 0.0):
    """Return up to *k* index hits for *query* whose score is at least *threshold*.

    The query is embedded with the ``"query: "`` prefix and normalised, then
    searched in the module-level ``index``. Each hit is a dict with ``rank``
    (position in the raw result list, starting at 1), ``score`` and the
    metadata entry from ``METAS`` for that index row.

    Returns an empty list when the index is empty or *k* is not positive.
    """
    k = min(k, index.ntotal)  # hard upper bound
    if k <= 0:
        # Nothing to search for: avoid calling the index with k == 0.
        return []
    q = model.encode(["query: " + query], normalize_embeddings=True).astype("float32")
    D, I = index.search(q, k)

    out = []
    for rank, (idx, score) in enumerate(zip(I[0], D[0]), 1):
        if idx == -1 or score <= (MINF/2):  # keep real hits only
            continue
        # int(): idx comes out of a numpy array; use a plain int as list index.
        row = {"rank": rank, "score": float(score), **METAS[int(idx)]}
        if row["score"] >= threshold:
            out.append(row)
    return out


In [78]:
# Example query: keep hits scoring at least 0.5 and print them as raw dicts.
hits = search_e5("procédure d’installation sécurisée du composant X", k=8, threshold=0.5)
for h in hits:
    print(h)


{'rank': 1, 'score': 0.7873495817184448, 'id': 0, 'pages': [0, 1, 2, 3]}
{'rank': 2, 'score': 0.7849428057670593, 'id': 3, 'pages': [10, 11, 12, 13]}
{'rank': 3, 'score': 0.7848808765411377, 'id': 2, 'pages': [7, 8, 9]}
{'rank': 4, 'score': 0.7676029205322266, 'id': 1, 'pages': [4, 5, 6]}


In [79]:
def pretty_hits(hits, max_items=8):
    """Print one line per hit (rank, score to 3 decimals, pages, id), at most *max_items* lines."""
    shown = hits[:max_items]
    for hit in shown:
        rank, score, pages, chunk_id = hit['rank'], hit['score'], hit['pages'], hit['id']
        print(f"[{rank}] score={score:.3f}  pages={pages}  id={chunk_id}")

def filter_hits(hits, threshold=0.6, k=4):
    """Return the first *k* hits, in input order, whose ``score`` is at least *threshold*."""
    passing = []
    for hit in hits:
        if hit["score"] >= threshold:
            passing.append(hit)
    return passing[:k]

# Show all hits, then keep at most 3 of those scoring >= 0.6 and show them again.
pretty_hits(hits)
top_hits = filter_hits(hits, threshold=0.6, k=3)
print("\nKept:", len(top_hits))
pretty_hits(top_hits)


[1] score=0.787  pages=[0, 1, 2, 3]  id=0
[2] score=0.785  pages=[10, 11, 12, 13]  id=3
[3] score=0.785  pages=[7, 8, 9]  id=2
[4] score=0.768  pages=[4, 5, 6]  id=1

Kept: 3
[1] score=0.787  pages=[0, 1, 2, 3]  id=0
[2] score=0.785  pages=[10, 11, 12, 13]  id=3
[3] score=0.785  pages=[7, 8, 9]  id=2


In [80]:
import json, re

chunks_jsonl = "/content/0018_LeveragingSecurity_DK_20160111_chunks.jsonl"  # adapt if needed


def get_chunk_text(chunk_id: int, path=chunks_jsonl) -> str:
    """Return the ``text`` of the record on line *chunk_id* (0-based) of *path*.

    Returns an empty string when the file has no such line. The default
    *path* is the value of ``chunks_jsonl`` at definition time.
    """
    with open(path, encoding="utf-8") as handle:
        wanted = next(
            (raw for position, raw in enumerate(handle) if position == chunk_id),
            None,
        )
    if wanted is None:
        return ""
    return json.loads(wanted)["text"]

# Split after sentence-ending punctuation followed by whitespace.
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')


def extractive_answer(query: str, hits: list, max_chars=1200, top_k=3):
    """Build a naive extractive answer from the text of the top hits.

    For each of the first *top_k* hits, the chunk text is split into sentences
    and up to 5 sentences sharing the most words (3+ characters, case
    insensitive) with *query* are kept; when no sentence matches, the first 3
    sentences are used. Snippets are truncated so that their total length does
    not exceed *max_chars*.

    Returns:
        ``(answer, sources)`` - the bullet-list answer (or a fixed fallback
        message when no snippet was produced) and one source line per hit.
    """
    kept = hits[:top_k]
    # The query terms do not depend on the hit: compute them once.
    q_terms = {w.lower() for w in re.findall(r"\w+", query) if len(w) >= 3}

    snippets = []
    used = 0
    for h in kept:
        # Stop before reading another chunk once the character budget is spent.
        room = max_chars - used
        if room <= 0:
            break
        txt = get_chunk_text(h["id"]).strip()
        if not txt:
            continue
        # Naive selection: sentences that contain words of the query.
        sentences = _SENT_SPLIT.split(txt)
        scored = []
        for s in sentences:
            overlap = len(q_terms & {w.lower() for w in re.findall(r"\w+", s)})
            if overlap:
                scored.append((overlap, s))
        scored.sort(key=lambda x: x[0], reverse=True)
        picked = " ".join([s for _, s in scored[:5]]) or " ".join(sentences[:3])

        piece = picked[:room]
        snippets.append(f"- {piece}")
        used += len(piece)

    answer = "\n".join(snippets) if snippets else "(pas de phrase très spécifique trouvée; voir extraits ci-dessous)"
    sources = [f"pages {h['pages']} (score {h['score']:.3f}, id {h['id']})" for h in kept]
    return answer, sources

# Example with the query used above: build the extractive answer from the kept hits.
q = "procédure d’installation sécurisée du composant X"
answer, sources = extractive_answer(q, top_hits, max_chars=1200, top_k=3)

# Print the answer (capped at 1200 characters) followed by one line per source.
print("=== RÉPONSE (extraction) ===\n", answer[:1200])
print("\n=== SOURCES ===")
for s in sources: print("•", s)


=== RÉPONSE (extraction) ===
 - WHITE PAPER Leveraging Security - Using the SEL RTAC's Built-In Security Features Darrin Kite INTRODUCTION Cyberthreats to critical infrastructure represent a growing and persistent risk to a nation's security and prosperity. The SEL Real-Time Automation Controller (RTAC) product platform serves as the information hub for substations in electric utilities as well as many other critical industries. It provides essential services, such as data aggregation, logic processing, oscillography, event report collection, and secure engineering access.
- 11 SECTION 7: SECURITY AUDITING - EVENT MONITORING AND REPORTING Tracking and archiving events is an important function provided by a data concentrator. In the event that an actual or suspected unauthorized intrusion is detected, a device's logs provide an essential piece of forensic evidence. In the SEL RTAC, the SOE log can store up to 30,000 log items.
- 8 Figure 7 An ACSELERATOR RTAC Project Configured for Secu