In [None]:
# Install the Python packages (PyMuPDF, pdfminer.six, pytesseract, Pillow) and the
# system-level Tesseract OCR binary; stdout of the apt step is discarded.
# NOTE(review): pdfminer.six, pytesseract and pillow are not referenced by the cells
# visible in this notebook — confirm they are still needed before keeping them.
!pip -q install pymupdf pdfminer.six pytesseract pillow
!apt -q install -y tesseract-ocr >/dev/null






In [None]:
# Colab: pin PyMuPDF to an exact version so the extraction results stay reproducible
# (the processing cell imports it under the name `fitz`).
!pip -q install pymupdf==1.24.9


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [30]:
# === AUTO LIB : Scan dossier -> PDF->JSONL->FIXED + index ===
!pip -q install pymupdf==1.24.9
import os, re, json, hashlib, unicodedata, time, fitz
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple

# ---------- Normalisation forte ----------
# Zero-width / invisible code points that are removed outright.
ZW = {"\u200B", "\u200C", "\u200D", "\u2060", "\uFEFF"}
# Typographic ligatures and their plain-letter replacements.
LIG = {
    "ﬀ": "ff", "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬅ": "ft", "ﬆ": "st",
}
# Curly quotes and guillemets folded to ASCII quotes.
QUO = {
    "“": '"', "”": '"', "„": '"', "«": '"', "»": '"',
    "‘": "'", "’": "'", "‚": "'",
}
# Dash / minus variants folded to the ASCII hyphen-minus.
DASH = {
    "‒": "-", "‐": "-", "–": "-", "—": "-", "−": "-",
    "\u2010": "-", "\u2011": "-", "\u2012": "-", "\u2212": "-",
}


def norm_strong(s: str) -> str:
    """Aggressively normalise ``s`` so that two extractions can be compared.

    Steps, in order: NFKC normalisation; removal of the code points in ``ZW``;
    non-breaking space -> space; substitutions from ``LIG``, ``QUO`` and
    ``DASH``; ellipsis -> ``...``; whitespace removed before closing
    punctuation / ``%`` and after opening brackets; runs of spaces and tabs
    collapsed to one space; leading/trailing whitespace stripped.

    Newlines that are not adjacent to punctuation are left in place (only
    spaces and tabs are collapsed).
    """
    text = unicodedata.normalize("NFKC", s)

    for invisible in ZW:
        text = text.replace(invisible, "")
    text = text.replace("\xa0", " ")

    for table in (LIG, QUO, DASH):
        for source, target in table.items():
            text = text.replace(source, target)
    text = text.replace("…", "...")

    # Spacing rules, applied sequentially in this exact order.
    spacing_rules = (
        (r"\s+([,.;:!?%)\]}])", r"\1"),  # no whitespace before closing punctuation
        (r"([([{])\s+", r"\1"),          # no whitespace after an opening bracket
        (r"\s+%", "%"),                  # percent sign sticks to the number
        (r"[ \t]+", " "),                # collapse runs of spaces / tabs
    )
    for pattern, replacement in spacing_rules:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def _sha(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

# ---------- Extraction "lossless" ----------
@dataclass
class Word:
    """One word record, built from a tuple of ``page.get_text("words")``."""

    # Word string as extracted.
    text: str
    # Coordinates reported alongside the word (first four tuple values).
    x0: float
    y0: float
    x1: float
    y1: float
    # Block / line / word numbers reported by the extractor; used as sort keys.
    block: int
    line: int
    wno: int

def get_words_lossless(page):
    """Return the page's words as a de-duplicated, ordered list of ``Word``.

    Each tuple from ``page.get_text("words")`` becomes a ``Word``. Tuples with
    an empty text are dropped, and a tuple is treated as a duplicate when its
    coordinates (rounded to 2 decimals), text and block/line/word numbers all
    match one already seen. The result is ordered by
    ``(block, line, wno, x0, y0)``.
    """
    unique = {}
    for x0, y0, x1, y1, txt, block_no, line_no, word_no in page.get_text("words"):
        if not txt:
            continue
        signature = (
            round(x0, 2), round(y0, 2), round(x1, 2), round(y1, 2),
            txt, block_no, line_no, word_no,
        )
        if signature not in unique:
            # Keep the first occurrence with its unrounded coordinates.
            unique[signature] = Word(txt, x0, y0, x1, y1, block_no, line_no, word_no)

    return sorted(
        unique.values(),
        key=lambda word: (word.block, word.line, word.wno, word.x0, word.y0),
    )

# Tokens that are glued to the previous word (no space before them).
PUNCT_STICKY = {",", ".", ";", ";", ":", "!", "?", ")", "]", "}", "%"}
# Characters after which the next word is glued (no space after them).
PUNCT_OPEN = {"(", "[", "{", "«", "“", "‘"}


def join_words_into_text(words: List[Word]) -> str:
    """Concatenate ``words`` into one string and normalise it.

    A single space separates consecutive words, except that no space is
    inserted before a token that is exactly one of ``PUNCT_STICKY`` or after
    a token whose last character is in ``PUNCT_OPEN``. The joined string is
    passed through ``norm_strong`` before being returned.
    """
    pieces = []
    previous = None
    for word in words:
        token = word.text
        glued = (
            not pieces
            or token in PUNCT_STICKY
            or bool(previous and previous[-1] in PUNCT_OPEN)
        )
        pieces.append(token if glued else " " + token)
        previous = token
    return norm_strong("".join(pieces))

def page_to_json(page_idx, page):
    """Build the JSON-serialisable record for one page.

    The record holds the page index, the page size taken from ``page.rect``,
    the extracted words as plain dicts, the joined/normalised text, and a
    ``stats`` entry (word count, character count, SHA-256 of the text).
    """
    page_words = get_words_lossless(page)
    page_text = join_words_into_text(page_words)

    record = {"page_index": page_idx}
    record["size"] = {"width": page.rect.width, "height": page.rect.height}
    record["words"] = [asdict(word) for word in page_words]
    record["text"] = page_text
    record["stats"] = {
        "n_words": len(page_words),
        "n_chars": len(page_text),
        "sha256": _sha(page_text),
    }
    return record

# ---------- Conversion + manifest ----------
def pdf_to_jsonl(pdf_path:str, out_jsonl:str, manifest_path:str):
    """Convert a PDF to a per-page JSONL file and write a comparison manifest.

    For every page, the record from ``page_to_json`` is written as one JSON
    line to ``out_jsonl``. The page text built from words ("ours") is compared
    with a reference: ``page.get_text("text")`` with line breaks turned into
    spaces and passed through ``norm_strong``. The per-page comparison and the
    global character counts are written to ``manifest_path`` as JSON.

    Args:
        pdf_path: Path of the PDF to open with ``fitz``.
        out_jsonl: Destination JSONL path (parent directory is created).
        manifest_path: Destination manifest path (parent directory is created).

    Raises:
        Whatever ``fitz.open``, page extraction or file I/O raise; the PDF
        document is closed in every case.
    """
    # Both outputs may live in different folders; make sure each parent exists.
    for target in (out_jsonl, manifest_path):
        os.makedirs(os.path.dirname(target) or ".", exist_ok=True)

    report=[]; ours_tot=0; ref_tot=0; ok=0
    doc = fitz.open(pdf_path)
    try:
        with open(out_jsonl,"w",encoding="utf-8") as f:
            for i in range(len(doc)):
                page=doc[i]
                pj=page_to_json(i,page)
                ours = pj["text"]
                ref  = norm_strong(page.get_text("text").replace("\r","\n").replace("\n"," "))
                # Equal strings have equal digests, so a direct comparison is
                # enough and avoids hashing both texts again.
                same = ours==ref
                if same: ok+=1
                ours_tot += len(ours); ref_tot += len(ref)
                f.write(json.dumps(pj, ensure_ascii=False)+"\n")
                # Coverage = length of the common prefix relative to the reference.
                pref=len(os.path.commonprefix([ours,ref])); cov=round(pref/max(1,len(ref)),6)
                report.append({"page_index":i,"same_len":len(ours)==len(ref),"same_sha":same,
                               "coverage_char_ratio":cov,"n_chars_ours":len(ours),"n_chars_ref":len(ref)})
    finally:
        # Release the document even when a page fails to extract.
        doc.close()

    manifest={"pdf_path":pdf_path,"n_pages":len(report),"pages_ok_same_sha":ok,
              "global_chars_ours":ours_tot,"global_chars_ref":ref_tot,
              "global_same_charcount":ours_tot==ref_tot,"pages_report":report}
    with open(manifest_path,"w",encoding="utf-8") as mf:
        json.dump(manifest, mf, ensure_ascii=False, indent=2)

# ---------- Fix pages douteuses (fallback contrôlé) ----------
def make_authoritative_jsonl(pdf_path:str, in_jsonl:str, out_jsonl:str, cov_threshold=0.98):
    """Rewrite a page JSONL, falling back to the reference text on doubtful pages.

    Each record of ``in_jsonl`` is compared with the reference text of the same
    page (``get_text("text")``, line breaks turned into spaces, normalised with
    ``norm_strong``). When the texts differ and the common-prefix coverage is
    below ``cov_threshold``, the record's ``text`` is replaced by the reference,
    its ``stats`` character count and SHA-256 are refreshed to match the new
    text, and ``debug.fallback_applied`` / ``debug.coverage_before`` are set.
    All records are written to ``out_jsonl``.

    Args:
        pdf_path: Path of the source PDF.
        in_jsonl: JSONL produced for that PDF (one page record per line).
        out_jsonl: Destination JSONL path.
        cov_threshold: Minimum common-prefix ratio below which the fallback
            is applied.

    Raises:
        Whatever ``fitz.open``, JSON decoding, page lookup or file I/O raise;
        the PDF document is closed in every case.
    """
    doc=fitz.open(pdf_path)
    try:
        with open(in_jsonl,encoding="utf-8") as fi, open(out_jsonl,"w",encoding="utf-8") as fo:
            for line in fi:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline added by hand).
                    continue
                o=json.loads(line)
                p=o["page_index"]
                ours=norm_strong(o["text"])
                ref =norm_strong(doc[p].get_text("text").replace("\r","\n").replace("\n"," "))
                if ours!=ref:
                    pref=len(os.path.commonprefix([ours,ref])); cov=pref/max(1,len(ref))
                    if cov<cov_threshold:
                        o["text"]=ref
                        # Keep the text-derived stats consistent with the new text.
                        stats=o.get("stats")
                        if isinstance(stats, dict):
                            stats["n_chars"]=len(ref)
                            stats["sha256"]=_sha(ref)
                        debug=o.setdefault("debug",{})
                        debug["fallback_applied"]=True
                        debug["coverage_before"]=round(cov,6)
                fo.write(json.dumps(o, ensure_ascii=False)+"\n")
    finally:
        # Release the document even when a record fails to process.
        doc.close()

# ---------- Utils scan ----------
def list_pdfs(input_dir: str) -> List[str]:
    """Return the sorted paths of all ``.pdf`` files found under ``input_dir``.

    The directory tree is walked recursively and the extension is matched
    case-insensitively.
    """
    return sorted(
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(input_dir)
        for name in names
        if name.lower().endswith(".pdf")
    )

def relpath_without_ext(path: str, base_dir: str) -> str:
    """Return ``path`` relative to ``base_dir`` with its last extension removed.

    Example: ``base/sub/a/b/c.pdf`` relative to ``base`` gives ``sub/a/b/c``.
    """
    stem, _extension = os.path.splitext(os.path.relpath(path, base_dir))
    return stem

# ---------- Orchestrateur full-auto ----------
def process_pdf(pdf_path:str, input_dir:str, output_dir:str, cov_threshold=0.98)->Dict:
    """Convert one PDF to JSONL, manifest and fixed JSONL under ``output_dir``.

    The output folder mirrors the PDF's location relative to ``input_dir``.
    If the fixed JSONL already exists and is at least as recent as the PDF,
    nothing is recomputed and the stored manifest (if any) is returned.

    The fixed JSONL doubles as the "up-to-date" marker, so it is written to a
    temporary file first and only moved into place once complete; an
    interrupted run therefore never leaves a truncated file that a later run
    would mistake for a finished result.

    Args:
        pdf_path: Path of the PDF to process.
        input_dir: Root folder the PDF was found under.
        output_dir: Root folder for the generated files.
        cov_threshold: Forwarded to ``make_authoritative_jsonl``.

    Returns:
        A dict with ``pdf``, ``status`` and ``outputs``; plus ``manifest`` when
        skipped, or ``pages_total`` / ``pages_fixed`` when processed.
    """
    rel = relpath_without_ext(pdf_path, input_dir)
    out_dir = os.path.join(output_dir, os.path.dirname(rel))
    base    = os.path.basename(rel)
    os.makedirs(out_dir, exist_ok=True)

    out_jsonl   = os.path.join(out_dir, f"{base}.jsonl")
    out_manifest= os.path.join(out_dir, f"{base}_manifest.json")
    out_fixed   = os.path.join(out_dir, f"{base}_fixed.jsonl")
    outputs = {"jsonl": out_jsonl, "fixed_jsonl": out_fixed, "manifest": out_manifest}

    # Skip when already up to date (fixed file at least as recent as the PDF).
    if os.path.exists(out_fixed) and os.path.getmtime(out_fixed) >= os.path.getmtime(pdf_path):
        # Reuse the stored manifest for stats when available, otherwise an empty one.
        manifest = {}
        if os.path.exists(out_manifest):
            with open(out_manifest, encoding="utf-8") as mf: manifest = json.load(mf)
        return {"pdf": pdf_path, "status":"skipped(up-to-date)", "outputs": outputs,
                "manifest": manifest}

    # Step 1: pdf -> jsonl + manifest
    pdf_to_jsonl(pdf_path, out_jsonl, out_manifest)

    # Step 2: authoritative fixed file, published atomically.
    tmp_fixed = out_fixed + ".tmp"
    try:
        make_authoritative_jsonl(pdf_path, out_jsonl, tmp_fixed, cov_threshold=cov_threshold)
        os.replace(tmp_fixed, out_fixed)
    finally:
        # After a successful replace the temp file is gone; on failure remove the partial file.
        if os.path.exists(tmp_fixed):
            os.remove(tmp_fixed)

    # Summarise which pages received the fallback text.
    fixed_pages=[]; total_pages=0
    with open(out_fixed, encoding="utf-8") as f:
        for line in f:
            o=json.loads(line); total_pages+=1
            if o.get("debug",{}).get("fallback_applied"): fixed_pages.append(o["page_index"])

    return {"pdf": pdf_path, "status":"processed", "pages_total": total_pages,
            "pages_fixed": fixed_pages, "outputs": outputs}

def process_all(input_dir:str="/content/in", output_dir:str="/content/out", cov_threshold=0.98)->Dict:
    """Process every PDF under ``input_dir`` and write ``index.json`` in ``output_dir``.

    PDFs are handled one by one with ``process_pdf``. A failure on one file does
    not stop the batch: it is printed and recorded in that file's entry as a
    status starting with ``"error: "`` followed by the exception type and
    message.

    Args:
        input_dir: Folder scanned recursively for PDFs.
        output_dir: Folder receiving the generated files and ``index.json``.
        cov_threshold: Forwarded to ``process_pdf``.

    Returns:
        The summary dict that was written to ``index.json`` (input/output
        folders, number of PDFs found, one entry per PDF).
    """
    os.makedirs(output_dir, exist_ok=True)
    pdfs = list_pdfs(input_dir)
    summary = {"input_dir": input_dir, "output_dir": output_dir, "count": len(pdfs), "files": []}
    for i,p in enumerate(pdfs,1):
        print(f"[{i}/{len(pdfs)}] {p}")
        try:
            res = process_pdf(p, input_dir, output_dir, cov_threshold)
        except Exception as e:
            # Best-effort batch: keep going, but keep the exception type so that
            # message-less or terse errors (e.g. KeyError) remain diagnosable.
            res = {"pdf": p, "status": f"error: {type(e).__name__}: {e}"}
            print("   ", res["status"])
        summary["files"].append(res)

    # index.json keeps a trace of everything that was processed.
    index_path = os.path.join(output_dir, "index.json")
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    print("OK index:", index_path)
    return summary


In [31]:
# Folders scanned for PDFs and receiving the outputs (both point to /content/ here).
INPUT_DIR = "/content/"
OUTPUT_DIR = "/content/"

# Create the folders if needed.
for folder in (INPUT_DIR, OUTPUT_DIR):
    os.makedirs(folder, exist_ok=True)

# One call = scan + conversion + fix + index.
summary = process_all(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR, cov_threshold=0.98)

# Quick overview of the first ten results.
print("Trouvés:", summary["count"], "PDFs")
for entry in summary["files"][:10]:
    fixed_path = entry.get("outputs", {}).get("fixed_jsonl")
    print(entry["status"], "->", fixed_path)


[1/4] /content/0018_LeveragingSecurity_DK_20160111.pdf
[2/4] /content/0019_AutomatingProt_DK_20160111.pdf
[3/4] /content/511399-UEN_CSDG_670_2p2.pdf
[4/4] /content/SIP5_Security_V10.00_Manual_C081-G_en.pdf
OK index: /content/index.json
Trouvés: 4 PDFs
processed -> /content/0018_LeveragingSecurity_DK_20160111_fixed.jsonl
processed -> /content/0019_AutomatingProt_DK_20160111_fixed.jsonl
processed -> /content/511399-UEN_CSDG_670_2p2_fixed.jsonl
processed -> /content/SIP5_Security_V10.00_Manual_C081-G_en_fixed.jsonl
