In [2]:
!pip install pdfplumber pymupdf pdf2image pytesseract pillow tqdm
# Ubuntu/Colab cần thêm Tesseract + poppler (để render PDF->image):
# apt-get -y update && apt-get -y install tesseract-ocr poppler-utils


Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m


#Not important cells

In [None]:
import os, re, json, shutil, math
from pathlib import Path
from typing import Tuple, List, Dict, Any

import pdfplumber
import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from tqdm import tqdm

# ==== cấu hình ====
INPUT_DIR = "/content/drive/MyDrive/Project-AI/Data/pdf-data"   # <--- sửa đường dẫn thư mục PDF của bạn
OUT_ROOT  = "/content/drive/MyDrive/Project-AI/Data/data/cases"
DPI       = 300           # render trang cho OCR/cắt ảnh
LANG      = "eng"         # ngôn ngữ OCR
SAVE_SOURCE_COPY = False  # True nếu muốn copy file pdf sang case dir

# ==== tiện ích ====
def safe_slug(s: str) -> str:
    return re.sub(r"[^a-zA-Z0-9._-]+", "_", s).strip("_")[:150]

def parse_case_id_title(pdf_path: Path) -> Tuple[str, str]:
    """
    Ép '001---Title.pdf' -> ('001', 'Title');
    nếu không khớp, tạo case_id theo index và dùng tên file làm title.
    """
    name = pdf_path.stem
    m = re.match(r"^(\d+)\s*---\s*(.+)$", name)
    if m:
        return m.group(1).zfill(3), m.group(2)
    # fallback
    return None, name

def ensure_dirs(case_dir: Path):
    (case_dir / "pages").mkdir(parents=True, exist_ok=True)
    (case_dir / "images").mkdir(parents=True, exist_ok=True)

def has_text_layer(pdf_path: Path, sample_pages: int = 3) -> bool:
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for p in pdf.pages[:sample_pages]:
                t = p.extract_text() or ""
                if t.strip():
                    return True
    except Exception:
        pass
    return False

# ==== trích text ====
def extract_text_via_text_layer(pdf_path: Path) -> str:
    texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            t = p.extract_text() or ""
            texts.append(t.strip())
    return ("\n\n--- PAGE BREAK ---\n\n").join(texts).strip()

def extract_text_via_ocr(pdf_path: Path, pages_dir: Path, dpi: int = 300, lang: str = "eng") -> str:
    pages_img: List[Image.Image] = convert_from_path(str(pdf_path), dpi=dpi)
    out = []
    for i, img in enumerate(pages_img, 1):
        # lưu page png
        page_png = pages_dir / f"{pages_dir.parent.name}_page_{i}.png"
        img.save(page_png)

        # OCR
        gray = img.convert("L")
        txt = pytesseract.image_to_string(gray, lang=lang, config="--psm 6")
        out.append((txt or "").strip())
    return ("\n\n--- PAGE BREAK ---\n\n").join(out).strip()

# ==== trích images (figure) bằng PyMuPDF ====
def extract_page_images(pdf_path: Path, images_dir: Path, pages_dir: Path, dpi: int = 300) -> List[Dict[str, Any]]:
    """
    Cắt các block image theo bbox -> file PNG; đồng thời render PNG full trang để đối chiếu.
    Trả về list metadata cho từng ảnh.
    """
    meta_images = []
    doc = fitz.open(str(pdf_path))
    for pno in range(len(doc)):
        page = doc[pno]

        # render và lưu ảnh trang (nếu chưa có)
        page_png = pages_dir / f"{pages_dir.parent.name}_page_{pno+1}.png"
        if not page_png.exists():
            pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72))  # scale theo DPI
            pix.save(str(page_png))

        # lấy blocks, tìm block là image
        d = page.get_text("dict")  # blocks: {'type':0=text, 1=image}
        for bi, block in enumerate(d.get("blocks", []), start=1):
            if block.get("type") != 1:
                continue
            x0, y0, x1, y1 = block["bbox"]
            rect = fitz.Rect(x0, y0, x1, y1)

            # crop ảnh theo bbox
            pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72), clip=rect, alpha=False)
            out_name = f"{images_dir.parent.name}_p{pno+1}_fig_{bi}.png"
            out_path = images_dir / out_name
            pix.save(str(out_path))

            meta_images.append({
                "page_number": pno+1,
                "figure_index": bi,
                "bbox": [x0, y0, x1, y1],
                "path": str(out_path),
            })
    doc.close()
    return meta_images

# ==== xử lý 1 file PDF -> 1 folder case ====
def process_pdf(pdf_path: Path, out_root: Path, idx_fallback: int) -> Dict[str, Any]:
    case_id, title = parse_case_id_title(pdf_path)
    if case_id is None:
        case_id = str(idx_fallback).zfill(3)
    case_dir = out_root / case_id
    ensure_dirs(case_dir)

    # copy source.pdf (tuỳ chọn)
    if SAVE_SOURCE_COPY:
        shutil.copy2(pdf_path, case_dir / "source.pdf")
    else:
        # vẫn ghi đường dẫn nguồn trong manifest
        pass

    # 1) Text: ưu tiên text layer -> fallback OCR
    if has_text_layer(pdf_path):
        raw = extract_text_via_text_layer(pdf_path)
        # cũng render trang để bạn có file png theo dõi (không OCR)
        try:
            _ = convert_from_path(str(pdf_path), dpi=DPI)
            for i, img in enumerate(_, 1):
                img.save(case_dir / "pages" / f"{case_dir.name}_page_{i}.png")
        except Exception:
            pass
    else:
        raw = extract_text_via_ocr(pdf_path, case_dir / "pages", dpi=DPI, lang=LANG)

    (case_dir / "raw.txt").write_text(raw, encoding="utf-8")

    # 2) Images: crop theo bbox (figure) trên mỗi trang
    imgs_meta = extract_page_images(pdf_path, case_dir / "images", case_dir / "pages", dpi=DPI)

    # 3) Manifest
    manifest = {
        "case_id": int(case_id),
        "case_title": title,
        "source_pdf": str(pdf_path.resolve()),
        "n_pages": len(list((case_dir / "pages").glob("*.png"))),
        "n_images": len(imgs_meta),
        "images": imgs_meta
    }
    (case_dir / "manifest.json").write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
    return manifest

# ==== chạy hàng loạt ====
def process_folder(input_dir: str, out_root: str):
    in_dir = Path(input_dir)
    out_dir = Path(out_root)
    out_dir.mkdir(parents=True, exist_ok=True)

    pdfs = sorted(in_dir.glob("*.pdf"))
    manifests = []

    for i, pdf in enumerate(tqdm(pdfs, desc="Processing PDFs"), start=1):
        try:
            m = process_pdf(pdf, out_dir, idx_fallback=i)
            manifests.append(m)
        except Exception as e:
            print(f"[WARN] Failed {pdf.name}: {e}")

    # index file tổng
    (out_dir / "index.json").write_text(json.dumps(manifests, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[DONE] {len(manifests)}/{len(pdfs)} PDFs processed. Output -> {out_dir}")

# === RUN ===
process_folder(INPUT_DIR, OUT_ROOT)



Processing PDFs: 100%|██████████| 93/93 [02:46<00:00,  1.79s/it]

[DONE] 93/93 PDFs processed. Output -> /content/drive/MyDrive/Project-AI/Data/data/cases





In [None]:
# pip install pdfplumber pymupdf regex
import re, pdfplumber, fitz, os
from pathlib import Path

def extract_text_pdfplumber(pdf_path, x_tol=1.0, y_tol=3.0):
    out = []
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            t = p.extract_text(x_tolerance=x_tol, y_tolerance=y_tol) or ""
            out.append(t)
    return "\n\n--- PAGE BREAK ---\n\n".join(out)

def extract_text_pymupdf_blocks(pdf_path):
    doc = fitz.open(pdf_path)
    paras = []
    for page in doc:
        blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES|fitz.TEXT_PRESERVE_WHITESPACE)
        # blocks: (x0,y0,x1,y1,text,_,_,_)
        blocks = sorted(blocks, key=lambda b: (round(b[1],1), round(b[0],1)))
        for b in blocks:
            txt = (b[4] or "").strip()
            if txt:
                paras.append(txt)
    doc.close()
    return "\n\n".join(paras)

def clean_text(s: str) -> str:
    # loại rác thường gặp
    s = re.sub(r"\(cid:\d+\)", " ", s)
    s = re.sub(r"\bCHAPTER\s+\d+\b.*?\n", "\n", s, flags=re.IGNORECASE)
    s = re.sub(r"\b(Fig|Figure)\.\s*\d+(\.\d+)?[^\n]*\n", "\n", s)
    s = re.sub(r"\n\s*Table\s*\d+(\.\d+)?[^\n]*\n", "\n", s, flags=re.IGNORECASE)

    # bỏ hyphen ngắt dòng và nối
    s = s.replace("-\n", "")
    # đổi ngắt dòng đơn trong câu thành khoảng trắng
    s = re.sub(r"(?<!\n)\n(?!\n)", " ", s)
    # giữ lại ngắt đoạn khi có 2+ dòng trống
    s = re.sub(r"\n{3,}", "\n\n", s)

    # chèn khoảng trắng bị mất do PDF
    s = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", s)            # camelCase → camel Case
    s = re.sub(r"(?<=[A-Za-z])(?=[0-9])", " ", s)         # word123 → word 123
    s = re.sub(r"(?<=[0-9])(?=[A-Za-z])", " ", s)         # 123mg → 123 mg
    s = re.sub(r"(?<=[A-Za-z])(?=[°])", " ", s)           # C° → C °
    s = re.sub(r"(?<=\d)(?=%)", " ", s)                   # 30% → 30 %
    s = re.sub(r"(?<=\d)(?=[A-Za-z/])", " ", s)           # 6200U/L → 6200 U/L
    s = re.sub(r"(?<=[A-Za-z])(?=\()", " ", s)            # word( → word (
    s = re.sub(r"\s{2,}", " ", s)

    # dấu chấm dính chữ: Lima.Eightdaysprior → Lima. Eight days prior
    s = re.sub(r"(?<=[a-zA-Z0-9])\.(?=[A-Za-z])", ". ", s)

    # chuẩn hóa trang/break
    s = re.sub(r"\s*--- PAGE BREAK ---\s*", "\n\n", s)

    return s.strip()

def smart_extract(pdf_path: str) -> str:
    pdf_path = str(pdf_path)
    try:
        raw = extract_text_pdfplumber(pdf_path, x_tol=1.2, y_tol=3.0)
        if len(raw.strip()) < 50:  # quá ít chữ -> thử PyMuPDF
            raise ValueError("too short")
    except Exception:
        raw = extract_text_pymupdf_blocks(pdf_path)
    return clean_text(raw)

def batch_extract(input_dir: str, out_dir: str):
    os.makedirs(out_dir, exist_ok=True)
    for pdf in sorted(Path(input_dir).glob("*.pdf")):
        case_id = pdf.stem.split("---")[0].zfill(3) if "---" in pdf.stem else pdf.stem
        text = smart_extract(pdf)
        out_path = Path(out_dir) / f"{case_id}_raw.txt"
        out_path.write_text(text, encoding="utf-8")
        print("Saved:", out_path)

# ví dụ chạy
batch_extract("/content/drive/MyDrive/Project-AI/Data/pdf-data", "/content/drive/MyDrive/Project-AI/Data/raw_extract_data_2")


Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/001_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/010_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/011_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/012_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/013_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/014_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/015_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/016_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/017_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/019_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/002_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/020_raw.txt
Saved: /content/drive/MyDrive/Project-AI/Data/raw_extract_data_2/021_raw.txt

In [None]:
!apt-get -y update
!apt-get -y install poppler-utils
# nếu dùng Windows/Mac local: cài Poppler rồi đặt đường dẫn vào poppler_path trong convert_from_path


0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Waiting for headers] [Wai                                                                               Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.2 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [1

#Extract data to schema 11 fields jsons using vison llm

In [None]:
from pathlib import Path

OUT_ROOT = "/content/drive/MyDrive/Project-AI/Data/extracted_data_2"  # ví dụ
DEBUG_LOG_DIR = Path(OUT_ROOT) / "_vlm_logs"

# Tạo thư mục output + log ngay từ đầu
Path(OUT_ROOT).mkdir(parents=True, exist_ok=True)
DEBUG_LOG_DIR.mkdir(parents=True, exist_ok=True)

def _save_debug(case_id: str, kind: str, content: str):
    try:
        DEBUG_LOG_DIR.mkdir(parents=True, exist_ok=True)  # đảm bảo tồn tại
        p = DEBUG_LOG_DIR / f"{str(case_id)}.{kind}.txt"
        p.write_text(content if isinstance(content, str) else str(content), encoding="utf-8")
    except Exception as e:
        # Không để việc ghi log làm hỏng pipeline
        print(f"[debug-warn] cannot write log {case_id}.{kind}: {e}")


def _extract_json_loose(text: str, json_keys: list[str]) -> dict:
    import re, json
    if not text:
        raise ValueError("empty VLM output")

    # 1) nếu model bọc ```json ... ```
    m = re.search(r"```json\s*(\{.*?\})\s*```", text, flags=re.S)
    if m:
        text = m.group(1)

    # 2) lấy khối JSON “lớn nhất” (từ { … } cuối cùng)
    if "{" in text and "}" in text:
        # tham lam rồi lùi về ngoặc đóng cuối
        m2 = re.search(r"\{.*\}", text, flags=re.S)
        if m2:
            cand = m2.group(0)
        else:
            # fallback lấy khối ở cuối
            m3 = re.findall(r"\{.*\}", text, flags=re.S)
            cand = m3[-1] if m3 else text
    else:
        cand = text

    obj = json.loads(cand)

    # 3) nắn về đúng 11 key (thiếu thì thêm null, thừa thì bỏ)
    for k in json_keys:
        obj.setdefault(k, None)
    obj = {k: (obj.get(k) if isinstance(obj.get(k), (str, type(None))) else str(obj.get(k)))
           for k in json_keys}
    return obj


In [None]:
# -*- coding: utf-8 -*-
import os, json, base64, io, re
from pathlib import Path
from typing import List, Dict, Any
from dataclasses import dataclass
from tqdm import tqdm

# PDF/IMG libs
from pdf2image import convert_from_path   # cần Poppler khi chạy local (xem ghi chú)
from PIL import Image
import fitz  # PyMuPDF

# ================== CẤU HÌNH CHÍNH ==================
# Thư mục PDF gốc và nơi lưu kết quả
INPUT_DIR = "/content/drive/MyDrive/Project-AI/Data/pdf-data"         # <--- ĐIỀN THƯ MỤC PDF CỦA BẠN
# OUT_ROOT  = "/content/drive/MyDrive/Project-AI/Data/extracted_data"

DPI_PAGES = 300
MAX_PAGES_FOR_VLM = 3
RETRIES = 2

# LM Studio (OpenAI-compatible) qua ngrok
NGROK_BASE_URL = "https://tammara-stalkless-portentously.ngrok-free.dev"  # <--- ĐỔI THEO NGROK CỦA BẠN
OPENAI_BASE_URL = f"{NGROK_BASE_URL}/v1"
OPENAI_API_KEY  = "lm-studio"  # chuỗi giả – bắt buộc có, không cần bí mật

# Model Vision đang chạy trong LM Studio (đúng như ảnh bạn chụp)
MODEL_NAME = "qwen2.5-vl-7b-instruct"

# 11 khoá JSON cần trích
JSON_KEYS = [
    "patient_information","chief_complaint","history_of_present_illness",
    "exposure_and_epidemiology","vitals","physical_exam","labs_and_diagnostics",
    "differential_diagnosis","management_and_clinical_course","final_diagnosis","disease_name_short"
]

# ================== PROMPT ==================
TROPID_SYSTEM_PROMPT = r"""# TropID Big-Chunk Case → JSON Extractor (Prompt)
## System
You are **TropID-Extractor**, an expert clinical information extractor for **tropical & infectious diseases**.
Your task: **read a free-text clinical case** and return a **single JSON object** where **each section is one coherent full-text block** (no bulletizing into tiny subfields).
If a section is not present, set it to `null`. **Do not guess.**
**Output ONLY valid JSON** — no preamble, no commentary.
### Formatting & Safety Rules
- **One JSON object only.**
- **Full-text blocks:** Each field below must be a **cohesive paragraph** (or short multi-sentence block) stitched from the case text; paraphrase minimally, preserve clinical meaning, and **do not invent** missing details.
- **Attribution discipline:** Prefer exact phrases from the source for key facts (fever pattern, exposures, test names, titers) but keep the prose readable.
- **Units & names:** Keep units and proper names as written (°C/°F, NS1, thick smear, RDT, species, titers/CT values).
- **Privacy:** Exclude any direct identifiers if present.
- **Uncertainty:** If the case explicitly says something is “unclear/unknown,” include that wording.
- **Final diagnosis:** Write a concise paragraph that **states the diagnosis, the causative agent if given, and the evidence** (labs/imaging/epidemiology/response to therapy).
- **Disease name (short):** After `final_diagnosis`, fill `disease_name_short` with the **best disease name only** (e.g., “Dengue fever”, “Falciparum malaria”, “Scrub typhus”).
---
Return **only** this JSON schema (exact keys, same order):
```json
{
  "patient_information": null,
  "chief_complaint": null,
  "history_of_present_illness": null,
  "exposure_and_epidemiology": null,
  "vitals": null,
  "physical_exam": null,
  "labs_and_diagnostics": null,
  "differential_diagnosis": null,
  "management_and_clinical_course": null,
  "final_diagnosis": null,
  "disease_name_short": null
}
```
### Field guidance (concise)
- **patient_information**: age/sex; relevant comorbidities/immunosuppression; vaccination/allergy info if stated.
- **chief_complaint**: one-line problem + duration.
- **history_of_present_illness**: timeline, key symptoms, pertinent negatives, severity pattern.
- **exposure_and_epidemiology**: residence/travel (place/setting, dates if present), vectors (mosquito/tick), animals, water/food risks, contacts, season/outbreak context, occupation.
- **vitals**: all reported vital signs as text (fever values, BP, HR, RR, SpO₂).
- **physical_exam**: salient systems (skin, HEENT, chest, abdo, neuro, lymph, etc.).
- **labs_and_diagnostics**: CBC trends, key chem/coag, inflammatory markers, microbiology/serology/PCR (assay + result + titer/CT if present), malaria tests, imaging highlights.
- **differential_diagnosis**: succinct narrative of the main alternatives considered, with one-sentence justification for/against each.
- **management_and_clinical_course**: antimicrobials (drug/dose if given), supportive care, procedures; response, complications, outcome.
- **final_diagnosis**: 3–5 sentences: explicit disease name ± causative agent, confirmation method (e.g., NS1+, thick smear species, PCR/serology), and why alternatives were ruled out.
- **disease_name_short**: the disease name only (no agent, no method).
---

---
## User
Extract the following fields as **full-text blocks** from this clinical case:

"""

#==========================================================
import pdfplumber, re

def clean_text_for_prompt(s: str, max_chars: int = 8000) -> str:
    # lược bỏ rác phổ biến để ngắn gọn hơn khi đưa vào VLM
    s = re.sub(r"\(cid:\d+\)", " ", s)
    s = re.sub(r"\b(Fig|Figure)\.\s*\d+(\.\d+)?[^\n]*\n", "\n", s)
    s = re.sub(r"\n\s*Table\s*\d+(\.\d+)?[^\n]*\n", "\n", s, flags=re.IGNORECASE)
    s = s.replace("-\n", "")
    s = re.sub(r"(?<!\n)\n(?!\n)", " ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    s = s.strip()
    # cắt ngắn để không vượt context
    if len(s) > max_chars:
        s = s[:max_chars]
    return s

def extract_text_all_pages(pdf_path: str, max_chars: int = 8000) -> str:
    """
    Nhanh-gọn: ưu tiên text layer bằng pdfplumber; nếu lỗi/ít chữ, fallback sang PyMuPDF blocks.
    Trả về chuỗi đã clean và cắt ngắn để nhét vào prompt cho VLM.
    """
    pdf_path = str(pdf_path)
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = []
            for p in pdf.pages:
                t = p.extract_text() or ""
                if t.strip():
                    pages.append(t.strip())
            text = "\n\n--- PAGE BREAK ---\n\n".join(pages)
    except Exception:
        pass

    if len(text.strip()) < 50:
        import fitz
        doc = fitz.open(pdf_path)
        paras = []
        for page in doc:
            blocks = page.get_text("blocks", flags=fitz.TEXT_PRESERVE_LIGATURES|fitz.TEXT_PRESERVE_WHITESPACE)
            blocks = sorted(blocks, key=lambda b: (round(b[1],1), round(b[0],1)))
            for b in blocks:
                bt = (b[4] or "").strip()
                if bt:
                    paras.append(bt)
        doc.close()
        text = "\n\n".join(paras)

    return clean_text_for_prompt(text, max_chars=max_chars)

#==================================================================
USER_PREFIX = """Extract the following fields as full-text blocks from this clinical case composed of multiple page images. Output ONE JSON object only (no markdown, no extra text). If any section is missing in the case, set it to null."""

# ================== OPENAI CLIENT → LM STUDIO VIA NGROK ==================
from openai import OpenAI
client = OpenAI(base_url=OPENAI_BASE_URL, api_key=OPENAI_API_KEY)

# ================== TIỆN ÍCH PATH & TÊN ==================
def norm_case_id(pdf_path: Path) -> str:
    stem = pdf_path.stem
    if "---" in stem:
        cid = stem.split("---")[0].strip()
        if cid.isdigit():
            return cid.zfill(3)
    return re.sub(r"[^0-9A-Za-z]+", "_", stem)[:32]

def ensure_case_dirs(case_root: Path):
    (case_root / "pages").mkdir(parents=True, exist_ok=True)
    (case_root / "images").mkdir(parents=True, exist_ok=True)

# ================== PDF → PNG (mỗi trang) ==================
def render_pdf_pages_to_pngs(pdf_path: Path, pages_dir: Path, dpi: int = 300) -> List[Path]:
    pages = convert_from_path(str(pdf_path), dpi=dpi)   # nếu chạy local: cần Poppler cài sẵn
    out_paths = []
    for i, img in enumerate(pages, 1):
        out_path = pages_dir / f"{pages_dir.parent.name}_page_{i}.png"
        img.save(out_path)
        out_paths.append(out_path)
    return out_paths

# ================== TRÍCH ẢNH/FIGURE TỪ PDF ==================
def extract_inline_images(pdf_path: Path, images_dir: Path, dpi: int = 300) -> List[Dict[str, Any]]:
    metas = []
    doc = fitz.open(str(pdf_path))
    try:
        for pno in range(len(doc)):
            page = doc[pno]
            d = page.get_text("dict")
            bi = 0
            for block in d.get("blocks", []):
                if block.get("type") != 1:
                    continue
                bi += 1
                x0, y0, x1, y1 = block["bbox"]
                rect = fitz.Rect(x0, y0, x1, y1)
                pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72), clip=rect, alpha=False)
                out_name = f"{images_dir.parent.name}_p{pno+1}_fig_{bi}.png"
                out_path = images_dir / out_name
                pix.save(str(out_path))
                metas.append({
                    "page_number": pno+1,
                    "figure_index": bi,
                    "bbox": [x0, y0, x1, y1],
                    "path": str(out_path),
                })
    finally:
        doc.close()
    return metas

# ================== ẢNH → data URL ==================
def encode_image_base64(img_path: Path) -> str:
    with Image.open(img_path) as im:
        im = im.convert("RGB")
        buf = io.BytesIO()
        im.save(buf, format="PNG")
        return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

# ================== GỌI VLM (LM Studio) → 1 JSON/CASE ==================
def call_vlm_extract_case(page_png_paths, model_name=MODEL_NAME,
                          max_pages=MAX_PAGES_FOR_VLM, retries=RETRIES,
                          pdf_path=None, case_id="unknown"):
    # chọn 1–2 trang là đủ; bạn đã tăng context nhưng vẫn nên tiết kiệm
    pages = page_png_paths[:2] if len(page_png_paths) >= 2 else page_png_paths

    # text tóm gọn toàn case (đừng để quá dài)
    raw_text = extract_text_all_pages(pdf_path, max_chars=8000) if pdf_path else ""

    def build_messages():
        content = [{"type": "text", "text": USER_PREFIX + "\n\n" + raw_text}]
        for p in pages:
            content.append({"type": "image_url", "image_url": {"url": encode_image_base64(p)}})
        return [
            {"role": "system", "content": TROPID_SYSTEM_PROMPT},
            {"role": "user", "content": content},
        ]

    last_err = None
    for attempt in range(retries + 1):
        try:
            resp = client.chat.completions.create(
                model=model_name,
                temperature=0,
                top_p=1,
                messages=build_messages(),
            )
            txt = (resp.choices[0].message.content or "").strip()
            _save_debug(case_id, f"raw_attempt{attempt}", txt)   # LƯU LOG THÔ

            obj = _extract_json_loose(txt, JSON_KEYS)            # VỢT JSON
            _save_debug(case_id, f"parsed_attempt{attempt}", json.dumps(obj, ensure_ascii=False, indent=2))
            return obj

        except Exception as e:
            last_err = e
            _save_debug(case_id, f"error_attempt{attempt}", repr(e))
            continue

    # Nếu vẫn fail: làm “repair pass” text-only (không gửi ảnh), rất ngắn
    try:
        repair_prompt = (
            "You earlier produced a non-JSON answer. "
            "Now REWRITE it as ONE valid JSON object with EXACT keys and order:\n"
            + json.dumps({k: None for k in JSON_KEYS}, ensure_ascii=False)
            + "\nNo preamble, no markdown."
        )
        resp2 = client.chat.completions.create(
            model=model_name, temperature=0, top_p=1,
            messages=[{"role":"system","content":TROPID_SYSTEM_PROMPT},
                      {"role":"user","content":[{"type":"text","text":repair_prompt}]}]
        )
        txt2 = (resp2.choices[0].message.content or "").strip()
        _save_debug(case_id, "repair_raw", txt2)
        obj2 = _extract_json_loose(txt2, JSON_KEYS)
        _save_debug(case_id, "repair_parsed", json.dumps(obj2, ensure_ascii=False, indent=2))
        return obj2
    except Exception as e2:
        _save_debug(case_id, "repair_error", repr(e2))
        raise RuntimeError(f"VLM JSON extraction failed (after repair): {last_err}") from e2


# ================== XỬ LÝ 1 PDF ==================
def process_one_pdf(pdf_path: Path, out_root: Path) -> Dict[str, Any]:
    case_id = norm_case_id(pdf_path)
    case_dir = out_root / case_id
    pages_dir = case_dir / "pages"
    images_dir = case_dir / "images"
    ensure_case_dirs(case_dir)

    # 1) PDF → PNG pages
    page_pngs = render_pdf_pages_to_pngs(pdf_path, pages_dir, dpi=DPI_PAGES)

    # 2) Trích figure/ảnh
    images_meta = extract_inline_images(pdf_path, images_dir, dpi=DPI_PAGES)

    # 3) VLM → 11 trường JSON cho TOÀN CASE
    case_json = call_vlm_extract_case(page_pngs, model_name=MODEL_NAME, pdf_path=pdf_path, case_id=case_id)

    # 4) Lưu JSON + manifest
    json_path = case_dir / f"{case_id}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(case_json, f, ensure_ascii=False, indent=2)

    manifest = {
        "case_id": case_id,
        "source_pdf": str(pdf_path.resolve()),
        "n_pages": len(page_pngs),
        "pages_dir": str(pages_dir.resolve()),
        "images_dir": str(images_dir.resolve()),
        "images_meta": images_meta,
        "json_path": str(json_path.resolve())
    }
    with open(case_dir / "manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    return manifest

# ================== CHẠY HÀNG LOẠT ==================
def run_batch(input_dir: str, out_root: str):
    in_dir = Path(input_dir)
    out_dir = Path(out_root)
    out_dir.mkdir(parents=True, exist_ok=True)

    pdf_files = sorted([p for p in in_dir.glob("*.pdf")])
    results = []
    for pdf in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            results.append(process_one_pdf(pdf, out_dir))
        except Exception as e:
            print(f"[WARN] {pdf.name}: {e}")

    with open(out_dir / "index.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"[DONE] {len(results)}/{len(pdf_files)} processed → {out_dir}")

# ================== RUN ==================
if __name__ == "__main__":
    run_batch(INPUT_DIR, OUT_ROOT)


Processing PDFs:   1%|          | 1/93 [00:35<54:50, 35.77s/it]

[WARN] 1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.pdf: VLM JSON extraction failed (after repair): empty VLM output


Processing PDFs:   1%|          | 1/93 [00:57<1:27:34, 57.12s/it]


KeyboardInterrupt: 

In [None]:
r = client.models.list()
print([m.id for m in r.data])


NameError: name 'client' is not defined

# new extract data to 11 fields jsons

## delete old jsons

In [None]:
from pathlib import Path

EXTRACTED_DIR = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")

deleted = 0
for case_dir in sorted([p for p in EXTRACTED_DIR.iterdir() if p.is_dir() and p.name.isdigit()]):
    cid = case_dir.name
    for f in [case_dir / f"{cid}.json", case_dir / "manifest.json"]:
        if f.exists():
            f.unlink()
            deleted += 1
print(f"Deleted {deleted} files from {EXTRACTED_DIR}")


Deleted 2 files from /content/drive/MyDrive/Project-AI/Data/extracted_data


In [None]:
from pathlib import Path

EXTRACTED_DIR = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")

deleted = 0
for p in EXTRACTED_DIR.rglob("manifest.json.bak"):
    try:
        p.unlink()
        deleted += 1
        print(f"Deleted: {p}")
    except Exception as e:
        print(f"[WARN] Cannot delete {p}: {e}")

print(f"\nDone. Deleted {deleted} manifest.json.bak file(s).")



Done. Deleted 0 manifest.json.bak file(s).


## config

In [None]:
# ======== CONFIG BẮT BUỘC (đặt ở gần đầu file) ========
from pathlib import Path

EXTRACTED_DIR   = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")
PDF_DATA_DIR    = Path("/content/drive/MyDrive/Project-AI/Data/pdf-data")

MAX_PAGES_FOR_VLM = 3
RETRIES           = 2
TEMPERATURE       = 0.0
PROMPT_VERSION    = "tropid-json-v4-no-fewshot"

# LM Studio qua ngrok (đổi URL đúng của bạn)
NGROK_BASE_URL   = "https://tammara-stalkless-portentously.ngrok-free.dev"
OPENAI_BASE_URL  = f"{NGROK_BASE_URL}/v1"
OPENAI_API_KEY   = "lm-studio"               # dummy theo LM Studio
MODEL_NAME       = "qwen2-vl-7b-instruct"


## extract


In [None]:
# -*- coding: utf-8 -*-
import os, re, io, json, base64, requests, time, hashlib
from pathlib import Path
from typing import List, Dict, Any, Tuple
from PIL import Image
from datetime import datetime, timezone
import requests, base64, io



# ======== PROMPT (đÃ SỬA: KHÔNG few-shot, CẤM “Question/Answer”, CHỈ DÙNG TEXT TỪ ẢNH) ========
SYSTEM_PROMPT = r"""
You are TropID-Extractor.

Read ONLY the text visible in the provided PAGE IMAGES of ONE clinical case
(including any section titled exactly “The Case Continued”). Ignore exam-style
headings such as “Question”, “Answer”, “Discussion”, “Teaching points”,
“References”, and figure captions.

Return ONE raw JSON object with EXACTLY these keys (same order). Each value is
EITHER a single coherent paragraph (string) OR null. Do not return lists,
nested objects, or pseudo-dicts (e.g., "{'x':'y'}"). Write plain prose; keep
units/values as in the images. If a field is absent, set null. Do not guess.

Order and keys (use exactly these):
{
  "patient_information": null,
  "chief_complaint": null,
  "history_of_present_illness": null,
  "exposure_and_epidemiology": null,
  "vitals": null,
  "physical_exam": null,
  "labs_and_diagnostics": null,
  "differential_diagnosis": null,
  "management_and_clinical_course": null,
  "final_diagnosis": null,
  "disease_name_short": null
}

Schema guidance (do not output this section; it is guidance only):
{
  "type": "object",
  "properties": {
    "patient_information": {
      "type": ["string", "null"],
      "description": "Age/sex; relevant comorbidities or immunosuppression; vaccination/allergy information if stated."
    },
    "chief_complaint": {
      "type": ["string", "null"],
      "description": "One-line statement of the main problem and its duration."
    },
    "history_of_present_illness": {
      "type": ["string", "null"],
      "description": "Timeline; key symptoms; pertinent negatives; pattern/severity."
    },
    "exposure_and_epidemiology": {
      "type": ["string", "null"],
      "description": "Residence/travel (places and dates if available); vectors (mosquito/tick); animals; water/food risks; sick contacts; season/outbreak context; occupation."
    },
    "vitals": {
      "type": ["string", "null"],
      "description": "All reported vital signs with values: temperature, blood pressure, heart rate, respiratory rate, SpO2."
    },
    "physical_exam": {
      "type": ["string", "null"],
      "description": "Salient findings by system: skin, HEENT, chest, abdomen, neurologic, lymph nodes, etc."
    },
    "labs_and_diagnostics": {
      "type": ["string", "null"],
      "description": "CBC trends; key chemistry/coagulation; inflammatory markers; microbiology/serology/PCR (assay + result + titer/CT if provided); malaria tests; imaging highlights."
    },
    "differential_diagnosis": {
      "type": ["string", "null"],
      "description": "Concise narrative of main alternatives; one sentence each for reasons for/against."
    },
    "management_and_clinical_course": {
      "type": ["string", "null"],
      "description": "Antimicrobials (drug/dose if given), supportive care, procedures; response, complications, outcome."
    },
    "final_diagnosis": {
      "type": ["string", "null"],
      "description": "3–5 sentences: explicit disease name ± causative agent, confirmation method (e.g., NS1, thick smear with species, PCR/serology), and why alternatives were ruled out."
    },
    "disease_name_short": {
      "type": ["string", "null"],
      "description": "Disease name only (no agent, no method)."
    }
  }
}

VERBATIM-LEAKAGE GUARDRAILS
- The block <EXAMPLES> below is for style and structure ONLY. Do not quote, paraphrase, or reuse any specific phrases, numbers, names, dates, places, labs, or diagnoses from it.
- Use facts EXCLUSIVELY from <CURRENT_CASE>. If a detail is not present in <CURRENT_CASE>, set that field to null.
- NEVER mention or refer to EXAMPLES in your output. Output must be a single JSON object with the exact keys above, nothing else.
- If the case text contains no information for a field, return null for that field rather than inferring or fabricating.

<EXAMPLES>
  <example>
    <input>
      32-year-old woman returned from rural Thailand 7 days ago. High fever to 39.5 °C, severe myalgias, retro-orbital pain, and maculopapular rash. Platelets 72×10^9/L, leukopenia. Dengue NS1 antigen positive on day 2. Managed with fluids and acetaminophen; no antibiotics. Recovered by day 6.
    </input>
    <output>
      {
        "patient_information": "32-year-old woman with no reported comorbidities or immunosuppression.",
        "chief_complaint": "Acute high fever with rash for approximately one week.",
        "history_of_present_illness": "Abrupt febrile illness reaching 39.5 °C with severe myalgias and retro-orbital pain, followed by a maculopapular rash. Symptoms began shortly after travel and evolved over 7 days.",
        "exposure_and_epidemiology": "Recent travel to rural Thailand one week prior; mosquito exposure is implied by the endemic setting. No animal or water/food exposures or sick contacts were reported.",
        "vitals": "Maximum recorded temperature 39.5 °C; other vital signs not reported.",
        "physical_exam": "Maculopapular rash and retro-orbital discomfort; no focal neurologic deficits or respiratory distress described.",
        "labs_and_diagnostics": "CBC notable for thrombocytopenia (platelets 72×10^9/L) and leukopenia. Dengue NS1 antigen positive on day 2. No imaging was reported.",
        "differential_diagnosis": "Considered dengue fever versus chikungunya and malaria. Dengue supported by NS1 positivity, thrombocytopenia, and travel to an endemic area. Malaria less likely given dengue confirmation and no malaria-specific findings reported.",
        "management_and_clinical_course": "Supportive care with oral/IV fluids and acetaminophen; no antibiotics. Clinical improvement with full recovery by day 6; no complications documented.",
        "final_diagnosis": "Dengue fever due to dengue virus, confirmed by a positive NS1 antigen test. The diagnosis aligns with acute febrile illness, thrombocytopenia, retro-orbital pain, and compatible travel exposure. Malaria and chikungunya were considered less likely given confirmatory testing and symptom pattern.",
        "disease_name_short": "Dengue fever"
      }
    </output>
  </example>
</EXAMPLES>

When you receive the actual case text, it will be placed inside the <CURRENT_CASE> ... </CURRENT_CASE> block by the user message.
Only use information inside that block to produce the JSON.

Your final answer must be:
- a single JSON object
- exactly the 11 keys, in the exact order listed
- string values or null
- no markdown, no extra commentary, no keys beyond those 11
""".strip()

USER_PREFIX = (
    "Extract the following fields as full-text blocks from this clinical case composed of multiple page images. "
    "Focus ONLY on the case narrative (presentation, exam, investigations, management, outcome). "
    "IGNORE any 'Question'/'Answer'/'Discussion'/'Teaching points' sections if they exist. "
    "Return ONE JSON object only (no markdown)."
)

JSON_KEYS = [
    "patient_information","chief_complaint","history_of_present_illness",
    "exposure_and_epidemiology","vitals","physical_exam","labs_and_diagnostics",
    "differential_diagnosis","management_and_clinical_course","final_diagnosis","disease_name_short"
]

# ======== TIỆN ÍCH ========
def _case_dirs(root: Path) -> List[Path]:
    return sorted([p for p in root.iterdir() if p.is_dir() and p.name.isdigit()])

def _list_pages(pages_dir: Path) -> List[Path]:
    # sort theo số trang trong tên *_page_<n>.png
    def _num(p: Path) -> int:
        m = re.search(r"_page_(\d+)\.png$", p.name)
        return int(m.group(1)) if m else 0
    return sorted(pages_dir.glob("*.png"), key=_num)

def _encode_image_b64(p: Path) -> str:
    with Image.open(p) as im:
        im = im.convert("RGB")
        buf = io.BytesIO()
        im.save(buf, format="PNG")
        return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

# def _select_pages(all_pages: List[Path], k: int) -> List[Path]:
#     if len(all_pages) <= k:
#         return all_pages
#     # head+tail để tránh phần hỏi đáp/answers ở giữa/cuối
#     head = all_pages[: max(2, k//2)]
#     tail = all_pages[-(k - len(head)) :]
#     return head + tail


def _select_pages(all_pages: List[Path]) -> List[Path]:
    """Chọn ảnh trang để gửi model. Nếu > MAX_PAGES_FOR_VLM: lấy head+tail."""
    k = MAX_PAGES_FOR_VLM
    if len(all_pages) <= k:
        chosen = all_pages
    else:
        head = all_pages[: max(2, k // 2)]
        tail = all_pages[-(k - len(head)):]
        chosen = head + tail
    # DEBUG: chứng minh đúng là lấy từ thư mục pages/
    print(f"[DEBUG] Pages selected ({len(chosen)}/{len(all_pages)}):")
    for p in chosen:
        print("  -", p)  # sẽ in .../extracted_data/<cid>/pages/001_page_1.png ...
    return chosen



import requests, base64, io
from PIL import Image

def _encode_image_b64(p: Path) -> str:
    with Image.open(p) as im:
        im = im.convert("RGB")
        buf = io.BytesIO()
        im.save(buf, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

def _http_chat(images_b64: List[str]) -> str:
    # nội dung user: 1 text + N ảnh (PNG base64)
    content = [{"type": "text", "text": USER_PREFIX}]
    for url in images_b64:
        content.append({"type": "image_url", "image_url": {"url": url}})

    payload = {
        "model": MODEL_NAME,
        "temperature": TEMPERATURE,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": content},
        ],
    }
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    r = requests.post(f"{OPENAI_BASE_URL}/chat/completions", json=payload, headers=headers, timeout=180)
    r.raise_for_status()
    data = r.json()
    return (data["choices"][0]["message"]["content"] or "").strip()







def _extract_json_loose(text: str) -> Dict[str, Any]:
    # lấy khối JSON lớn nhất
    m = re.search(r"\{.*\}", text, flags=re.S)
    cand = m.group(0) if m else text
    obj = json.loads(cand)
    # ép đúng 11 key, giá trị là str hoặc None
    for k in JSON_KEYS:
        obj.setdefault(k, None)
    obj = {k: (obj[k] if (obj[k] is None or isinstance(obj[k], str)) else str(obj[k]))
           for k in JSON_KEYS}
    return obj

def _strip_didactic_noise(s: str | None) -> str | None:
    if not s:
        return s
    t = s.strip()
    # xoá các dòng mở đầu kiểu Question/Answer/Discussion/Teaching points/References
    lines = [ln for ln in t.splitlines()
             if not re.match(r"^\s*(Question|Answer|Discussion|Teaching points?|References?)\b", ln.strip(), flags=re.I)]
    t = " ".join(ln.strip() for ln in lines)
    # nếu vẫn chứa cấu trúc 'question:' 'answer:' → nhiều khả năng là mục hỏi đáp, bỏ
    if re.search(r"\b(question|answer)\s*[:=]\s*", t, flags=re.I):
        return None
    # gom khoảng trắng
    t = re.sub(r"\s{2,}", " ", t).strip()
    return t or None

def _postprocess_fields(obj: Dict[str, Any]) -> Dict[str, Any]:
    clean = {}
    for k in JSON_KEYS:
        v = obj.get(k, None)
        if isinstance(v, str):
            v = v.strip()
            # loại dấu ```json...``` nếu model dở chứng
            v = re.sub(r"^```json\s*|\s*```$", "", v, flags=re.S)
            v = _strip_didactic_noise(v)
        elif v is not None:
            v = str(v)
            v = _strip_didactic_noise(v)
        clean[k] = v if (v is None or (isinstance(v, str) and v)) else None
    return clean

def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()

def _find_pdf_for_case(case_id: str) -> Tuple[str | None, str | None, int | None, str | None]:
    # tìm file trong PDF_DATA_DIR bắt đầu bằng "<id>---"
    pattern = f"{int(case_id):03d}---*.pdf"
    files = sorted(PDF_DATA_DIR.glob(pattern))
    if not files:
        return None, None, None, None
    pdf = files[0]
    title = re.sub(r"^\d+\s*---\s*", "", pdf.stem)
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title).strip("- _")
    # hash + size
    try:
        h = hashlib.sha256()
        with open(pdf, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        sha = h.hexdigest()
        size = pdf.stat().st_size
    except Exception:
        sha, size = None, None
    return str(pdf), title, size, sha

def _save_json(path: Path, obj: Dict[str, Any]):
    path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")



def extract_case_from_pages(case_dir: Path) -> Dict[str, Any]:
    cid = case_dir.name
    pages_dir = case_dir / "pages"
    assert pages_dir.exists(), f"Missing pages dir: {pages_dir}"
    all_pages = _list_pages(pages_dir)
    assert all_pages, f"No page PNGs in {pages_dir}"

    chosen = _select_pages(all_pages)
    imgs_b64 = [_encode_image_b64(p) for p in chosen]

    # 2 chiến lược: head+tail (mặc định), rồi head_block (các trang đầu liên tiếp)
    strategies = [
        ("head_tail", imgs_b64),
        ("head_block",
         [_encode_image_b64(p) for p in all_pages[:MAX_PAGES_FOR_VLM]]
         if len(all_pages) > MAX_PAGES_FOR_VLM else imgs_b64),
    ]

    last_err, raw_text = None, ""
    for _, imgset in strategies:
        for _ in range(RETRIES + 1):
            try:
                raw_text = _http_chat(imgset)
                obj = _extract_json_loose(raw_text)
                obj = _postprocess_fields(obj)
                if sum(1 for v in obj.values() if v) == 0:
                    raise ValueError("all-null")
                return obj
            except Exception as e:
                last_err = e
                time.sleep(0.8)
    raise RuntimeError(f"VLM JSON extraction failed for case {cid}: {last_err}")

def run_all():
    logs_dir = EXTRACTED_DIR / "_vlm_logs"
    logs_dir.mkdir(parents=True, exist_ok=True)
    cases = _case_dirs(EXTRACTED_DIR)

    ok = 0
    for case_dir in cases:
        cid = case_dir.name
        try:
            obj = extract_case_from_pages(case_dir)
            pdf_path, case_title, pdf_size, pdf_sha = _find_pdf_for_case(cid)

            case_json = case_dir / f"{cid}.json"
            _save_json(case_json, obj)

            manifest = {
                "case_id": cid,
                "case_title": case_title,
                "source_pdf": pdf_path,
                "pages_dir": str((case_dir / "pages").resolve()),
                "images_dir": str((case_dir / "images").resolve()),
                "n_pages": len(list((case_dir / "pages").glob("*.png"))),
                "json_path": str(case_json.resolve()),
                "extract_model": MODEL_NAME,      # <— dùng đúng tên
                "prompt_version": PROMPT_VERSION,
                "created_at": _now_iso(),
                "pdf_filesize": pdf_size,
                "pdf_sha256": pdf_sha,
                "errors": []
            }
            _save_json(case_dir / "manifest.json", manifest)
            ok += 1
        except Exception as e:
            (logs_dir / f"{cid}.error.txt)").write_text(str(e), encoding="utf-8")
            try:
                (logs_dir / f"{cid}.raw.txt").write_text(raw_text, encoding="utf-8")
            except Exception:
                pass
            print(f"[WARN] {cid}: {e}")
    print(f"[DONE] {ok}/{len(cases)} cases extracted → {EXTRACTED_DIR}")

run_all()



[DEBUG] Pages selected (3/3):
  - /content/drive/MyDrive/Project-AI/Data/extracted_data/001/pages/001_page_1.png
  - /content/drive/MyDrive/Project-AI/Data/extracted_data/001/pages/001_page_2.png
  - /content/drive/MyDrive/Project-AI/Data/extracted_data/001/pages/001_page_3.png
[DEBUG] Pages selected (3/3):
  - /content/drive/MyDrive/Project-AI/Data/extracted_data/002/pages/002_page_1.png
  - /content/drive/MyDrive/Project-AI/Data/extracted_data/002/pages/002_page_2.png
  - /content/drive/MyDrive/Project-AI/Data/extracted_data/002/pages/002_page_3.png


KeyboardInterrupt: 

#Modifiy mainifest.json from wrong path with shortcur issue to the new correct one

In [None]:
import json, re
from pathlib import Path

# === cấu hình ===
PROJECT_ROOT = Path("/content/drive/MyDrive/Project-AI/Data")   # thư mục Data thật
EXTRACT_ROOT = PROJECT_ROOT / "extracted_data"                  # nơi chứa các case
NEW_ROOT = str(PROJECT_ROOT) + "/"                              # chuỗi thay thế
# Mẫu prefix của Google Drive shortcut (.shortcut-targets-by-id/<random>/data/)
SHORTCUT_PREFIX_RE = re.compile(r"^/content/drive/\.shortcut-targets-by-id/[^/]+/data/")

def rebase_path(p: str) -> str:
    if not isinstance(p, str):
        return p
    return SHORTCUT_PREFIX_RE.sub(NEW_ROOT, p)

def fix_manifest_file(manifest_path: Path) -> bool:
    try:
        raw = manifest_path.read_text(encoding="utf-8")
        data = json.loads(raw)
    except Exception as e:
        print(f"[SKIP] {manifest_path}: cannot read JSON ({e})")
        return False

    changed = False

    # các key mức 1
    for key in ["source_pdf", "pages_dir", "images_dir", "json_path"]:
        if key in data and isinstance(data[key], str):
            newv = rebase_path(data[key])
            if newv != data[key]:
                data[key] = newv
                changed = True

    # các path trong images_meta[]
    if "images_meta" in data and isinstance(data["images_meta"], list):
        for m in data["images_meta"]:
            if isinstance(m, dict) and "path" in m and isinstance(m["path"], str):
                newp = rebase_path(m["path"])
                if newp != m["path"]:
                    m["path"] = newp
                    changed = True

    if not changed:
        return False

    # backup rồi ghi đè
    bak = manifest_path.with_suffix(".json.bak")
    bak.write_text(raw, encoding="utf-8")
    manifest_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

    # kiểm tra tồn tại cơ bản
    for k in ["pages_dir", "images_dir"]:
        v = data.get(k)
        if isinstance(v, str) and not Path(v).exists():
            print(f"[WARN] {manifest_path.name}: missing dir -> {v}")
    for k in ["source_pdf", "json_path"]:
        v = data.get(k)
        if isinstance(v, str) and not Path(v).exists():
            print(f"[WARN] {manifest_path.name}: missing file -> {v}")

    print(f"[OK]   fixed {manifest_path}")
    return True

def fix_all_manifests():
    count = 0
    for mf in sorted(EXTRACT_ROOT.glob("*/manifest.json")):
        if fix_manifest_file(mf):
            count += 1
    print(f"[DONE] Updated {count} manifest(s).")

# (tuỳ chọn) cập nhật index.json ở extracted_data nếu có
def fix_index_json():
    idx = EXTRACT_ROOT / "index.json"
    if not idx.exists():
        return
    raw = idx.read_text(encoding="utf-8")
    new_raw = SHORTCUT_PREFIX_RE.sub(NEW_ROOT, raw)
    if new_raw != raw:
        idx.with_suffix(".json.bak").write_text(raw, encoding="utf-8")
        idx.write_text(new_raw, encoding="utf-8")
        print("[OK]   fixed index.json")
    else:
        print("[OK]   index.json already clean")

# chạy
fix_all_manifests()
fix_index_json()


[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/001/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/002/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/003/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/004/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/005/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/006/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/007/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/008/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/009/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/010/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_data/011/manifest.json
[OK]   fixed /content/drive/MyDrive/Project-AI/Data/extracted_dat

In [None]:
import json, pathlib

def load_case_manifest(manifest_path):
    m = json.loads(pathlib.Path(manifest_path).read_text())
    print("Case:", m["case_id"], "| pages:", m["n_pages"])
    # lấy ảnh figure đầu tiên để hiển thị/embedding
    if m["images_meta"]:
        fig0 = m["images_meta"][0]["path"]
        print("First figure:", fig0)
    return m

# ví dụ
m = load_case_manifest("/content/drive/MyDrive/Project-AI/Data/extracted_data/001/manifest.json")

# kết nối với JSON 11 trường
case = json.loads(pathlib.Path(m["json_path"]).read_text())
print(case["final_diagnosis"])


Case: 001 | pages: 3
First figure: /content/drive/MyDrive/Project-AI/Data/extracted_data/001/images/001_p2_fig_1.png
{'disease_name_short': 'Viral hemorrhagic fever (VHF)', 'confirmation_method': 'Positive ELISA antigen and PCR tests for Ebola virus, negative result for ELISA IgG antibody', 'evidence': "The diagnosis was confirmed by laboratory testing at a specialized laboratory established as part of the international outbreak response. The patient's history and clinical presentation were consistent with an HF syndrome."}


##add and modify case titles to manifest.json

In [None]:
import re, json
from pathlib import Path

EXTRACT_ROOT = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")

def title_from_filename(pdf_name: str) -> str:
    # ví dụ: "001---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.pdf"
    stem = Path(pdf_name).stem
    m = re.match(r"^\d+\s*---\s*(.+)$", stem)
    title = m.group(1) if m else stem
    # bỏ đuôi xuất bản nếu có
    title = re.sub(r"_?20\d{2}_Clinical[-_\s]Cases[-_\s]in.*$", "", title, flags=re.I)
    # dọn rác dấu gạch/khoảng trắng
    title = re.sub(r"[_\-]{2,}", " ", title).replace("_", " ").strip()
    # viết hoa nhẹ nhàng (giữ chữ số/viết tắt)
    return re.sub(r"\s{2,}", " ", title)

def add_case_title_to_manifests():
    for mf in sorted(EXTRACT_ROOT.glob("*/manifest.json")):
        data = json.loads(mf.read_text(encoding="utf-8"))
        if not data.get("case_title"):
            # ưu tiên từ source_pdf
            src = data.get("source_pdf", "")
            data["case_title"] = title_from_filename(src) if src else f"Case {data.get('case_id','')}"
            mf.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
            print("[OK] added case_title ->", mf.parent.name, ":", data["case_title"])

add_case_title_to_manifests()


[OK] added case_title -> 001 : A-20-Year-Old-Woman-from-Sudan-With-Fever
[OK] added case_title -> 002 : A-7-Year-Old-Girl-from-Peru-With-a-Chron
[OK] added case_title -> 003 : A-26-Year-Old-Woman-from-Malawi-with-Headache-
[OK] added case_title -> 004 : A-4-Year-Old-Girl-from-Uganda-in-a-
[OK] added case_title -> 005 : A-4-Year-Old-Boy-from-Laos-With-a-Lesion-o
[OK] added case_title -> 006 : A-36-Year-Old-Male-Traveller-Returning-from-B
[OK] added case_title -> 007 : A-28-Year-Old-Male-Fisherman-from-Malawi-Wi
[OK] added case_title -> 008 : A-26-Year-Old-Female-Traveller-Returning-from
[OK] added case_title -> 009 : A-52-Year-Old-Man-from-Vietnam-With-Evo
[OK] added case_title -> 010 : A-55-Year-Old-Indigenous-Woman-from-Australia-W
[OK] added case_title -> 011 : A-45-Year-Old-Male-Security-Guard-from-Malawi-
[OK] added case_title -> 012 : A-29-Year-Old-Man-from-The-Gambia-With-G
[OK] added case_title -> 013 : A-16-Year-Old-Girl-from-Malawi-With-Fever
[OK] added case_title -> 014 : A-2

In [None]:
import re, json
from pathlib import Path

EXTRACT_ROOT = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")

STOPWORDS = {"a","an","and","or","the","of","in","on","for","to","from","with","at","by"}

def smart_titlecase(s: str) -> str:
    words = s.split()
    out = []
    for i, w in enumerate(words):
        lw = w.lower()
        if i > 0 and lw in STOPWORDS:
            out.append(lw)
        else:
            # giữ nguyên từ viết hoa kiểu acronyms (HIV, PCR) hoặc có số
            if re.fullmatch(r"[A-Z0-9\-]+", w):
                out.append(w)
            else:
                out.append(lw.capitalize())
    return " ".join(out)

def prettify_case_title(raw: str) -> str:
    if not raw:
        return raw
    s = raw.strip()

    # chuẩn hóa các loại gạch thành '-' rồi xử lý
    s = s.replace("–", "-").replace("—", "-").replace("_", " ")
    s = re.sub(r"[-\s]+$", "", s)                       # bỏ gạch/khoảng trắng ở cuối
    s = re.sub(r"\s{2,}", " ", s)

    # tách theo khoảng trắng, xử lý từng token để quyết định có giữ '-' hay không
    def fix_token(tok: str) -> str:
        # nếu token có mẫu giữ nguyên gạch nối (chứa số hoặc ALLCAPS hai bên)
        if re.search(r"\d-\d", tok) or re.search(r"[A-Z]{2,}-\d|\d-[A-Z]{2,}", tok) or re.search(r"[A-Z]{2,}-[A-Z]{2,}", tok):
            return tok  # ví dụ COVID-19, HIV-1, HLA-B27
        # nếu là cụm có chữ thường hai bên dấu '-' (thường do tên file), đổi '-' -> ' '
        return tok.replace("-", " ")

    tokens = [fix_token(t) for t in s.split()]
    s = " ".join(tokens)
    s = re.sub(r"\s{2,}", " ", s).strip()

    # bỏ đuôi xuất bản kiểu “_2022 Clinical-Cases-in…”
    s = re.sub(r"\b20\d{2}\s+Clinical\s*[- ]\s*Cases\s*[- ]\s*in.*$", "", s, flags=re.I).strip()

    # gộp số tuổi dạng “A-26-Year-Old” sau khi tách gạch
    s = re.sub(r"\b(\d+)\s*-\s*Year\s*-\s*Old\b", r"\1-Year-Old", s, flags=re.I)  # nếu muốn giữ 26-Year-Old
    # hoặc muốn bỏ luôn gạch nối trong cụm này:
    s = re.sub(r"\b(\d+)-Year-Old\b", r"\1 Year Old", s, flags=re.I)

    # title-case nhẹ nhàng
    s = smart_titlecase(s)

    # dọn khoảng trắng cuối
    s = s.strip()
    return s

def rewrite_manifests():
    for mf in sorted(EXTRACT_ROOT.glob("*/manifest.json")):
        data = json.loads(mf.read_text(encoding="utf-8"))

        # nếu thiếu case_title hoặc còn nhiều gạch nối -> làm sạch lại
        raw_title = data.get("case_title") or Path(data.get("source_pdf","")).stem
        pretty = prettify_case_title(raw_title)

        changed = False
        if data.get("case_title") != pretty:
            data["case_title"] = pretty
            changed = True

        if changed:
            mf.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"[OK] {mf.parent.name}: '{raw_title}'  ->  '{pretty}'")

rewrite_manifests()


[OK] 001: 'A-20-Year-Old-Woman-from-Sudan-With-Fever'  ->  'A 20 Year Old Woman from Sudan with Fever'
[OK] 002: 'A-7-Year-Old-Girl-from-Peru-With-a-Chron'  ->  'A 7 Year Old Girl from Peru with a Chron'
[OK] 003: 'A-26-Year-Old-Woman-from-Malawi-with-Headache-'  ->  'A 26 Year Old Woman from Malawi with Headache'
[OK] 004: 'A-4-Year-Old-Girl-from-Uganda-in-a-'  ->  'A 4 Year Old Girl from Uganda in a'
[OK] 005: 'A-4-Year-Old-Boy-from-Laos-With-a-Lesion-o'  ->  'A 4 Year Old Boy from Laos with a Lesion O'
[OK] 006: 'A-36-Year-Old-Male-Traveller-Returning-from-B'  ->  'A 36 Year Old Male Traveller Returning from B'
[OK] 007: 'A-28-Year-Old-Male-Fisherman-from-Malawi-Wi'  ->  'A 28 Year Old Male Fisherman from Malawi Wi'
[OK] 008: 'A-26-Year-Old-Female-Traveller-Returning-from'  ->  'A 26 Year Old Female Traveller Returning from'
[OK] 009: 'A-52-Year-Old-Man-from-Vietnam-With-Evo'  ->  'A 52 Year Old Man from Vietnam with Evo'
[OK] 010: 'A-55-Year-Old-Indigenous-Woman-from-Australia-W'  

#Clean jsons

In [None]:
# --- QC & Clean TropID JSONs ---
import json, re, csv
from pathlib import Path

IN_ROOT  = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")
OUT_ROOT = Path("/content/drive/MyDrive/Project-AI/Data/cleaned_data")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

JSON_KEYS = [
    "patient_information","chief_complaint","history_of_present_illness",
    "exposure_and_epidemiology","vitals","physical_exam","labs_and_diagnostics",
    "differential_diagnosis","management_and_clinical_course","final_diagnosis","disease_name_short"
]

# ---- helpers ----
def squish_ws(s:str)->str:
    s = s.replace("\u00A0"," ")               # non-breaking space
    s = re.sub(r"[ \t]+"," ", s)
    s = re.sub(r" *\n *","\n", s)
    s = re.sub(r"\n{3,}","\n\n", s)
    return s.strip()

def normalize_units(s:str)->str:
    if not s: return s
    # mmHg / bpm / °C / °F spacing
    s = re.sub(r"(?<=\d)(?=mmHg)\b", " ", s, flags=re.I)
    s = re.sub(r"(?<=\d)(?=bpm)\b",  " ", s, flags=re.I)
    s = s.replace("° C","°C").replace("° F","°F")
    # 90/60mmHg -> 90/60 mmHg
    s = re.sub(r"(\d+/\d+)\s*(mmHg)", r"\1 \2", s, flags=re.I)
    return s

def dictlike_to_paragraph(s:str)->str:
    """
    Nếu model trả chuỗi kiểu "{'a': 'x', 'b': 'y'}" → chuyển thành 1 đoạn văn.
    """
    if not s or "{" not in s or "}" not in s: return s
    txt = s.strip()
    # cố gắng chuyển ' thành " để json.loads
    try:
        # bảo toàn số/đơn vị: thay quote đơn bằng đôi một cách an toàn
        jtxt = re.sub(r"'", '"', txt)                         # thô nhưng hiệu quả cho trường hợp này
        jtxt = re.sub(r'"\s*([A-Za-z0-9_]+)\s*"\s*:', r'"\1":', jtxt)  # dọn key
        obj = json.loads(jtxt)
        if isinstance(obj, dict):
            items = []
            for k, v in obj.items():
                k2 = k.replace("_"," ").strip().capitalize()
                if isinstance(v, (list, dict)): v = json.dumps(v, ensure_ascii=False)
                items.append(f"{k2}: {str(v).strip()}")
            return " ".join(items)
        elif isinstance(obj, list):
            return "; ".join(str(x) for x in obj)
    except Exception:
        return s
    return s

def canonical_disease(name:str, context:str)->str:
    """
    Rút gọn disease_name_short -> vocab ngắn.
    Không suy đoán quá đà: chỉ cắt ngoặc và map một số từ khoá phổ biến.
    """
    if not name: return None
    name = squish_ws(name)
    # bỏ ngoặc
    name = re.sub(r"\s*\([^)]*\)\s*$","", name).strip()
    low = name.lower()

    # map tối thiểu; mở rộng thêm nếu cần
    vocab = {
        "dengue": "Dengue fever",
        "malaria": "Malaria",
        "falciparum": "Falciparum malaria",
        "vivax": "Vivax malaria",
        "scrub typhus": "Scrub typhus",
        "leptospirosis": "Leptospirosis",
        "ebola": "Ebola virus disease",
        "vhf": "Viral hemorrhagic fever",
        "viral hemorrhagic fever": "Viral hemorrhagic fever",
        "tuberculosis": "Tuberculosis",
    }
    for key, val in vocab.items():
        if key in low:
            return val

    # nếu không match, trả lại bản rút gọn
    return name

def clean_block(s:str)->str:
    if s is None: return None
    s = dictlike_to_paragraph(s)
    s = squish_ws(s)
    s = normalize_units(s)
    return s or None

# ---- main pass ----
qc_rows = []
cases = sorted((p.parent.name, p) for p in IN_ROOT.glob("*/*.json") if p.name.endswith(".json") and p.name != "manifest.json")

for case_id, jpath in cases:
    try:
        data = json.loads(jpath.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[SKIP] {jpath}: {e}")
        continue

    cleaned = {}
    for k in JSON_KEYS:
        val = data.get(k)
        cleaned[k] = clean_block(val)

    # disease_name_short rút gọn bằng ngữ cảnh từ final_diagnosis
    cleaned["disease_name_short"] = canonical_disease(cleaned.get("disease_name_short"), cleaned.get("final_diagnosis") or "")

    # thống kê QC
    null_cnt = sum(1 for k in JSON_KEYS if cleaned.get(k) in (None, ""))
    lens = {k: (0 if cleaned.get(k) is None else len(cleaned.get(k))) for k in JSON_KEYS}

    # ghi file
    out_file = OUT_ROOT / f"{case_id}.json"
    out_file.write_text(json.dumps(cleaned, ensure_ascii=False, indent=2), encoding="utf-8")

    qc_rows.append({
        "case_id": case_id,
        "null_fields": null_cnt,
        **{f"len_{k}": v for k,v in lens.items()}
    })

# ghi QC CSV
with (OUT_ROOT / "qc_report.csv").open("w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=qc_rows[0].keys() if qc_rows else ["case_id","null_fields"])
    w.writeheader()
    w.writerows(qc_rows)

print(f"[DONE] Cleaned {len(qc_rows)} cases → {OUT_ROOT}")


[DONE] Cleaned 93 cases → /content/drive/MyDrive/Project-AI/Data/cleaned_data


## Polish data

In [None]:
import os, json, requests, time, re
from pathlib import Path

IN_DIR  = Path("/content/drive/MyDrive/Project-AI/Data/cleaned_data")
OUT_DIR = Path("/content/drive/MyDrive/Project-AI/Data/polished_data_ai")
OUT_DIR.mkdir(parents=True, exist_ok=True)

JSON_KEYS = [
  "patient_information","chief_complaint","history_of_present_illness",
  "exposure_and_epidemiology","vitals","physical_exam","labs_and_diagnostics",
  "differential_diagnosis","management_and_clinical_course","final_diagnosis","disease_name_short"
]

LM_URL = os.getenv("LM_STUDIO_URL", "https://tammara-stalkless-portentously.ngrok-free.dev")  # đổi URL của bạn
MODEL  = os.getenv("LM_MODEL_NAME", "qwen2-vl-7b-instruct")  # tên model trong LM Studio

def call_refiner(field_name:str, content:str, max_retries=2)->str|None:
    if content is None or not str(content).strip():
        return None
    system = ("You are a clinical text rewriter. Rewrite the provided content into one cohesive paragraph in fluent English. "
              "Do not add new facts. Keep all numbers, units, names, and negations as-is. "
              "If input looks like “Label: value” pairs, convert them into natural sentences. "
              "If the input has no real content, respond exactly with null.")
    user = f"Field name: {field_name}\nContent: {content}"
    for _ in range(max_retries+1):
        try:
            r = requests.post(
                f"{LM_URL}/v1/chat/completions",
                headers={"Content-Type":"application/json"},
                json={
                    "model": MODEL,
                    "temperature": 0,
                    "messages": [
                        {"role":"system","content":system},
                        {"role":"user","content":user}
                    ],
                    "max_tokens": 400
                },
                timeout=120
            )
            r.raise_for_status()
            txt = r.json()["choices"][0]["message"]["content"].strip()
            if txt.lower() == "null":
                return None
            # loại bọc ```...```
            txt = re.sub(r"^```[a-zA-Z]*\s*|\s*```$", "", txt, flags=re.S)
            return txt
        except Exception as e:
            time.sleep(1.0)
    return None  # sẽ fallback

# fallback: dùng deterministic formatter ở trên
def fallback_polish(s:str)->str|None:
    import re, json
    if s is None: return None
    # chuyển chuỗi-dict nếu có
    if s.strip().startswith("{") and s.strip().endswith("}"):
        try:
            j = json.loads(re.sub(r"'", '"', s))
            if isinstance(j, dict):
                parts = [f"{k.replace('_',' ').strip().capitalize()}: {str(v).strip()}" for k,v in j.items()]
                s = " ".join(parts)
        except Exception:
            pass
    # “Label: value” → câu
    pairs = list(re.finditer(r"([A-Z][A-Za-z0-9()/\- ]+?):\s*([^:]+?)(?=(?:\s+[A-Z][A-Za-z0-9()/\- ]+?:)|$)", s))
    if pairs:
        s = " ".join(
            (f"{m.group(1).strip().rstrip('.')}" + " " + f"{m.group(2).strip().rstrip('.')}.")
            for m in pairs
        )
    return s.strip() or None

for jpath in sorted(IN_DIR.glob("*.json")):
    data = json.loads(jpath.read_text(encoding="utf-8"))
    out  = {}
    for k in JSON_KEYS:
        raw = data.get(k)
        refined = call_refiner(k, raw)
        if refined is None:
            refined = fallback_polish(raw)
        out[k] = refined
    (OUT_DIR / jpath.name).write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"Polished (AI) → {OUT_DIR}")


KeyboardInterrupt: 

#test OCr


In [None]:
# Cài đặt thư viện
!pip install -q PyPDF2


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from PyPDF2 import PdfReader
from pathlib import Path

pdf = Path("/content/drive/MyDrive/Project-AI/Data/pdf-data/1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.pdf")
reader = PdfReader(str(pdf))
for i, page in enumerate(reader.pages, 1):
    txt = page.extract_text() or ""
    print(f"Page {i}: {len(txt)} chars")


Page 1: 3369 chars
Page 2: 5377 chars
Page 3: 462 chars


#Extract raw text

In [None]:
from pathlib import Path

INPUT_DIR  = Path("/content/drive/MyDrive/Project-AI/Data/pdf-data")
OUTPUT_DIR = Path("/content/drive/MyDrive/Project-AI/Data/raw_text_data_2")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("Input:", INPUT_DIR)
print("Output:", OUTPUT_DIR)


Input: /content/drive/MyDrive/Project-AI/Data/pdf-data
Output: /content/drive/MyDrive/Project-AI/Data/raw_text_data_2


In [None]:
import re, fitz  # PyMuPDF

def normalize_ws(s: str) -> str:
    s = re.sub(r'[ \t]+', ' ', s)      # gộp space
    s = re.sub(r'\n{3,}', '\n\n', s)   # tối đa 1 dòng trống
    return s.strip()

def extract_page_text_two_columns(page: "fitz.Page") -> str:
    """
    Heuristic cho trang 2 cột:
      - lấy blocks -> chia theo median x-center thành left/right
      - sort theo (y0, x0) trong từng cột
      - nối left rồi right
    Nếu block rỗng -> fallback get_text('text')
    """
    blocks = page.get_text("blocks")
    if not blocks:
        return normalize_ws(page.get_text("text") or "")

    cleaned = []
    for b in blocks:
        if len(b) >= 5 and isinstance(b[4], str) and b[4].strip():
            x0, y0, x1, y1, text = b[:5]
            cleaned.append((x0, y0, x1, y1, text))
    if not cleaned:
        return normalize_ws(page.get_text("text") or "")

    centers = [ (b[0]+b[2])/2 for b in cleaned ]
    centers_sorted = sorted(centers)
    mid = centers_sorted[len(centers_sorted)//2]

    left  = [b for b in cleaned if ((b[0]+b[2])/2) <= mid]
    right = [b for b in cleaned if ((b[0]+b[2])/2) >  mid]

    left.sort(key=lambda b: (b[1], b[0]))
    right.sort(key=lambda b: (b[1], b[0]))

    def join(blks):
        return "\n\n".join(normalize_ws(b[4]) for b in blks if b[4].strip())

    left_text, right_text = join(left), join(right)
    if not left_text or not right_text:  # có thể là 1 cột
        return normalize_ws(page.get_text("text") or "")
    return f"{left_text}\n\n{right_text}"


In [None]:
from tqdm import tqdm

pdfs = sorted([p for p in INPUT_DIR.glob("*.pdf") if p.is_file()])
print(f"Found {len(pdfs)} PDFs")

total_pages = 0
for idx, pdf_path in enumerate(tqdm(pdfs, desc="Extracting"), 1):
    try:
        doc = fitz.open(pdf_path)
        parts = []
        for i, page in enumerate(doc, 1):
            text = extract_page_text_two_columns(page)
            parts.append(f"=== PAGE {i} ===\n{text}\n")
        out_txt = "\n".join(parts).strip() + "\n"

        out_path = OUTPUT_DIR / (pdf_path.stem + ".txt")
        out_path.write_text(out_txt, encoding="utf-8")
        total_pages += len(doc)
    except Exception as e:
        print(f"[ERROR] {pdf_path.name}: {e}")

print(f"Done. Wrote {len(pdfs)} .txt files, {total_pages} pages total.")


Found 93 PDFs


Extracting: 100%|██████████| 93/93 [04:38<00:00,  3.00s/it]

Done. Wrote 93 .txt files, 261 pages total.





In [None]:
!pip -q install pymupdf scikit-learn

import re, fitz
from sklearn.cluster import KMeans

def _normalize_ws(s: str) -> str:
    # ghép từ bị ngắt dòng bằng dấu gạch nối
    s = re.sub(r'(\w)-\n(\w)', r'\1\2', s)
    # thay \r, làm gọn space & dòng trống
    s = s.replace('\r', '')
    s = re.sub(r'[ \t]+', ' ', s)
    s = re.sub(r'\n{3,}', '\n\n', s)
    return s.strip()

def _block_text_from_dict(block):
    """Ghép text của 1 block (PyMuPDF get_text('dict'))"""
    pieces = []
    for line in block.get('lines', []):
        spans = line.get('spans', [])
        if spans:
            pieces.append("".join(span.get('text', '') for span in spans))
    return "\n".join(pieces)

def extract_page_text_columns_kmeans(page: "fitz.Page",
                                     full_width_ratio: float = 0.60,
                                     join_with_blank: bool = True) -> str:
    """
    - Lấy layout dạng dict
    - Tách 2 cột bằng KMeans(n_clusters=2) theo x-mid của block
    - Khối full-width (rộng > full_width_ratio * page_width) được chèn theo Y0
    - Xuất 1 cột: trái -> phải, chèn full-width theo vị trí Y
    """
    W = page.rect.width
    data = page.get_text("dict")
    blocks = data.get("blocks", [])

    # Thu thập block text + bbox
    items = []
    for b in blocks:
        # type=0: text; type=1: image
        if b.get("type") != 0:
            continue
        text = _block_text_from_dict(b).strip()
        if not text:
            continue
        x0, y0, x1, y1 = b.get("bbox", [0,0,0,0])
        w = max(1e-6, x1 - x0)
        xmid = (x0 + x1) / 2.0
        items.append({
            "text": _normalize_ws(text),
            "x0": x0, "y0": y0, "x1": x1, "y1": y1,
            "w": w, "xmid": xmid,
            "full": (w / W) >= full_width_ratio
        })

    if not items:
        return _normalize_ws(page.get_text("text") or "")

    # Tách full-width vs non-full
    full_blocks = [it for it in items if it["full"]]
    non_full   = [it for it in items if not it["full"]]

    # Nếu thực ra là 1 cột → trả text thô
    if len(non_full) <= 2:
        raw = page.get_text("text") or "\n".join(it["text"] for it in items)
        return _normalize_ws(raw)

    # KMeans trên xmid cho non_full (2 cột)
    X = [[it["xmid"]] for it in non_full]
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
    labels = kmeans.labels_

    # Xác định cột trái/phải theo tâm cụm
    centers = kmeans.cluster_centers_.flatten().tolist()
    left_label  = centers.index(min(centers))
    right_label = 1 - left_label

    left  = [it for it, lb in zip(non_full, labels) if lb == left_label]
    right = [it for it, lb in zip(non_full, labels) if lb == right_label]

    # Sort từng cột theo (y0, x0)
    left.sort(key=lambda it: (it["y0"], it["x0"]))
    right.sort(key=lambda it: (it["y0"], it["x0"]))

    def join_blocks(arr):
        sep = "\n\n" if join_with_blank else "\n"
        return sep.join([it["text"] for it in arr if it["text"]])

    left_text  = join_blocks(left)
    right_text = join_blocks(right)

    # Chèn full-width theo vị trí Y (trước khi chuyển sang cột phải)
    # Quy ước: full-width có y0 < median_y của cột trái → chèn TRƯỚC left,
    # nằm giữa phạm vi 2 cột → chèn giữa left và right, còn lại → chèn SAU right.
    if left:
        left_y_min, left_y_max = left[0]["y0"], left[-1]["y0"]
    else:
        left_y_min, left_y_max = 0, 0
    if right:
        right_y_min, right_y_max = right[0]["y0"], right[-1]["y0"]
    else:
        right_y_min, right_y_max = 0, 0

    mid_y = (left_y_max + right_y_min) / 2.0 if right else left_y_max

    before_fw = [fw for fw in full_blocks if fw["y0"] < left_y_min]
    middle_fw = [fw for fw in full_blocks if left_y_min <= fw["y0"] <= (right_y_min if right else left_y_max)]
    after_fw  = [fw for fw in full_blocks if fw not in before_fw and fw not in middle_fw]

    # Kết quả 1 cột: (full trước) + LEFT + (full giữa) + RIGHT + (full sau)
    pieces = []
    if before_fw:
        pieces.append("\n\n".join(fw["text"] for fw in sorted(before_fw, key=lambda it: it["y0"])))
    if left_text:
        pieces.append(left_text)
    if middle_fw:
        pieces.append("\n\n".join(fw["text"] for fw in sorted(middle_fw, key=lambda it: it["y0"])))
    if right_text:
        pieces.append(right_text)
    if after_fw:
        pieces.append("\n\n".join(fw["text"] for fw in sorted(after_fw, key=lambda it: it["y0"])))

    final = "\n\n".join([p for p in pieces if p]).strip()
    # fallback nếu vẫn quá ngắn (trường hợp hiếm)
    if len(final) < 50:
        return _normalize_ws(page.get_text("text") or join_blocks(items))
    return final


In [None]:
from tqdm import tqdm
from pathlib import Path

# đảm bảo OUTPUT_DIR tồn tại
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

pdfs = sorted([p for p in INPUT_DIR.glob("*.pdf") if p.is_file()])
print(f"Found {len(pdfs)} PDFs")

total_pages = 0
for idx, pdf_path in enumerate(tqdm(pdfs, desc="Extracting"), 1):
    try:
        doc = fitz.open(pdf_path)
        parts = []
        for i, page in enumerate(doc, 1):
            # 1) cố gắng tuyến tính hóa 2 cột bằng KMeans
            text = extract_page_text_columns_kmeans(page)

            # 2) fallback rất nhẹ: nếu còn quá ngắn, lấy raw text của PyMuPDF
            if len(text) < 50:
                text = (page.get_text("text") or "").strip()

            parts.append(f"=== PAGE {i} ===\n{text}\n")

        out_txt = "\n".join(parts).strip() + "\n"
        out_path = OUTPUT_DIR / (pdf_path.stem + ".txt")
        out_path.write_text(out_txt, encoding="utf-8")

        total_pages += len(doc)
    except Exception as e:
        print(f"[ERROR] {pdf_path.name}: {e}")

print(f"Done. Wrote {len(pdfs)} .txt files, {total_pages} pages total.")


Found 93 PDFs


Extracting: 100%|██████████| 93/93 [00:39<00:00,  2.37it/s]

Done. Wrote 93 .txt files, 261 pages total.





In [None]:
!pip -q install pdfplumber

import pdfplumber
import itertools

def _to_markdown_table(rows):
    """
    rows: list[list[str]] from pdfplumber.extract_tables()
    build a simple Markdown table:
    | col1 | col2 | ... |
    | ---  | ---  | ... |
    | ...  | ...  | ... |
    """
    # sanitize & drop fully-empty columns
    # transpose to find empty columns
    cols = list(itertools.zip_longest(*rows, fillvalue=""))
    keep_idx = [j for j, col in enumerate(cols) if any((c or "").strip() for c in col)]
    cleaned = []
    for r in rows:
        cleaned.append([ (r[j] or "").strip() for j in keep_idx ])

    if not cleaned:
        return None

    # header: use first row if it looks like header, else synthesize
    header = cleaned[0]
    # if header cells too empty, synthesize generic names
    if sum(1 for c in header if c) < max(1, len(header)//2):
        header = [f"Col{j+1}" for j in range(len(header))]
    body = cleaned[1:] if len(cleaned) > 1 else []

    def row_to_md(r): return "| " + " | ".join(r) + " |"
    md = [
        row_to_md(header),
        "| " + " | ".join(["---"]*len(header)) + " |",
    ] + [row_to_md(r) for r in body]
    return "\n".join(md)

def extract_tables_markdown_for_page(pdf_path, page_index):
    """
    Try to extract all tables on a given page via pdfplumber.
    Return a concatenated markdown string (or "" if none).
    """
    md_sections = []
    with pdfplumber.open(pdf_path) as pdf:
        if page_index >= len(pdf.pages):
            return ""
        p = pdf.pages[page_index]
        tables = p.extract_tables(table_settings={
            "vertical_strategy": "lines",     # dùng line-based nếu có lưới
            "horizontal_strategy": "lines",
            "intersection_tolerance": 5,
        }) or []
        # nếu không có lưới, thử stream mode
        if not tables:
            tables = p.extract_tables(table_settings={
                "vertical_strategy": "text",
                "horizontal_strategy": "text",
                "snap_tolerance": 3,
                "join_tolerance": 3,
                "edge_min_length": 20,
                "min_words_vertical": 2,
                "min_words_horizontal": 2,
                "keep_blank_chars": False,
            }) or []

        for idx, rows in enumerate(tables, 1):
            if not rows or sum(len([c for c in r if (c or '').strip()]) for r in rows) < 6:
                continue  # quá ít ô có nội dung → bỏ
            md = _to_markdown_table(rows)
            if md:
                md_sections.append(f"**[Extracted Table {idx}]**\n{md}")
    return "\n\n".join(md_sections)


In [None]:
from tqdm import tqdm
from pathlib import Path

pdfs = sorted([p for p in INPUT_DIR.glob("*.pdf") if p.is_file()])
print(f"Found {len(pdfs)} PDFs")

total_pages = 0
for pdf_path in tqdm(pdfs, desc="Extracting"):
    try:
        doc = fitz.open(pdf_path)
        parts = []
        for i, page in enumerate(doc, 1):
            # 1) text: tuyến tính hóa 2 cột
            body = extract_page_text_columns_kmeans(page)
            if len(body) < 50:
                body = (page.get_text("text") or "").strip()

            # 2) tables: trích bảng bằng pdfplumber → Markdown
            tables_md = extract_tables_markdown_for_page(str(pdf_path), i-1)
            if tables_md:
                page_text = f"=== PAGE {i} ===\n{body}\n\n[Tables]\n{tables_md}\n"
            else:
                page_text = f"=== PAGE {i} ===\n{body}\n"

            parts.append(page_text)

        out_txt = "\n".join(parts).strip() + "\n"
        (OUTPUT_DIR / (pdf_path.stem + ".txt")).write_text(out_txt, encoding="utf-8")
        total_pages += len(doc)
    except Exception as e:
        print(f"[ERROR] {pdf_path.name}: {e}")

print(f"Done. Wrote {len(pdfs)} files to {OUTPUT_DIR}, {total_pages} pages total.")


Found 93 PDFs


Extracting: 100%|██████████| 93/93 [01:15<00:00,  1.24it/s]

Done. Wrote 93 files to /content/drive/MyDrive/Project-AI/Data/raw_text_data_2, 261 pages total.





In [None]:
OUTPUT_DIR = Path("/content/drive/MyDrive/Project-AI/Data/raw_text_data_3")

In [None]:
!pip -q install pymupdf scikit-learn pdfplumber

import fitz, re
from sklearn.cluster import KMeans
import pdfplumber
import itertools

def _clean_spaces(s: str) -> str:
    s = s.replace('\r', '')
    s = re.sub(r'[ \t]+', ' ', s)
    s = re.sub(r'\n{3,}', '\n\n', s)
    return s.strip()

def extract_page_text_words_kmeans(page: "fitz.Page", join_paragraph_gap=6.0) -> str:
    """
    Dùng word-level để tái tạo văn bản 2 cột → 1 cột (trái rồi phải).
    join_paragraph_gap: nếu khoảng cách y giữa các dòng > gap ⇒ chèn dòng trống.
    """
    W = page.rect.width
    words = page.get_text("words")  # [(x0,y0,x1,y1,"word", block,line,word_no), ...]
    if not words:
        return _clean_spaces(page.get_text("text") or "")

    # Loại bỏ từ trống, giữ bbox + token
    items = []
    for w in words:
        if len(w) >= 8 and w[4].strip():
            x0,y0,x1,y1,token,blk,ln,wn = w[:8]
            xmid = (x0 + x1)/2.0
            items.append({"x0":x0,"y0":y0,"x1":x1,"y1":y1,"t":token,"blk":blk,"ln":ln,"wn":wn,"xmid":xmid})

    if not items:
        return _clean_spaces(page.get_text("text") or "")

    # Phân cụm 2 cột theo xmid (nếu thực ra 1 cột thì 2 cụm sẽ chồng nhau → fallback)
    X = [[it["xmid"]] for it in items]
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_.flatten().tolist()
    if abs(centers[0]-centers[1]) < 0.1*W:  # coi như 1 cột
        labels = [0]*len(items)
        centers = [centers[0], centers[0]]

    left_label = centers.index(min(centers))
    right_label = 1 - left_label

    left  = [it for it,lb in zip(items,labels) if lb==left_label]
    right = [it for it,lb in zip(items,labels) if lb==right_label]

    def compose(col_items):
        # nhóm theo (blk, ln) để tạo "dòng"; trong dòng sắp theo x0
        lines = {}
        for it in col_items:
            key = (it["blk"], it["ln"])
            lines.setdefault(key, []).append(it)
        # sắp các dòng theo y0 (và blk) để giữ thứ tự trên xuống
        ordered = sorted(lines.items(), key=lambda kv: (min(x["y0"] for x in kv[1]), kv[0][0], kv[0][1]))
        out_lines, prev_y = [], None
        for _, arr in ordered:
            arr.sort(key=lambda x: x["x0"])
            text = " ".join(tok["t"] for tok in arr)
            y = min(x["y0"] for x in arr)
            if prev_y is not None and abs(y - prev_y) > join_paragraph_gap:
                out_lines.append("")  # dòng trống giữa đoạn
            out_lines.append(text)
            prev_y = y
        # gộp các dòng, làm sạch nhẹ
        return _clean_spaces("\n".join(out_lines))

    left_text  = compose(left)  if left  else ""
    right_text = compose(right) if right else ""
    # Nếu chỉ 1 cột thực sự
    if not right_text or not left_text:
        return _clean_spaces(page.get_text("text") or (left_text + "\n" + right_text))

    return (left_text + "\n\n" + right_text).strip()


In [None]:
def _to_markdown_table(rows):
    cols = list(itertools.zip_longest(*rows, fillvalue=""))
    keep_idx = [j for j,col in enumerate(cols) if any((c or "").strip() for c in col)]
    cleaned = [[(r[j] or "").strip() for j in keep_idx] for r in rows if r]
    if not cleaned: return None
    header = cleaned[0]
    if sum(1 for c in header if c) < max(1, len(header)//2):
        header = [f"Col{j+1}" for j in range(len(header))]
        body = cleaned
    else:
        body = cleaned[1:]

    def mdrow(r): return "| " + " | ".join(r) + " |"
    return "\n".join([mdrow(header), "| " + " | ".join(["---"]*len(header)) + " |"] + [mdrow(r) for r in body])

def extract_tables_markdown_for_page_safe(pdf_path, page_index, min_cells=9):
    md_sections = []
    with pdfplumber.open(pdf_path) as pdf:
        if page_index >= len(pdf.pages): return ""
        p = pdf.pages[page_index]
        page_text = (p.extract_text() or "").lower()
        # chỉ thử nếu có tín hiệu là bảng
        if ("table" not in page_text) and ("laboratory" not in page_text):
            return ""

        # thử lines trước, rồi stream
        candidates = []
        for settings in (
            {"vertical_strategy":"lines","horizontal_strategy":"lines","intersection_tolerance":5},
            {"vertical_strategy":"text","horizontal_strategy":"text","snap_tolerance":3,"join_tolerance":3,
             "edge_min_length":20,"min_words_vertical":2,"min_words_horizontal":2}
        ):
            tables = p.extract_tables(table_settings=settings) or []
            for rows in tables:
                # lọc: số ô có chữ + ít nhất 3 cột
                cell_count = sum(1 for r in rows for c in r if (c or "").strip())
                ncols = max(len(r) for r in rows) if rows else 0
                hdr = " ".join((rows[0] or [])).lower() if rows else ""
                looks_lab = any(k in hdr for k in ["parameter","patient","reference","range"])
                if cell_count >= min_cells and ncols >= 3 and looks_lab:
                    candidates.append(rows)

        for idx, rows in enumerate(candidates, 1):
            md = _to_markdown_table(rows)
            if md:
                md_sections.append(f"**[Extracted Table {idx}]**\n{md}")
    return "\n\n".join(md_sections)


In [None]:
from tqdm import tqdm
from pathlib import Path

pdfs = sorted([p for p in INPUT_DIR.glob("*.pdf") if p.is_file()])
print(f"Found {len(pdfs)} PDFs")

total_pages = 0
for pdf_path in tqdm(pdfs, desc="Extracting"):
    try:
        doc = fitz.open(pdf_path)
        parts = []
        for i, page in enumerate(doc, 1):
            # văn bản 2 cột → 1 cột bằng words
            body = extract_page_text_words_kmeans(page)
            if len(body) < 50:
                body = (page.get_text("text") or "").strip()

            # bảng: chỉ khi chắc chắn là bảng labs
            tables_md = extract_tables_markdown_for_page_safe(str(pdf_path), i-1)
            if tables_md:
                page_text = f"=== PAGE {i} ===\n{body}\n\n[Tables]\n{tables_md}\n"
            else:
                page_text = f"=== PAGE {i} ===\n{body}\n"

            parts.append(page_text)

        out_txt = "\n".join(parts).strip() + "\n"
        (OUTPUT_DIR / (pdf_path.stem + ".txt")).write_text(out_txt, encoding="utf-8")
        total_pages += len(doc)
    except Exception as e:
        print(f"[ERROR] {pdf_path.name}: {e}")

print(f"Done. Wrote {len(pdfs)} files to {OUTPUT_DIR}, {total_pages} pages total.")


Found 93 PDFs


Extracting:   1%|          | 1/93 [00:00<00:47,  1.92it/s]

[ERROR] 1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.pdf: sequence item 1: expected str instance, NoneType found


Extracting:   2%|▏         | 2/93 [00:00<00:41,  2.21it/s]

[ERROR] 10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-.pdf: sequence item 1: expected str instance, NoneType found


Extracting:   3%|▎         | 3/93 [00:03<01:58,  1.32s/it]

[ERROR] 11---A-45-Year-Old-Male-Security-Guard-from-Malawi-_2022_Clinical-Cases-in-T.pdf: sequence item 1: expected str instance, NoneType found


Extracting:   5%|▌         | 5/93 [00:06<02:04,  1.42s/it]

[ERROR] 13---A-16-Year-Old-Girl-from-Malawi-With-Fever_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:   6%|▋         | 6/93 [00:07<01:34,  1.08s/it]

[ERROR] 14---A-22-Year-Old-Woman-from-Bangladesh-With-P_2022_Clinical-Cases-in-Tropi.pdf: sequence item 1: expected str instance, NoneType found


Extracting:   8%|▊         | 7/93 [00:07<01:24,  1.02it/s]

[ERROR] 15---A-3-Year-Old-Boy-from-Laos-With-Right-Sup_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:   9%|▊         | 8/93 [00:08<01:15,  1.13it/s]

[ERROR] 16---A-25-Year-Old-Female-School-Teacher-from-Malaw_2022_Clinical-Cases-in-T.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  13%|█▎        | 12/93 [00:15<02:07,  1.57s/it]

[ERROR] 20---A-43-Year-Old-Male-Traveller-Returning-from-M_2022_Clinical-Cases-in-Tr.pdf: sequence item 0: expected str instance, NoneType found


Extracting:  14%|█▍        | 13/93 [00:16<01:55,  1.45s/it]

[ERROR] 21---A-35-Year-Old-American-Man-With-Fatigue-_2022_Clinical-Cases-in-Tropica.pdf: sequence item 3: expected str instance, NoneType found


Extracting:  15%|█▌        | 14/93 [00:17<01:30,  1.15s/it]

[ERROR] 22---32-Year-Old-Woman-from-Nigeria-With-Jaund_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  16%|█▌        | 15/93 [00:17<01:10,  1.11it/s]

[ERROR] 23---A-31-Year-Old-HIV-Positive-Business-Traveller-W_2022_Clinical-Cases-in-.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  19%|█▉        | 18/93 [00:20<00:58,  1.28it/s]

[ERROR] 26---A-14-Year-Old-Boy-from-Malawi-Who-Has-Bee_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  20%|██        | 19/93 [00:22<01:37,  1.31s/it]

[ERROR] 27---A-16-Year-Old-Boy-from-Sri-Lanka-With-Fever_2022_Clinical-Cases-in-Trop.pdf: sequence item 3: expected str instance, NoneType found


Extracting:  22%|██▏       | 20/93 [00:22<01:13,  1.01s/it]

[ERROR] 28---A-67-Year-Old-Female-Expatriate-Living-in-Came_2022_Clinical-Cases-in-T.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  23%|██▎       | 21/93 [00:23<01:03,  1.14it/s]

[ERROR] 29---A-35-Year-Old-Woman-from-Malawi-With-Feve_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  24%|██▎       | 22/93 [00:23<00:49,  1.45it/s]

[ERROR] 3---A-26-Year-Old-Woman-from-Malawi-with-Headache-_2022_Clinical-Cases-in-Tr.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  25%|██▍       | 23/93 [00:24<00:39,  1.76it/s]

[ERROR] 30---A-12-Year-Old-Boy-from-Rural-Kenya-With_2022_Clinical-Cases-in-Tropical.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  27%|██▋       | 25/93 [00:25<00:52,  1.29it/s]

[ERROR] 32---A-44-Year-Old-Male-Farmer-from-Laos-With-Di_2022_Clinical-Cases-in-Trop.pdf: sequence item 3: expected str instance, NoneType found


Extracting:  29%|██▉       | 27/93 [00:27<00:46,  1.41it/s]

[ERROR] 33---A-53-Year-Old-Man-from-Malawi-With-a-C_2022_Clinical-Cases-in-Tropical-.pdf: sequence item 3: expected str instance, NoneType found
[ERROR] 34---A-35-Year-Old-Male-Farmer-from-Peru-With-a-Chro_2022_Clinical-Cases-in-.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  31%|███       | 29/93 [00:28<00:44,  1.43it/s]

[ERROR] 36---A-23-Year-Old-Farmer-from-Myanmar-With-Uni_2022_Clinical-Cases-in-Tropi.pdf: sequence item 3: expected str instance, NoneType found


Extracting:  33%|███▎      | 31/93 [00:29<00:29,  2.10it/s]

[ERROR] 37---A-29-Year-Old-Woman-from-Malawi-With-Confusi_2022_Clinical-Cases-in-Tro.pdf: sequence item 0: expected str instance, NoneType found
[ERROR] 38---A-24-Year-Old-Female-Globetrotter-With-Strange_2022_Clinical-Cases-in-T.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  35%|███▌      | 33/93 [00:29<00:19,  3.12it/s]

[ERROR] 39---A-30-Year-Old-Male-Chinese-Trader-With-_2022_Clinical-Cases-in-Tropical.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 4---A-4-Year-Old-Girl-from-Uganda-in-a-_2022_Clinical-Cases-in-Tropical-Medi.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  38%|███▊      | 35/93 [00:30<00:21,  2.67it/s]

[ERROR] 41---A-7-Year-Old-Girl-from-West-Africa-With-Two-Ski_2022_Clinical-Cases-in-.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  42%|████▏     | 39/93 [00:31<00:15,  3.38it/s]

[ERROR] 44---A-7-Year-Old-Girl-from-South-Sudan-With-_2022_Clinical-Cases-in-Tropica.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 45---A-2-Month-Old-Girl-from-Laos-With-Dyspnoea-_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  43%|████▎     | 40/93 [00:31<00:13,  3.86it/s]

[ERROR] 46---A-45-Year-Old-Man-from-Sri-Lanka-With-Fever-_2022_Clinical-Cases-in-Tro.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  45%|████▌     | 42/93 [00:32<00:12,  3.96it/s]

[ERROR] 47---A-32-Year-Old-Man-from-Malawi-With-a-Pain_2022_Clinical-Cases-in-Tropic.pdf: sequence item 3: expected str instance, NoneType found
[ERROR] 48---A-31-Year-Old-Woman-from-Tanzania-With-Acu_2022_Clinical-Cases-in-Tropi.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  47%|████▋     | 44/93 [00:33<00:11,  4.28it/s]

[ERROR] 49---A-33-Year-Old-Male-Traveller-to-India-With-Di_2022_Clinical-Cases-in-Tr.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 5---A-4-Year-Old-Boy-from-Laos-With-a-Lesion-o_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  49%|████▉     | 46/93 [00:33<00:08,  5.32it/s]

[ERROR] 50---A-24-Year-Old-Man-of-Turkish-Origin-With-Jau_2022_Clinical-Cases-in-Tro.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 51---A-34-Year-Old-HIV-Positive-Woman-from-Malawi-W_2022_Clinical-Cases-in-T.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  52%|█████▏    | 48/93 [00:33<00:07,  5.73it/s]

[ERROR] 52---A-56-Year-Old-Man-from-Peru-With-Prolonged-_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 53---A-24-Year-Old-Woman-from-Uganda-With-Fe_2022_Clinical-Cases-in-Tropical.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  53%|█████▎    | 49/93 [00:33<00:09,  4.49it/s]

[ERROR] 54---A-52-Year-Old-Male-Safari-Tourist-Returning-fro_2022_Clinical-Cases-in-.pdf: sequence item 0: expected str instance, NoneType found


Extracting:  54%|█████▍    | 50/93 [00:34<00:12,  3.46it/s]

[ERROR] 55---A-40-Year-Old-Male-Farmer-from-Peru-With-Ch_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  56%|█████▌    | 52/93 [00:34<00:10,  3.91it/s]

[ERROR] 56---A-21-Year-Old-Pregnant-Woman-from-The-Ga_2022_Clinical-Cases-in-Tropica.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 57---A-37-Year-Old-Woman-from-Malawi-With-H_2022_Clinical-Cases-in-Tropical-.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  59%|█████▉    | 55/93 [00:36<00:14,  2.64it/s]

[ERROR] 6---A-36-Year-Old-Male-Traveller-Returning-from-B_2022_Clinical-Cases-in-Tro.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  61%|██████▏   | 57/93 [00:36<00:10,  3.45it/s]

[ERROR] 61---A-48-Year-Old-Woman-from-Thailand-With-Fever-_2022_Clinical-Cases-in-Tr.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  62%|██████▏   | 58/93 [00:37<00:13,  2.67it/s]

[ERROR] 62---A-28-Year-Old-Man-from-Ghana-With-a-Chron_2022_Clinical-Cases-in-Tropic.pdf: sequence item 0: expected str instance, NoneType found


Extracting:  65%|██████▍   | 60/93 [00:38<00:15,  2.07it/s]

[ERROR] 64---A-40-Year-Old-Woman-from-Thailand-and-Her-Bro_2022_Clinical-Cases-in-Tr.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  67%|██████▋   | 62/93 [00:39<00:12,  2.55it/s]

[ERROR] 66---A-32-Year-Old-Man-from-Malawi-With-Pain-in-the-_2022_Clinical-Cases-in-.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  68%|██████▊   | 63/93 [00:39<00:10,  2.85it/s]

[ERROR] 67---A-24-Year-Old-Woman-from-the-Peruvian-Andes-_2022_Clinical-Cases-in-Tro.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  69%|██████▉   | 64/93 [00:40<00:16,  1.79it/s]

[ERROR] 68---A-31-Year-Old-Woman-from-Malawi-With-a-Gene_2022_Clinical-Cases-in-Trop.pdf: sequence item 3: expected str instance, NoneType found


Extracting:  70%|██████▉   | 65/93 [00:40<00:13,  2.15it/s]

[ERROR] 69---A-22-Year-Old-Male-Farmer-from-Rural-Ethiop_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  73%|███████▎  | 68/93 [00:42<00:08,  2.80it/s]

[ERROR] 70---A-58-Year-Old-Woman-from-Sri-Lanka-With-Fev_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 71---A-71-Year-Old-Man-from-Japan-With-Eosinophili_2022_Clinical-Cases-in-Tr.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  75%|███████▌  | 70/93 [00:42<00:08,  2.78it/s]

[ERROR] 72---A-4-Year-Old-Boy-from-Mozambique-With-Sever_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 73---A-21-Year-Old-Male-Migrant-from-Rural-Mali-_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  77%|███████▋  | 72/93 [00:43<00:07,  2.85it/s]

[ERROR] 74---A-28-Year-Old-Woman-from-Sierra-Leone-With-_2022_Clinical-Cases-in-Trop.pdf: sequence item 3: expected str instance, NoneType found
[ERROR] 75---A-25-Year-Old-Woman-from-Zambia-With-a-N_2022_Clinical-Cases-in-Tropica.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  78%|███████▊  | 73/93 [00:43<00:05,  3.39it/s]

[ERROR] 76---A-55-Year-Old-Woman-from-Turkey-With-Feve_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  81%|████████  | 75/93 [00:44<00:04,  3.66it/s]

[ERROR] 77---A-51-Year-Old-Female-Traveller-Returning-from-Cen_2022_Clinical-Cases-i.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 78---A-42-Year-Old-British-Man-Living-in-Malawi_2022_Clinical-Cases-in-Tropi.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  82%|████████▏ | 76/93 [00:44<00:05,  3.24it/s]

[ERROR] 79---A-34-Year-Old-Male-Immigrant-from-Peru-With-Ch_2022_Clinical-Cases-in-T.pdf: sequence item 0: expected str instance, NoneType found


Extracting:  85%|████████▍ | 79/93 [00:45<00:03,  4.41it/s]

[ERROR] 80---A-62-Year-Old-Man-from-Thailand-With-a_2022_Clinical-Cases-in-Tropical-.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 81---A-33-Year-Old-Refugee-from-Afghanistan-With-_2022_Clinical-Cases-in-Tro.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  86%|████████▌ | 80/93 [00:45<00:02,  4.54it/s]

[ERROR] 82---A-31-Year-Old-Man-from-Guatemala-With-Acute-_2022_Clinical-Cases-in-Tro.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  88%|████████▊ | 82/93 [00:46<00:03,  2.98it/s]

[ERROR] 83---An-18-Year-Old-Man-from-India-With-a-Pale-P_2022_Clinical-Cases-in-Trop.pdf: sequence item 0: expected str instance, NoneType found
[ERROR] 84---A-64-Year-Old-Japanese-Man-With-Generalize_2022_Clinical-Cases-in-Tropi.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  92%|█████████▏| 86/93 [00:48<00:02,  2.94it/s]

[ERROR] 87---A-27-Year-Old-Male-Traveller-Returning-from-the_2022_Clinical-Cases-in-.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 88---A-74-Year-Old-Man-from-Japan-With-Fever--_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  95%|█████████▍| 88/93 [00:48<00:01,  3.93it/s]

[ERROR] 89---A-30-Year-Old-Woman-from-Bolivia-With-Ex_2022_Clinical-Cases-in-Tropica.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 9---A-52-Year-Old-Man-from-Vietnam-With-Evo_2022_Clinical-Cases-in-Tropical-.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  97%|█████████▋| 90/93 [00:48<00:00,  4.90it/s]

[ERROR] 90---A-55-Year-Old-Couple-Both-Returning-from-Chile-a_2022_Clinical-Cases-in.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 91---A-20-Year-Old-Male-from-India-With-Fever-_2022_Clinical-Cases-in-Tropic.pdf: sequence item 1: expected str instance, NoneType found


Extracting:  98%|█████████▊| 91/93 [00:49<00:00,  4.99it/s]

[ERROR] 92---A-42-Year-Old-Traveller-Returning-from-Thaila_2022_Clinical-Cases-in-Tr.pdf: sequence item 1: expected str instance, NoneType found


Extracting: 100%|██████████| 93/93 [00:49<00:00,  1.87it/s]

[ERROR] 93---A-35-Year-Old-Male-Logger-from-Peru-With-Fe_2022_Clinical-Cases-in-Trop.pdf: sequence item 1: expected str instance, NoneType found
[ERROR] 94---A-20-Year-Old-Woman-from-the-Democratic-Republic-_2022_Clinical-Cases-i.pdf: sequence item 1: expected str instance, NoneType found
Done. Wrote 93 files to /content/drive/MyDrive/Project-AI/Data/raw_text_data_3, 51 pages total.





#copy disease images to another dir

In [None]:
from pathlib import Path
import shutil

SRC_ROOT = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")
DST_ROOT = Path("/content/drive/MyDrive/Project-AI/Data/diseases_extracted_images")

# cấu hình
EXTS = {".png"}          # thêm {'.jpg', '.jpeg'} nếu cần
OVERWRITE = True         # False = bỏ qua nếu file đã tồn tại

DST_ROOT.mkdir(parents=True, exist_ok=True)

cases = sorted([p for p in SRC_ROOT.iterdir() if p.is_dir()])
total_imgs = 0
processed_cases = 0

for case_dir in cases:
    src_img_dir = case_dir / "images"
    if not src_img_dir.exists():
        continue

    dst_case_dir = DST_ROOT / case_dir.name
    dst_case_dir.mkdir(parents=True, exist_ok=True)

    imgs = sorted([p for p in src_img_dir.iterdir() if p.is_file() and p.suffix.lower() in EXTS])
    copied = 0
    for img in imgs:
        dst_path = dst_case_dir / img.name
        if dst_path.exists() and not OVERWRITE:
            continue
        shutil.copy2(img, dst_path)  # giữ metadata thời gian
        copied += 1
        total_imgs += 1

    processed_cases += 1
    print(f"[{case_dir.name}] copied {copied} files -> {dst_case_dir}")

print(f"\nDone. Processed {processed_cases} case folders, copied {total_imgs} images to {DST_ROOT}")


[001] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/001
[002] copied 3 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/002
[003] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/003
[004] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/004
[005] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/005
[006] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/006
[007] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/007
[008] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/008
[009] copied 0 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/009
[010] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/diseases_extracted_images/010
[011] copied 2 files -> /content/drive/MyDrive/Project-AI/Data/disease

# copy raw_text_data to extracted_data dir


In [None]:
from pathlib import Path
import shutil, re

SRC_DIR = Path("/content/drive/MyDrive/Project-AI/Data/raw_text_data")
DST_ROOT = Path("/content/drive/MyDrive/Project-AI/Data/extracted_data")

# Chỉ copy, không move
OVERWRITE = True   # đặt False nếu không muốn ghi đè khi file đã tồn tại

# regex: lấy số ở đầu trước dấu -- hoặc ---
PREFIX_RE = re.compile(r"^(\d+)\s*[-]{2,}")

txt_files = sorted([p for p in SRC_DIR.glob("*.txt") if p.is_file()])

copied, skipped, errors = 0, 0, 0
for src in txt_files:
    m = PREFIX_RE.match(src.name)
    if not m:
        print(f"[SKIP] No numeric prefix: {src.name}")
        skipped += 1
        continue

    case_num = int(m.group(1))
    case3 = f"{case_num:03d}"

    dst_dir = DST_ROOT / case3 / "raw_text_data"
    dst_dir.mkdir(parents=True, exist_ok=True)

    dst_file = dst_dir / f"{case3}_raw_text.txt"

    try:
        if dst_file.exists() and not OVERWRITE:
            print(f"[SKIP] Exists: {dst_file}")
            skipped += 1
        else:
            shutil.copy2(src, dst_file)  # COPY (giữ metadata), không xoá nguồn
            copied += 1
            print(f"[OK] {src.name} -> {dst_file}")
    except Exception as e:
        errors += 1
        print(f"[ERROR] {src.name}: {e}")

print(f"\nDone. Copied: {copied}, Skipped: {skipped}, Errors: {errors}")
print(f"Destination root: {DST_ROOT}")


[OK] 1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.txt -> /content/drive/MyDrive/Project-AI/Data/extracted_data/001/raw_text_data/001_raw_text.txt
[OK] 10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-.txt -> /content/drive/MyDrive/Project-AI/Data/extracted_data/010/raw_text_data/010_raw_text.txt
[OK] 11---A-45-Year-Old-Male-Security-Guard-from-Malawi-_2022_Clinical-Cases-in-T.txt -> /content/drive/MyDrive/Project-AI/Data/extracted_data/011/raw_text_data/011_raw_text.txt
[OK] 12---A-29-Year-Old-Man-from-The-Gambia-With-G_2022_Clinical-Cases-in-Tropica.txt -> /content/drive/MyDrive/Project-AI/Data/extracted_data/012/raw_text_data/012_raw_text.txt
[OK] 13---A-16-Year-Old-Girl-from-Malawi-With-Fever_2022_Clinical-Cases-in-Tropic.txt -> /content/drive/MyDrive/Project-AI/Data/extracted_data/013/raw_text_data/013_raw_text.txt
[OK] 14---A-22-Year-Old-Woman-from-Bangladesh-With-P_2022_Clinical-Cases-in-Tropi.txt -> /content/drive/MyDriv

# Recreate manifest.json from pdf-files

In [None]:
# ============== Rebuild manifests for all cases ==============
from pathlib import Path
from datetime import datetime, timezone
from PIL import Image
import hashlib, json, re

DATA_ROOT      = Path("/content/drive/MyDrive/Project-AI/Data")
EXTRACTED_DIR  = DATA_ROOT / "extracted_data"
PDF_DATA_DIR   = DATA_ROOT / "pdf-data"   # có thể thiếu -> OK

def _now_iso():
    return datetime.now(timezone.utc).isoformat()

def _sha256(path: Path):
    try:
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1<<20), b""):
                h.update(chunk)
        return h.hexdigest()
    except Exception:
        return None

def _find_pdf(case_id: str):
    """Tìm PDF theo pattern 'NNN---*.pdf'. Trả về (path|None, title|None, size|None, sha|None)."""
    try:
        pat = f"{int(case_id):03d}---*.pdf"
    except:
        pat = f"{case_id}---*.pdf"
    files = sorted(PDF_DATA_DIR.glob(pat)) if PDF_DATA_DIR.exists() else []
    if not files:
        return None, None, None, None
    pdf = files[0]
    # derive title từ tên file
    title = re.sub(r"^\d+\s*---\s*", "", pdf.stem)
    title = title.replace("_", " ")
    title = re.sub(r"\s+", " ", title).strip(" -_")
    try:
        size = pdf.stat().st_size
    except Exception:
        size = None
    sha = _sha256(pdf)
    return str(pdf), title, size, sha

def _images_meta(images_dir: Path):
    metas = []
    if not images_dir.exists():
        return metas
    for img in sorted(images_dir.glob("*.png")):
        try:
            with Image.open(img) as im:
                w, h = im.size
        except Exception:
            w = h = None
        metas.append({
            "file": str(img),
            "width": w, "height": h,
            "figure_id": img.stem,
        })
    return metas

def rebuild_manifests():
    cases = sorted([p for p in EXTRACTED_DIR.iterdir() if p.is_dir() and p.name.isdigit()])
    print(f"Found {len(cases)} case folders in {EXTRACTED_DIR}")
    ok = 0
    for case_dir in cases:
        cid = case_dir.name
        pages_dir = case_dir / "pages"
        images_dir = case_dir / "images"
        text_dir   = case_dir / "raw_text_data"

        page_files = sorted(pages_dir.glob("*.png")) if pages_dir.exists() else []
        n_pages = len(page_files)

        source_pdf, case_title, pdf_size, pdf_sha = _find_pdf(cid)

        manifest = {
            "case_id": cid,
            "case_title": case_title,                 # None nếu không tìm ra PDF
            "source_pdf": source_pdf,                 # None nếu không có
            "pages_dir": str(pages_dir) if pages_dir.exists() else None,
            "images_dir": str(images_dir) if images_dir.exists() else None,
            "text_dir":   str(text_dir) if text_dir.exists() else None,
            "n_pages": n_pages,
            "page_files": [str(p) for p in page_files],          # để truy vết nhanh
            "image_files": [str(p) for p in sorted(images_dir.glob('*.png'))] if images_dir.exists() else [],
            "text_files":  [str(p) for p in sorted(text_dir.glob('*.txt'))] if text_dir.exists() else [],
            "images_meta": _images_meta(images_dir),
            "pdf_filesize": pdf_size,
            "pdf_sha256": pdf_sha,
            "created_at": _now_iso(),
            "updated_at": _now_iso(),
            # giữ chỗ cho pipeline sau (không có thì để None/[])
            "extract_model": None,
            "prompt_version": None,
            "errors": []
        }

        out = case_dir / "manifest.json"
        out.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
        ok += 1
        print(f"✓ {cid} → {out}")

    print(f"\nDONE: rebuilt {ok}/{len(cases)} manifests.")

rebuild_manifests()


Found 93 case folders in /content/drive/MyDrive/Project-AI/Data/extracted_data
✓ 001 → /content/drive/MyDrive/Project-AI/Data/extracted_data/001/manifest.json
✓ 002 → /content/drive/MyDrive/Project-AI/Data/extracted_data/002/manifest.json
✓ 003 → /content/drive/MyDrive/Project-AI/Data/extracted_data/003/manifest.json
✓ 004 → /content/drive/MyDrive/Project-AI/Data/extracted_data/004/manifest.json
✓ 005 → /content/drive/MyDrive/Project-AI/Data/extracted_data/005/manifest.json
✓ 006 → /content/drive/MyDrive/Project-AI/Data/extracted_data/006/manifest.json
✓ 007 → /content/drive/MyDrive/Project-AI/Data/extracted_data/007/manifest.json
✓ 008 → /content/drive/MyDrive/Project-AI/Data/extracted_data/008/manifest.json
✓ 009 → /content/drive/MyDrive/Project-AI/Data/extracted_data/009/manifest.json
✓ 010 → /content/drive/MyDrive/Project-AI/Data/extracted_data/010/manifest.json
✓ 011 → /content/drive/MyDrive/Project-AI/Data/extracted_data/011/manifest.json
✓ 012 → /content/drive/MyDrive/Project-AI

In [None]:
# ==== CONFIG ====
from pathlib import Path

# Paths
DATA_ROOT      = Path("/content/drive/MyDrive/Project-AI/Data")
EXTRACTED_DIR  = DATA_ROOT / "extracted_data"


NGROK_BASE_URL   = "https://tammara-stalkless-portentously.ngrok-free.dev"
OPENAI_BASE_URL  = f"{NGROK_BASE_URL}/v1"
OPENAI_API_KEY   = "lm-studio"
MODEL_NAME       = "qwen2-vl-7b-instruct"


PROMPT_VERSION   = "tropid-json-v3-no-fewshot"

print("CONFIG loaded:")
print("  DATA_ROOT        =", DATA_ROOT)
print("  EXTRACTED_DIR    =", EXTRACTED_DIR)
print("  OPENAI_BASE_URL  =", OPENAI_BASE_URL)
print("  MODEL_NAME       =", MODEL_NAME)
print("  PROMPT_VERSION   =", PROMPT_VERSION)


CONFIG loaded:
  DATA_ROOT        = /content/drive/MyDrive/Project-AI/Data
  EXTRACTED_DIR    = /content/drive/MyDrive/Project-AI/Data/extracted_data
  OPENAI_BASE_URL  = https://tammara-stalkless-portentously.ngrok-free.dev/v1
  MODEL_NAME       = qwen2-vl-7b-instruct
  PROMPT_VERSION   = tropid-json-v3-no-fewshot


In [None]:
from pathlib import Path
from datetime import datetime, timezone
import json, re

DATA_ROOT     = Path("/content/drive/MyDrive/Project-AI/Data")
EXTRACTED_DIR = DATA_ROOT / "extracted_data"
PDF_DATA_DIR  = DATA_ROOT / "pdf-data"

def _now_iso():
    return datetime.now(timezone.utc).isoformat()

def _find_pdf(case_id: str) -> Path | None:
    """Tìm PDF theo cả 2 kiểu tên: 001---*.pdf và 1---*.pdf"""
    n = int(case_id)                       # "001" -> 1
    pats = []
    pats += sorted(PDF_DATA_DIR.glob(f"{n:03d}---*.pdf"))  # 001---*.pdf
    pats += sorted(PDF_DATA_DIR.glob(f"{n}---*.pdf"))      # 1---*.pdf
    return pats[0] if pats else None

def _title_from_pdf_name(pdf: Path) -> str:
    stem = pdf.stem
    stem = re.sub(r"^\d+\s*---\s*", "", stem)
    stem = stem.replace("_", " ")
    stem = re.sub(r"-{2,}", "-", stem).strip("- _")
    stem = stem.replace("-", " ")
    stem = re.sub(r"\s{2,}", " ", stem).strip()
    return stem

BAD_TITLE_PATTERNS = [
    r"^=+\s*page\s*\d+\s*=+$",
    r"^\s*case\s*\d+(\s*[:\-].*)?$",
    r"^[\-\s_]+$",
]

def _looks_bad_title(title: str) -> bool:
    t = title.strip()
    for pat in BAD_TITLE_PATTERNS:
        if re.match(pat, t, flags=re.I):
            return True
    return False

patched = 0
for case_dir in sorted([p for p in EXTRACTED_DIR.iterdir() if p.is_dir() and p.name.isdigit()]):
    mpath = case_dir / "manifest.json"
    if not mpath.exists():
        continue

    m = json.loads(mpath.read_text(encoding="utf-8"))
    cid = case_dir.name


    pdf_path = None
    if m.get("source_pdf"):
        p = Path(m["source_pdf"])
        if p.exists():
            pdf_path = p
    if pdf_path is None:
        pdf = _find_pdf(cid)
        if pdf:
            pdf_path = pdf.resolve()
            m["source_pdf"] = str(pdf_path)


    if pdf_path:
        new_title = _title_from_pdf_name(Path(pdf_path))
        cur_title = m.get("case_title")
        if (cur_title is None) or _looks_bad_title(str(cur_title)):
            m["case_title"] = new_title


    for k in ("pages_dir","images_dir","text_dir"):
        if m.get(k):
            m[k] = str(Path(m[k]).resolve())
    for k in ("page_files","image_files","text_files"):
        if isinstance(m.get(k), list):
            m[k] = [str(Path(x).resolve()) for x in m[k]]
    m["updated_at"] = _now_iso()

    mpath.write_text(json.dumps(m, ensure_ascii=False, indent=2), encoding="utf-8")
    patched += 1
    print(f"[OK] {cid}: title = {m.get('case_title')} | pdf = {m.get('source_pdf')}")

print(f"Patched {patched} manifest(s). ✅")


[OK] 001: title = A 20 Year Old Woman from Sudan With Fever 2022 Clinical Cases in Tropi | pdf = /content/drive/MyDrive/Project-AI/Data/pdf-data/1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.pdf
[OK] 002: title = A 7 Year Old Girl from Peru With a Chron 2022 Clinical Cases in Tropical | pdf = /content/drive/MyDrive/Project-AI/Data/pdf-data/2---A-7-Year-Old-Girl-from-Peru-With-a-Chron_2022_Clinical-Cases-in-Tropical.pdf
[OK] 003: title = A 26 Year Old Woman from Malawi with Headache 2022 Clinical Cases in Tr | pdf = /content/drive/MyDrive/Project-AI/Data/pdf-data/3---A-26-Year-Old-Woman-from-Malawi-with-Headache-_2022_Clinical-Cases-in-Tr.pdf
[OK] 004: title = A 4 Year Old Girl from Uganda in a 2022 Clinical Cases in Tropical Medi | pdf = /content/drive/MyDrive/Project-AI/Data/pdf-data/4---A-4-Year-Old-Girl-from-Uganda-in-a-_2022_Clinical-Cases-in-Tropical-Medi.pdf
[OK] 005: title = A 4 Year Old Boy from Laos With a Lesion o 2022 Clinical Cases in Tropic |

# Chunk, embedding and upsert

In [9]:
!pip -q install "qdrant-client>=1.7.3,<2" sentence-transformers "open-clip-torch>=2.24.0" pillow tqdm


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/337.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.3/337.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, json, re, uuid, math
from pathlib import Path
from typing import List, Dict, Any, Iterable
from tqdm import tqdm
from PIL import Image

from qdrant_client import QdrantClient
from qdrant_client.http import models as qm

import torch
from sentence_transformers import SentenceTransformer, util as sutil
import open_clip

# ====== CONFIG ======
DATA_ROOT      = Path("/content/drive/MyDrive/Project-AI/Data")
EXTRACTED_DIR  = DATA_ROOT / "extracted_data"


QDRANT_URL     = os.getenv("QDRANT_URL", "http://165.22.56.15:6333")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "")

TEXT_MODEL_NAME = "BAAI/bge-m3"
# Image encoder (for cases_image) – CLIP
CLIP_BACKBONE   = "ViT-L-14"
CLIP_PRETRAINED = "laion2b_s32b_b82k"

# Chunking
CHUNK_SIZE   = 900
CHUNK_OVERLAP = 150

# ====== Load encoders ======
device = "cuda" if torch.cuda.is_available() else "cpu"

text_model = SentenceTransformer(TEXT_MODEL_NAME, device=device)

with torch.no_grad():
    _probe = text_model.encode(["probe"], convert_to_tensor=True, normalize_embeddings=True)
TEXT_DIM = _probe.shape[-1]

clip_model, _, clip_preproc = open_clip.create_model_and_transforms(
    CLIP_BACKBONE, pretrained=CLIP_PRETRAINED, device=device
)
clip_tokenizer = open_clip.get_tokenizer(CLIP_BACKBONE)
with torch.no_grad():
    txt = clip_tokenizer(["probe"]).to(device)
    _ = clip_model.encode_text(txt)
    IMG_DIM = _.shape[-1]

# ====== Qdrant client ======
qc = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY or None)

def ensure_collection(name: str, dim: int):
    if name not in [c.name for c in qc.get_collections().collections]:
        qc.recreate_collection(
            collection_name=name,
            vectors_config=qm.VectorParams(size=dim, distance=qm.Distance.COSINE),
        )

# ====== Utils ======
def iter_manifests(root: Path) -> Iterable[Path]:
    for c in sorted([p for p in root.iterdir() if p.is_dir() and p.name.isdigit()]):
        mf = c / "manifest.json"
        if mf.exists():
            yield mf

def clean_spaces(s: str) -> str:
    return re.sub(r"\s{2,}", " ", s).strip()

def chunk_text(s: str, size: int=CHUNK_SIZE, overlap: int=CHUNK_OVERLAP) -> List[str]:
    s = clean_spaces(s)
    if len(s) <= size:
        return [s] if s else []
    chunks, i = [], 0
    while i < len(s):
        chunks.append(s[i:i+size])
        if i + size >= len(s): break
        i += size - overlap
    return chunks

def read_raw_text(txt_path: Path) -> str:
    return txt_path.read_text(encoding="utf-8", errors="ignore")

def embed_text_batch(sentences: List[str]) -> List[List[float]]:
    emb = text_model.encode(sentences, normalize_embeddings=True, convert_to_numpy=True, batch_size=64)
    return emb.tolist()

@torch.no_grad()
def embed_images(paths: List[Path]) -> List[List[float]]:
    vecs = []
    for p in paths:
        img = Image.open(p).convert("RGB")
        img = clip_preproc(img).unsqueeze(0).to(device)
        v = clip_model.encode_image(img)
        v = v / v.norm(dim=-1, keepdim=True)
        vecs.append(v.squeeze(0).cpu().tolist())
    return vecs

@torch.no_grad()
def embed_text_for_clip(queries: List[str]) -> List[List[float]]:
    toks = clip_tokenizer(queries).to(device)
    v = clip_model.encode_text(toks)
    v = v / v.norm(dim=-1, keepdim=True)
    return v.cpu().tolist()


In [None]:

ensure_collection("cases_text",  TEXT_DIM)
ensure_collection("cases_image", IMG_DIM)

text_points, image_points = [], []

for mf in iter_manifests(EXTRACTED_DIR):
    m = json.loads(mf.read_text(encoding="utf-8"))
    cid = m.get("case_id")
    title = m.get("case_title")
    src_pdf = m.get("source_pdf")

    # ===== TEXT CHUNKS =====
    txt_files = m.get("text_files") or []
    if txt_files:
        raw = read_raw_text(Path(txt_files[0]))
        chunks = chunk_text(raw)
        if chunks:
            vecs = embed_text_batch(chunks)
            for idx, (chunk, vec) in enumerate(zip(chunks, vecs)):
                text_points.append(
                    qm.PointStruct(
                        id=str(uuid.uuid4()),
                        vector=vec,
                        payload={
                            "case_id": cid,
                            "kind": "text",
                            "title": title,
                            "source_pdf": src_pdf,
                            "chunk_id": idx,
                            "text": chunk,
                            "text_path": str(txt_files[0]),
                            "manifest_path": str(mf),
                        },
                    )
                )

    # ===== DISEASE IMAGES =====
    img_files = [Path(p) for p in (m.get("image_files") or []) if Path(p).exists()]
    if img_files:
        ivecs = embed_images(img_files)
        for p, vec in zip(img_files, ivecs):
            image_points.append(
                qm.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vec,
                    payload={
                        "case_id": cid,
                        "kind": "image",
                        "title": title,
                        "source_pdf": src_pdf,
                        "image_path": str(p),
                        "manifest_path": str(mf),
                    },
                )
            )


def upsert_in_batches(collection: str, points: List[qm.PointStruct], batch=256):
    for i in tqdm(range(0, len(points), batch), desc=f"Upsert {collection}"):
        qc.upsert(collection, points=points[i:i+batch])

if text_points:
    upsert_in_batches("cases_text", text_points)
if image_points:
    upsert_in_batches("cases_image", image_points)

print(f"Done. text_points={len(text_points)}, image_points={len(image_points)}")


  qc.recreate_collection(
Upsert cases_text: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]
Upsert cases_image: 100%|██████████| 1/1 [00:00<00:00,  2.81it/s]

Done. text_points=1108, image_points=147





In [None]:
from qdrant_client.http import models as qm

print("collections:", [c.name for c in qc.get_collections().collections])

info_text  = qc.get_collection("cases_text")
info_image = qc.get_collection("cases_image")
print("cases_text vectors:", info_text.points_count)
print("cases_image vectors:", info_image.points_count)


collections: ['cases_image', 'tropical_cases_chunks', 'cases_text']
cases_text vectors: 1108
cases_image vectors: 147


In [None]:
query = "acute fever with hemorrhage after travel to Uganda"
qvec  = embed_text_batch([query])[0]
hits  = qc.search("cases_text", query_vector=qvec, limit=5, with_payload=True)
for h in hits:
    print(f"{h.score:.3f}", h.payload["case_id"], h.payload.get("title"))
    print(h.payload["text"][:200], "...\n")


  hits  = qc.search("cases_text", query_vector=qvec, limit=5, with_payload=True)


0.660 053 A 24 Year Old Woman from Uganda With Fe 2022 Clinical Cases in Tropical
=== PAGE 1 ===
53
A 24-Year-Old Woman from
Uganda With Fever and Shock BENJAMIN JEFFS Clinical Presentation History A 24-year-old woman presents to a small hospital in rural
Uganda because of a 5-day  ...

0.639 001 A 20 Year Old Woman from Sudan With Fever 2022 Clinical Cases in Tropi
=== PAGE 1 ===
1
A 20-Year-Old Woman from Sudan
With Fever, Haemorrhage and Shock DANIEL G. BAUSCH Clinical Presentation History A 20-year-old housewife presents to a hospital in northern
Uganda with  ...

0.624 053 A 24 Year Old Woman from Uganda With Fe 2022 Clinical Cases in Tropical
ide or a rapid diagnostic test. However, the prevalence of Plasmodium falciparum parasitaemia in Uganda is high and a positive slide
would not rule out VHF. 147 === PAGE 2 ===
148 CHAPTER 53
A 24-Year ...

0.611 022 32 Year Old Woman from Nigeria With Jaund 2022 Clinical Cases in Tropic
=== PAGE 1 ===
22
32-Year-Old Woman from Nigeria With
J

In [None]:
q = "skin hemorrhagic rash"
qvec = embed_text_for_clip([q])[0]
hits = qc.search("cases_image", query_vector=qvec, limit=5, with_payload=True)
for h in hits:
    print(f"{h.score:.3f}", h.payload["case_id"], h.payload.get("title"), "→", h.payload["image_path"])


  hits = qc.search("cases_image", query_vector=qvec, limit=5, with_payload=True)


0.306 034 A 35 Year Old Male Farmer from Peru With a Chro 2022 Clinical Cases in → /content/drive/MyDrive/Project-AI/Data/extracted_data/034/images/034_p1_fig_1.png
0.300 085 A 55 Year Old Female Pig Farmer from Vietnam 2022 Clinical Cases in Tr → /content/drive/MyDrive/Project-AI/Data/extracted_data/085/images/085_p1_fig_2.png
0.296 052 A 56 Year Old Man from Peru With Prolonged 2022 Clinical Cases in Trop → /content/drive/MyDrive/Project-AI/Data/extracted_data/052/images/052_p2_fig_2.png
0.295 085 A 55 Year Old Female Pig Farmer from Vietnam 2022 Clinical Cases in Tr → /content/drive/MyDrive/Project-AI/Data/extracted_data/085/images/085_p1_fig_1.png
0.295 078 A 42 Year Old British Man Living in Malawi 2022 Clinical Cases in Tropi → /content/drive/MyDrive/Project-AI/Data/extracted_data/078/images/078_p2_fig_1.png


In [None]:
query = "fever with hemorrhage and travel to Uganda"
qvec  = embed_text_batch([query])[0]
hits  = qc.search(
    collection_name="cases_text",
    query_vector=qvec,
    limit=5,
    with_payload=True
)
for h in hits:
    print(h.score, h.payload["case_id"], h.payload.get("title"))
    print(h.payload["text"][:220], "...\n")


  hits  = qc.search(


0.6684202 053 A 24 Year Old Woman from Uganda With Fe 2022 Clinical Cases in Tropical
=== PAGE 1 ===
53
A 24-Year-Old Woman from
Uganda With Fever and Shock BENJAMIN JEFFS Clinical Presentation History A 24-year-old woman presents to a small hospital in rural
Uganda because of a 5-day history of a febrile ...

0.6555009 053 A 24 Year Old Woman from Uganda With Fe 2022 Clinical Cases in Tropical
ide or a rapid diagnostic test. However, the prevalence of Plasmodium falciparum parasitaemia in Uganda is high and a positive slide
would not rule out VHF. 147 === PAGE 2 ===
148 CHAPTER 53
A 24-Year-Old Woman from Ugan ...

0.63942707 001 A 20 Year Old Woman from Sudan With Fever 2022 Clinical Cases in Tropi
=== PAGE 1 ===
1
A 20-Year-Old Woman from Sudan
With Fever, Haemorrhage and Shock DANIEL G. BAUSCH Clinical Presentation History A 20-year-old housewife presents to a hospital in northern
Uganda with a 2-day history of f ...

0.6221497 053 A 24 Year Old Woman from Uganda With Fe 2022 Clini

# extract to 11 fields json using Gemini

In [3]:
# --- BƯỚC 1: CÀI ĐẶT VÀ XÁC THỰC ---
print("Đang cài đặt thư viện Vertex AI (google-cloud-aiplatform)...")
!pip install -q google-cloud-aiplatform

print("\nĐang cài đặt tqdm (thanh tiến trình)...")
!pip install -q tqdm

print("\n--- VUI LÒNG XÁC THỰC TÀI KHOẢN ---")
print("Một cửa sổ popup sẽ hiện lên. Vui lòng chọn tài khoản Google (có $300 credit) và nhấn 'Allow'.")

from google.colab import auth
auth.authenticate_user()

print("\nĐã xác thực thành công!")

Đang cài đặt thư viện Vertex AI (google-cloud-aiplatform)...

Đang cài đặt tqdm (thanh tiến trình)...

--- VUI LÒNG XÁC THỰC TÀI KHOẢN ---
Một cửa sổ popup sẽ hiện lên. Vui lòng chọn tài khoản Google (có $300 credit) và nhấn 'Allow'.

Đã xác thực thành công!


In [8]:
"""
Script Colab (phiên bản Vertex AI) để xử lý PDF và trích xuất JSON.
Script này sử dụng thư viện google-cloud-aiplatform,
tương thích với $300 credit từ Google Cloud.

Sử dụng xác thực của Colab (không cần API Key).

CẬP NHẬT: Đổi tên model từ 'gemini-1.5-flash-001' thành 'gemini-1.5-flash'
để sửa lỗi 404 'Publisher Model... not found'.
"""

import os
import glob
import json
import time
import pathlib
import mimetypes
from tqdm import tqdm

import vertexai
from vertexai.generative_models import GenerativeModel, Part

# ==============================================================================
# 1. CẤU HÌNH DỰ ÁN (PROJECT)
# ==============================================================================

# Lấy từ ảnh 'Screenshot 2025-11-05 at 19.36.22.png'
PROJECT_ID = "gen-lang-client-0264433577"

# Đây là khu vực phổ biến nhất, nếu lỗi, chúng ta có thể đổi
LOCATION = "us-central1"

# Khởi tạo Vertex AI
try:
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    print(f"Vertex AI đã khởi tạo thành công cho Project: {PROJECT_ID} tại {LOCATION}")
except Exception as e:
    print(f"!!! LỖI KHỞI TẠO Vertex AI: {e}")
    print("Có thể bạn cần BẬT 'Vertex AI API' trong Google Cloud Console cho project này.")
    print("Hãy truy cập link này và nhấn ENABLE:")
    print(f"https://console.cloud.google.com/apis/library/aiplatform.googleapis.com?project={PROJECT_ID}")

# Đường dẫn
PDF_DIR = "/content/drive/MyDrive/Project-AI/Data/pdf-data"
JSON_DIR = "/content/drive/MyDrive/Project-AI/Data/structure-data-json"

# 2. Prompt từ Mentor của bạn (Giữ nguyên)
SYSTEM_PROMPT = """# TropID Big-Chunk Case → JSON Extractor (Prompt)
## System
You are **TropID-Extractor**, an expert clinical information extractor for **tropical & infectious diseases**.
Your task: **read a free-text clinical case** and return a **single JSON object** where **each section is one coherent full-text block** (no bulletizing into tiny subfields).
If a section is not present, set it to `null`. **Do not guess.** **Output ONLY valid JSON** — no preamble, no commentary.
### Formatting & Safety Rules
- **One JSON object only.**
- **Full-text blocks:** Each field below must be a **cohesive paragraph** (or short multi-sentence block** stitched from the case text; paraphrase minimally, preserve clinical meaning, and **do not invent** missing details.
- **Attribution discipline:** Prefer exact phrases from the source for key facts (fever pattern, exposures, test names, titers) but keep the prose readable.
- **Units & names:** Keep units and proper names as written (°C/°F, NS1, thick smear, RDT, species, titers/CT values).
- **Privacy:** Exclude any direct identifiers if present.
- **Uncertainty:** If the case explicitly says something is “unclear/unknown,” include that wording.
- **Final diagnosis:** Write a concise paragraph that **states the diagnosis, the causative agent if given, and the evidence** (labs/imaging/epidemiology/response to therapy).
- **Disease name (short):** After `final_diagnosis`, fill `disease_name_short` with the **best disease name only** (e.g., “Dengue fever”, “Falciparum malaria”, “Scrub typhus”).
---
Return **only** this JSON schema (exact keys, same order):
```json
{
  "patient_information": null,
  "chief_complaint": null,
  "history_of_present_illness": null,
  "exposure_and_epidemiology": null,
  "vitals": null,
  "physical_exam": null,
  "labs_and_diagnostics": null,
  "differential_diagnosis": null,
  "management_and_clinical_course": null,
  "final_diagnosis": null,
  "disease_name_short": null
}
```
### Field guidance (concise)
- **patient_information**: age/sex; relevant comorbidities/immunosuppression; vaccination/allergy info if stated.
- **chief_complaint**: one-line problem + duration.
- **history_of_present_illness**: timeline, key symptoms, pertinent negatives, severity pattern.
- **exposure_and_epidemiology**: residence/travel (place/setting, dates if present), vectors (mosquito/tick), animals, water/food risks, contacts, season/outbreak context, occupation.
- **vitals**: all reported vital signs as text (fever values, BP, HR, RR, SpO₂).
- **physical_exam**: salient systems (skin, HEENT, chest, abdo, neuro, lymph, etc.).
- **labs_and_diagnostics**: CBC trends, key chem/coag, inflammatory markers, microbiology/serology/PCR (assay + result + titer/CT if present), malaria tests, imaging highlights.
- **differential_diagnosis**: succinct narrative of the main alternatives considered, with one-sentence justification for/against each.
- **management_and_clinical_course**: antimicrobials (drug/dose if given), supportive care, procedures; response, complications, outcome.
- **final_diagnosis**: 3–5 sentences: explicit disease name ± causative agent, confirmation method (e.g., NS1+, thick smear species, PCR/serology), and why alternatives were ruled out.
- **disease_name_short**: the disease name only (no agent, no method).
---
## Few-shot (mini)
**Input (excerpt)** “32-year-old woman returned from rural Thailand 7 days ago. High fever to 39.5 °C, severe myalgias, retro-orbital pain, and maculopapular rash. Platelets 72×10^9/L, leukopenia. Dengue NS1 antigen positive on day 2. Managed with fluids and acetaminophen; no antibiotics. Recovered by day 6.”
**Expected Output**
```json
{
  "patient_information": "32-year-old woman with no reported comorbidities or immunosuppression.",
  "chief_complaint": "Acute high fever with rash for approximately one week.",
  "history_of_present_illness": "Abrupt febrile illness reaching 39.5 °C with severe myalgias and retro-orbital pain, followed by a maculopapular rash. Symptoms began shortly after travel and evolved over 7 days.",
  "exposure_and_epidemiology": "Recent travel to rural Thailand one week prior; mosquito exposure is implied by the endemic setting. No animal or water/food exposures or sick contacts were reported.",
  "vitals": "Maximum recorded temperature 39.5 °C; other vital signs not reported.",
  "physical_exam": "Maculopapular rash and retro-orbital discomfort; no focal neurologic deficits or respiratory distress described.",
  "labs_and_diagnostics": "CBC notable for thrombocytopenia (platelets 72×10^9/L) and leukopenia. Dengue NS1 antigen positive on day 2. No imaging was reported.",
  "differential_diagnosis": "Considered dengue fever versus chikungunya and malaria. Dengue supported by NS1 positivity, thrombocytopenia, and travel to an endemic area. Malaria less likely given dengue confirmation and no malaria-specific findings reported.",
  "management_and_clinical_course": "Supportive care with oral/IV fluids and acetaminophen; no antibiotics. Clinical improvement with full recovery by day 6; no complications documented.",
  "final_diagnosis": "Dengue fever due to dengue virus, confirmed by a positive NS1 antigen test. The diagnosis aligns with acute febrile illness, thrombocytopenia, retro-orbital pain, and compatible travel exposure. Malaria and chikungunya were considered less likely given confirmatory testing and symptom pattern.",
  "disease_name_short": "Dengue fever"
}
```
---
## User
Extract the following fields as **full-text blocks** from this clinical case:
"""

# 3. Cấu hình mô hình Gemini
# SỬA LỖI: Đổi từ 'gemini-1.5-flash-001' thành 'gemini-1.5-flash'
# Đây là phiên bản "mới nhất" thay vì "ổn định"
model = GenerativeModel(
    "gemini-2.5-pro",
    system_instruction=SYSTEM_PROMPT,
)

# Cấu hình để bắt buộc trả về JSON
generation_config = {
    "response_mime_type": "application/json",
}

def process_all_pdfs_vertex():
    """
    Hàm chính để lặp qua tất cả các file PDF, xử lý và lưu JSON
    sử dụng Vertex AI.
    """
    print(f"\nBắt đầu quá trình trích xuất (Vertex AI)...")
    print(f"Folder PDF nguồn: {PDF_DIR}")
    print(f"Folder JSON đích: {JSON_DIR}")

    pathlib.Path(JSON_DIR).mkdir(parents=True, exist_ok=True)

    pdf_files = glob.glob(os.path.join(PDF_DIR, "*.pdf"))

    if not pdf_files:
        print(f"!!! KHÔNG TÌM THẤY FILE PDF NÀO trong: {PDF_DIR}")
        return

    print(f"Tìm thấy tổng cộng {len(pdf_files)} file PDF.")

    pending_files = []
    for pdf_path in pdf_files:
        pdf_filename = os.path.basename(pdf_path)
        json_filename = os.path.splitext(pdf_filename)[0] + ".json"
        json_path = os.path.join(JSON_DIR, json_filename)
        if not os.path.exists(json_path):
            pending_files.append(pdf_path)

    print(f"Trong đó, {len(pending_files)} file chưa được xử lý. Bắt đầu xử lý...")

    for pdf_path in tqdm(pending_files, desc="Đang xử lý PDF (Vertex)"):
        pdf_filename = os.path.basename(pdf_path)
        json_filename = os.path.splitext(pdf_filename)[0] + ".json"
        json_path = os.path.join(JSON_DIR, json_filename)

        try:
            tqdm.write(f"\nĐang xử lý: {pdf_filename}")

            # 1. Đọc file PDF từ Google Drive
            tqdm.write("...Đang đọc file PDF...")
            pdf_bytes = pathlib.Path(pdf_path).read_bytes()

            # 2. Chuẩn bị "Part" cho API
            # Vertex AI nhận trực tiếp file bytes, không cần "upload"
            pdf_file_part = Part.from_data(
                data=pdf_bytes,
                mime_type="application/pdf"
            )

            # Nội dung prompt
            # (Chúng ta không cần thêm prompt "User" vì đã có trong System Prompt)
            contents = [pdf_file_part]

            # 3. Gọi API Gemini
            tqdm.write("...Đang trích xuất (Gemini đang đọc)...")
            response = model.generate_content(
                contents,
                generation_config=generation_config
            )

            # 4. Lưu kết quả JSON
            json_output = response.text
            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(json_output)

            tqdm.write(f"==> THÀNH CÔNG: Đã lưu {json_filename}")

        except Exception as e:
            tqdm.write(f"!!! THẤT BẠI khi xử lý {pdf_filename}: {e}")

        finally:
            # Nghỉ 1 giây để tránh bị rate limit (lỗi 429)
            time.sleep(1)

# Chạy hàm chính
if __name__ == "__main__":
    process_all_pdfs_vertex()

Vertex AI đã khởi tạo thành công cho Project: gen-lang-client-0264433577 tại us-central1

Bắt đầu quá trình trích xuất (Vertex AI)...
Folder PDF nguồn: /content/drive/MyDrive/Project-AI/Data/pdf-data
Folder JSON đích: /content/drive/MyDrive/Project-AI/Data/structure-data-json
Tìm thấy tổng cộng 93 file PDF.
Trong đó, 93 file chưa được xử lý. Bắt đầu xử lý...


Đang xử lý PDF (Vertex):   0%|          | 0/93 [00:00<?, ?it/s]


Đang xử lý: 1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   0%|          | 0/93 [00:14<?, ?it/s]

==> THÀNH CÔNG: Đã lưu 1---A-20-Year-Old-Woman-from-Sudan-With-Fever--_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):   1%|          | 1/93 [00:15<23:08, 15.09s/it]


Đang xử lý: 3---A-26-Year-Old-Woman-from-Malawi-with-Headache-_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   1%|          | 1/93 [00:38<23:08, 15.09s/it]

==> THÀNH CÔNG: Đã lưu 3---A-26-Year-Old-Woman-from-Malawi-with-Headache-_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):   2%|▏         | 2/93 [00:39<31:10, 20.56s/it]


Đang xử lý: 5---A-4-Year-Old-Boy-from-Laos-With-a-Lesion-o_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   2%|▏         | 2/93 [01:00<31:10, 20.56s/it]

==> THÀNH CÔNG: Đã lưu 5---A-4-Year-Old-Boy-from-Laos-With-a-Lesion-o_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):   3%|▎         | 3/93 [01:01<31:39, 21.11s/it]


Đang xử lý: 2---A-7-Year-Old-Girl-from-Peru-With-a-Chron_2022_Clinical-Cases-in-Tropical.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   3%|▎         | 3/93 [01:24<31:39, 21.11s/it]

==> THÀNH CÔNG: Đã lưu 2---A-7-Year-Old-Girl-from-Peru-With-a-Chron_2022_Clinical-Cases-in-Tropical.json


Đang xử lý PDF (Vertex):   4%|▍         | 4/93 [01:25<33:13, 22.40s/it]


Đang xử lý: 4---A-4-Year-Old-Girl-from-Uganda-in-a-_2022_Clinical-Cases-in-Tropical-Medi.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   4%|▍         | 4/93 [01:51<33:13, 22.40s/it]

==> THÀNH CÔNG: Đã lưu 4---A-4-Year-Old-Girl-from-Uganda-in-a-_2022_Clinical-Cases-in-Tropical-Medi.json


Đang xử lý PDF (Vertex):   5%|▌         | 5/93 [01:52<35:15, 24.04s/it]


Đang xử lý: 6---A-36-Year-Old-Male-Traveller-Returning-from-B_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   5%|▌         | 5/93 [02:16<35:15, 24.04s/it]

==> THÀNH CÔNG: Đã lưu 6---A-36-Year-Old-Male-Traveller-Returning-from-B_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):   6%|▋         | 6/93 [02:17<35:05, 24.20s/it]


Đang xử lý: 9---A-52-Year-Old-Man-from-Vietnam-With-Evo_2022_Clinical-Cases-in-Tropical-.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   6%|▋         | 6/93 [02:40<35:05, 24.20s/it]

==> THÀNH CÔNG: Đã lưu 9---A-52-Year-Old-Man-from-Vietnam-With-Evo_2022_Clinical-Cases-in-Tropical-.json


Đang xử lý PDF (Vertex):   8%|▊         | 7/93 [02:41<34:51, 24.32s/it]


Đang xử lý: 7---A-28-Year-Old-Male-Fisherman-from-Malawi-Wi_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   8%|▊         | 7/93 [03:07<34:51, 24.32s/it]

==> THÀNH CÔNG: Đã lưu 7---A-28-Year-Old-Male-Fisherman-from-Malawi-Wi_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):   9%|▊         | 8/93 [03:08<35:36, 25.13s/it]


Đang xử lý: 8---A-26-Year-Old-Female-Traveller-Returning-from_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):   9%|▊         | 8/93 [03:26<35:36, 25.13s/it]

==> THÀNH CÔNG: Đã lưu 8---A-26-Year-Old-Female-Traveller-Returning-from_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  10%|▉         | 9/93 [03:27<32:32, 23.24s/it]


Đang xử lý: 10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  10%|▉         | 9/93 [03:47<32:32, 23.24s/it]

==> THÀNH CÔNG: Đã lưu 10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-.json


Đang xử lý PDF (Vertex):  11%|█         | 10/93 [03:48<31:16, 22.61s/it]


Đang xử lý: 11---A-45-Year-Old-Male-Security-Guard-from-Malawi-_2022_Clinical-Cases-in-T.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  11%|█         | 10/93 [04:20<31:16, 22.61s/it]

==> THÀNH CÔNG: Đã lưu 11---A-45-Year-Old-Male-Security-Guard-from-Malawi-_2022_Clinical-Cases-in-T.json


Đang xử lý PDF (Vertex):  12%|█▏        | 11/93 [04:21<34:55, 25.55s/it]


Đang xử lý: 13---A-16-Year-Old-Girl-from-Malawi-With-Fever_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  12%|█▏        | 11/93 [04:42<34:55, 25.55s/it]

==> THÀNH CÔNG: Đã lưu 13---A-16-Year-Old-Girl-from-Malawi-With-Fever_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  13%|█▎        | 12/93 [04:43<33:09, 24.57s/it]


Đang xử lý: 12---A-29-Year-Old-Man-from-The-Gambia-With-G_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  13%|█▎        | 12/93 [05:02<33:09, 24.57s/it]

==> THÀNH CÔNG: Đã lưu 12---A-29-Year-Old-Man-from-The-Gambia-With-G_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  14%|█▍        | 13/93 [05:03<31:04, 23.31s/it]


Đang xử lý: 14---A-22-Year-Old-Woman-from-Bangladesh-With-P_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  14%|█▍        | 13/93 [05:23<31:04, 23.31s/it]

==> THÀNH CÔNG: Đã lưu 14---A-22-Year-Old-Woman-from-Bangladesh-With-P_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  15%|█▌        | 14/93 [05:24<29:34, 22.47s/it]


Đang xử lý: 16---A-25-Year-Old-Female-School-Teacher-from-Malaw_2022_Clinical-Cases-in-T.pdf
...Đang đọc file PDF...
...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  15%|█▌        | 14/93 [05:52<29:34, 22.47s/it]

==> THÀNH CÔNG: Đã lưu 16---A-25-Year-Old-Female-School-Teacher-from-Malaw_2022_Clinical-Cases-in-T.json


Đang xử lý PDF (Vertex):  16%|█▌        | 15/93 [05:53<31:54, 24.54s/it]


Đang xử lý: 15---A-3-Year-Old-Boy-from-Laos-With-Right-Sup_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  16%|█▌        | 15/93 [05:54<31:54, 24.54s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  16%|█▌        | 15/93 [06:21<31:54, 24.54s/it]

==> THÀNH CÔNG: Đã lưu 15---A-3-Year-Old-Boy-from-Laos-With-Right-Sup_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  17%|█▋        | 16/93 [06:22<33:02, 25.74s/it]


Đang xử lý: 17---A-34-Year-Old-Man-from-Thailand-With-Feve_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  17%|█▋        | 16/93 [06:22<33:02, 25.74s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  17%|█▋        | 16/93 [06:41<33:02, 25.74s/it]

==> THÀNH CÔNG: Đã lưu 17---A-34-Year-Old-Man-from-Thailand-With-Feve_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  18%|█▊        | 17/93 [06:42<30:38, 24.19s/it]


Đang xử lý: 20---A-43-Year-Old-Male-Traveller-Returning-from-M_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  18%|█▊        | 17/93 [06:43<30:38, 24.19s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  18%|█▊        | 17/93 [07:02<30:38, 24.19s/it]

==> THÀNH CÔNG: Đã lưu 20---A-43-Year-Old-Male-Traveller-Returning-from-M_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  19%|█▉        | 18/93 [07:03<29:04, 23.27s/it]


Đang xử lý: 21---A-35-Year-Old-American-Man-With-Fatigue-_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  19%|█▉        | 18/93 [07:04<29:04, 23.27s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  19%|█▉        | 18/93 [07:24<29:04, 23.27s/it]

==> THÀNH CÔNG: Đã lưu 21---A-35-Year-Old-American-Man-With-Fatigue-_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  20%|██        | 19/93 [07:25<27:57, 22.67s/it]


Đang xử lý: 19---A-40-Year-Old-Man-from-Togo-With-Subcutaneou_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  20%|██        | 19/93 [07:25<27:57, 22.67s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  20%|██        | 19/93 [07:53<27:57, 22.67s/it]

==> THÀNH CÔNG: Đã lưu 19---A-40-Year-Old-Man-from-Togo-With-Subcutaneou_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  22%|██▏       | 20/93 [07:54<29:56, 24.61s/it]


Đang xử lý: 23---A-31-Year-Old-HIV-Positive-Business-Traveller-W_2022_Clinical-Cases-in-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  22%|██▏       | 20/93 [07:54<29:56, 24.61s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  22%|██▏       | 20/93 [08:28<29:56, 24.61s/it]

==> THÀNH CÔNG: Đã lưu 23---A-31-Year-Old-HIV-Positive-Business-Traveller-W_2022_Clinical-Cases-in-.json


Đang xử lý PDF (Vertex):  23%|██▎       | 21/93 [08:30<33:32, 27.95s/it]


Đang xử lý: 24---A-14-Year-Old-Boy-from-Rural-Tanzania-With_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  23%|██▎       | 21/93 [08:30<33:32, 27.95s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  23%|██▎       | 21/93 [08:51<33:32, 27.95s/it]

==> THÀNH CÔNG: Đã lưu 24---A-14-Year-Old-Boy-from-Rural-Tanzania-With_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  24%|██▎       | 22/93 [08:52<31:05, 26.28s/it]


Đang xử lý: 25---A-72-Year-Old-Male-Farmer-from-Laos-With-Exte_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  24%|██▎       | 22/93 [08:52<31:05, 26.28s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  24%|██▎       | 22/93 [09:16<31:05, 26.28s/it]

==> THÀNH CÔNG: Đã lưu 25---A-72-Year-Old-Male-Farmer-from-Laos-With-Exte_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  25%|██▍       | 23/93 [09:17<30:09, 25.85s/it]


Đang xử lý: 26---A-14-Year-Old-Boy-from-Malawi-Who-Has-Bee_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  25%|██▍       | 23/93 [09:17<30:09, 25.85s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  25%|██▍       | 23/93 [09:47<30:09, 25.85s/it]

==> THÀNH CÔNG: Đã lưu 26---A-14-Year-Old-Boy-from-Malawi-Who-Has-Bee_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  26%|██▌       | 24/93 [09:48<31:30, 27.40s/it]


Đang xử lý: 27---A-16-Year-Old-Boy-from-Sri-Lanka-With-Fever_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  26%|██▌       | 24/93 [09:48<31:30, 27.40s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  26%|██▌       | 24/93 [10:12<31:30, 27.40s/it]

==> THÀNH CÔNG: Đã lưu 27---A-16-Year-Old-Boy-from-Sri-Lanka-With-Fever_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  27%|██▋       | 25/93 [10:13<30:15, 26.69s/it]


Đang xử lý: 29---A-35-Year-Old-Woman-from-Malawi-With-Feve_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  27%|██▋       | 25/93 [10:13<30:15, 26.69s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  27%|██▋       | 25/93 [10:37<30:15, 26.69s/it]

==> THÀNH CÔNG: Đã lưu 29---A-35-Year-Old-Woman-from-Malawi-With-Feve_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  28%|██▊       | 26/93 [10:38<29:16, 26.21s/it]


Đang xử lý: 28---A-67-Year-Old-Female-Expatriate-Living-in-Came_2022_Clinical-Cases-in-T.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  28%|██▊       | 26/93 [10:38<29:16, 26.21s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  28%|██▊       | 26/93 [11:02<29:16, 26.21s/it]

==> THÀNH CÔNG: Đã lưu 28---A-67-Year-Old-Female-Expatriate-Living-in-Came_2022_Clinical-Cases-in-T.json


Đang xử lý PDF (Vertex):  29%|██▉       | 27/93 [11:03<28:21, 25.79s/it]


Đang xử lý: 22---32-Year-Old-Woman-from-Nigeria-With-Jaund_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  29%|██▉       | 27/93 [11:03<28:21, 25.79s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  29%|██▉       | 27/93 [11:27<28:21, 25.79s/it]

==> THÀNH CÔNG: Đã lưu 22---32-Year-Old-Woman-from-Nigeria-With-Jaund_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  30%|███       | 28/93 [11:28<27:43, 25.60s/it]


Đang xử lý: 30---A-12-Year-Old-Boy-from-Rural-Kenya-With_2022_Clinical-Cases-in-Tropical.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  30%|███       | 28/93 [11:28<27:43, 25.60s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  30%|███       | 28/93 [11:57<27:43, 25.60s/it]

==> THÀNH CÔNG: Đã lưu 30---A-12-Year-Old-Boy-from-Rural-Kenya-With_2022_Clinical-Cases-in-Tropical.json


Đang xử lý PDF (Vertex):  31%|███       | 29/93 [11:58<28:38, 26.86s/it]


Đang xử lý: 31---A-6-Year-Old-Boy-from-Malawi-With-Fever--Cou_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  31%|███       | 29/93 [11:58<28:38, 26.86s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  31%|███       | 29/93 [12:16<28:38, 26.86s/it]

==> THÀNH CÔNG: Đã lưu 31---A-6-Year-Old-Boy-from-Malawi-With-Fever--Cou_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  32%|███▏      | 30/93 [12:17<25:59, 24.76s/it]


Đang xử lý: 32---A-44-Year-Old-Male-Farmer-from-Laos-With-Di_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  32%|███▏      | 30/93 [12:18<25:59, 24.76s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  32%|███▏      | 30/93 [12:41<25:59, 24.76s/it]

==> THÀNH CÔNG: Đã lưu 32---A-44-Year-Old-Male-Farmer-from-Laos-With-Di_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  33%|███▎      | 31/93 [12:42<25:37, 24.79s/it]


Đang xử lý: 33---A-53-Year-Old-Man-from-Malawi-With-a-C_2022_Clinical-Cases-in-Tropical-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  33%|███▎      | 31/93 [12:43<25:37, 24.79s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  33%|███▎      | 31/93 [13:11<25:37, 24.79s/it]

==> THÀNH CÔNG: Đã lưu 33---A-53-Year-Old-Man-from-Malawi-With-a-C_2022_Clinical-Cases-in-Tropical-.json


Đang xử lý PDF (Vertex):  34%|███▍      | 32/93 [13:12<26:35, 26.15s/it]


Đang xử lý: 34---A-35-Year-Old-Male-Farmer-from-Peru-With-a-Chro_2022_Clinical-Cases-in-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  34%|███▍      | 32/93 [13:12<26:35, 26.15s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  34%|███▍      | 32/93 [13:38<26:35, 26.15s/it]

==> THÀNH CÔNG: Đã lưu 34---A-35-Year-Old-Male-Farmer-from-Peru-With-a-Chro_2022_Clinical-Cases-in-.json


Đang xử lý PDF (Vertex):  35%|███▌      | 33/93 [13:39<26:23, 26.39s/it]


Đang xử lý: 36---A-23-Year-Old-Farmer-from-Myanmar-With-Uni_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  35%|███▌      | 33/93 [13:39<26:23, 26.39s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  35%|███▌      | 33/93 [14:02<26:23, 26.39s/it]

==> THÀNH CÔNG: Đã lưu 36---A-23-Year-Old-Farmer-from-Myanmar-With-Uni_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  37%|███▋      | 34/93 [14:03<25:27, 25.88s/it]


Đang xử lý: 35---A-32-Year-Old-Woman-from-Malawi-With-Heada_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  37%|███▋      | 34/93 [14:04<25:27, 25.88s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  37%|███▋      | 34/93 [14:24<25:27, 25.88s/it]

==> THÀNH CÔNG: Đã lưu 35---A-32-Year-Old-Woman-from-Malawi-With-Heada_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  38%|███▊      | 35/93 [14:25<23:43, 24.54s/it]


Đang xử lý: 38---A-24-Year-Old-Female-Globetrotter-With-Strange_2022_Clinical-Cases-in-T.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  38%|███▊      | 35/93 [14:25<23:43, 24.54s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  38%|███▊      | 35/93 [14:46<23:43, 24.54s/it]

==> THÀNH CÔNG: Đã lưu 38---A-24-Year-Old-Female-Globetrotter-With-Strange_2022_Clinical-Cases-in-T.json


Đang xử lý PDF (Vertex):  39%|███▊      | 36/93 [14:47<22:36, 23.79s/it]


Đang xử lý: 40---A-62-Year-Old-Woman-from-Ethiopia-With-D_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  39%|███▊      | 36/93 [14:47<22:36, 23.79s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  39%|███▊      | 36/93 [15:11<22:36, 23.79s/it]

==> THÀNH CÔNG: Đã lưu 40---A-62-Year-Old-Woman-from-Ethiopia-With-D_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  40%|███▉      | 37/93 [15:12<22:36, 24.23s/it]


Đang xử lý: 39---A-30-Year-Old-Male-Chinese-Trader-With-_2022_Clinical-Cases-in-Tropical.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  40%|███▉      | 37/93 [15:13<22:36, 24.23s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  40%|███▉      | 37/93 [15:39<22:36, 24.23s/it]

==> THÀNH CÔNG: Đã lưu 39---A-30-Year-Old-Male-Chinese-Trader-With-_2022_Clinical-Cases-in-Tropical.json


Đang xử lý PDF (Vertex):  41%|████      | 38/93 [15:40<23:17, 25.41s/it]


Đang xử lý: 41---A-7-Year-Old-Girl-from-West-Africa-With-Two-Ski_2022_Clinical-Cases-in-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  41%|████      | 38/93 [15:41<23:17, 25.41s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  41%|████      | 38/93 [16:05<23:17, 25.41s/it]

==> THÀNH CÔNG: Đã lưu 41---A-7-Year-Old-Girl-from-West-Africa-With-Two-Ski_2022_Clinical-Cases-in-.json


Đang xử lý PDF (Vertex):  42%|████▏     | 39/93 [16:06<22:53, 25.43s/it]


Đang xử lý: 42---A-41-Year-Old-Male-Traveller-Returning-from-Au_2022_Clinical-Cases-in-T.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  42%|████▏     | 39/93 [16:06<22:53, 25.43s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  42%|████▏     | 39/93 [16:22<22:53, 25.43s/it]

==> THÀNH CÔNG: Đã lưu 42---A-41-Year-Old-Male-Traveller-Returning-from-Au_2022_Clinical-Cases-in-T.json


Đang xử lý PDF (Vertex):  43%|████▎     | 40/93 [16:23<20:25, 23.11s/it]


Đang xử lý: 45---A-2-Month-Old-Girl-from-Laos-With-Dyspnoea-_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  43%|████▎     | 40/93 [16:24<20:25, 23.11s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  43%|████▎     | 40/93 [16:51<20:25, 23.11s/it]

==> THÀNH CÔNG: Đã lưu 45---A-2-Month-Old-Girl-from-Laos-With-Dyspnoea-_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  44%|████▍     | 41/93 [16:52<21:20, 24.62s/it]


Đang xử lý: 46---A-45-Year-Old-Man-from-Sri-Lanka-With-Fever-_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  44%|████▍     | 41/93 [16:52<21:20, 24.62s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  44%|████▍     | 41/93 [17:15<21:20, 24.62s/it]

==> THÀNH CÔNG: Đã lưu 46---A-45-Year-Old-Man-from-Sri-Lanka-With-Fever-_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  45%|████▌     | 42/93 [17:16<20:48, 24.49s/it]


Đang xử lý: 44---A-7-Year-Old-Girl-from-South-Sudan-With-_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  45%|████▌     | 42/93 [17:16<20:48, 24.49s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  45%|████▌     | 42/93 [17:38<20:48, 24.49s/it]

==> THÀNH CÔNG: Đã lưu 44---A-7-Year-Old-Girl-from-South-Sudan-With-_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  46%|████▌     | 43/93 [17:39<20:02, 24.06s/it]


Đang xử lý: 37---A-29-Year-Old-Woman-from-Malawi-With-Confusi_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  46%|████▌     | 43/93 [17:39<20:02, 24.06s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  46%|████▌     | 43/93 [18:03<20:02, 24.06s/it]

==> THÀNH CÔNG: Đã lưu 37---A-29-Year-Old-Woman-from-Malawi-With-Confusi_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  47%|████▋     | 44/93 [18:04<19:50, 24.29s/it]


Đang xử lý: 43---A-35-Year-Old-Malawian-Woman-With-a-Pain_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  47%|████▋     | 44/93 [18:04<19:50, 24.29s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  47%|████▋     | 44/93 [18:33<19:50, 24.29s/it]

==> THÀNH CÔNG: Đã lưu 43---A-35-Year-Old-Malawian-Woman-With-a-Pain_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  48%|████▊     | 45/93 [18:34<20:59, 26.23s/it]


Đang xử lý: 47---A-32-Year-Old-Man-from-Malawi-With-a-Pain_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  48%|████▊     | 45/93 [18:35<20:59, 26.23s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  48%|████▊     | 45/93 [19:09<20:59, 26.23s/it]

==> THÀNH CÔNG: Đã lưu 47---A-32-Year-Old-Man-from-Malawi-With-a-Pain_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  49%|████▉     | 46/93 [19:10<22:41, 28.97s/it]


Đang xử lý: 48---A-31-Year-Old-Woman-from-Tanzania-With-Acu_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  49%|████▉     | 46/93 [19:11<22:41, 28.97s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  49%|████▉     | 46/93 [19:38<22:41, 28.97s/it]

==> THÀNH CÔNG: Đã lưu 48---A-31-Year-Old-Woman-from-Tanzania-With-Acu_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  51%|█████     | 47/93 [19:39<22:13, 28.99s/it]


Đang xử lý: 49---A-33-Year-Old-Male-Traveller-to-India-With-Di_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  51%|█████     | 47/93 [19:39<22:13, 28.99s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  51%|█████     | 47/93 [20:04<22:13, 28.99s/it]

==> THÀNH CÔNG: Đã lưu 49---A-33-Year-Old-Male-Traveller-to-India-With-Di_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  52%|█████▏    | 48/93 [20:05<21:06, 28.14s/it]


Đang xử lý: 50---A-24-Year-Old-Man-of-Turkish-Origin-With-Jau_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  52%|█████▏    | 48/93 [20:05<21:06, 28.14s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  52%|█████▏    | 48/93 [20:32<21:06, 28.14s/it]

==> THÀNH CÔNG: Đã lưu 50---A-24-Year-Old-Man-of-Turkish-Origin-With-Jau_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  53%|█████▎    | 49/93 [20:33<20:31, 27.99s/it]


Đang xử lý: 51---A-34-Year-Old-HIV-Positive-Woman-from-Malawi-W_2022_Clinical-Cases-in-T.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  53%|█████▎    | 49/93 [20:33<20:31, 27.99s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  53%|█████▎    | 49/93 [21:02<20:31, 27.99s/it]

==> THÀNH CÔNG: Đã lưu 51---A-34-Year-Old-HIV-Positive-Woman-from-Malawi-W_2022_Clinical-Cases-in-T.json


Đang xử lý PDF (Vertex):  54%|█████▍    | 50/93 [21:03<20:30, 28.61s/it]


Đang xử lý: 52---A-56-Year-Old-Man-from-Peru-With-Prolonged-_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  54%|█████▍    | 50/93 [21:03<20:30, 28.61s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  54%|█████▍    | 50/93 [21:27<20:30, 28.61s/it]

==> THÀNH CÔNG: Đã lưu 52---A-56-Year-Old-Man-from-Peru-With-Prolonged-_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  55%|█████▍    | 51/93 [21:28<19:27, 27.79s/it]


Đang xử lý: 54---A-52-Year-Old-Male-Safari-Tourist-Returning-fro_2022_Clinical-Cases-in-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  55%|█████▍    | 51/93 [21:30<19:27, 27.79s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  55%|█████▍    | 51/93 [21:50<19:27, 27.79s/it]

==> THÀNH CÔNG: Đã lưu 54---A-52-Year-Old-Male-Safari-Tourist-Returning-fro_2022_Clinical-Cases-in-.json


Đang xử lý PDF (Vertex):  56%|█████▌    | 52/93 [21:51<17:50, 26.12s/it]


Đang xử lý: 55---A-40-Year-Old-Male-Farmer-from-Peru-With-Ch_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  56%|█████▌    | 52/93 [21:51<17:50, 26.12s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  56%|█████▌    | 52/93 [22:20<17:50, 26.12s/it]

==> THÀNH CÔNG: Đã lưu 55---A-40-Year-Old-Male-Farmer-from-Peru-With-Ch_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  57%|█████▋    | 53/93 [22:21<18:15, 27.40s/it]


Đang xử lý: 57---A-37-Year-Old-Woman-from-Malawi-With-H_2022_Clinical-Cases-in-Tropical-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  57%|█████▋    | 53/93 [22:22<18:15, 27.40s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  57%|█████▋    | 53/93 [22:45<18:15, 27.40s/it]

==> THÀNH CÔNG: Đã lưu 57---A-37-Year-Old-Woman-from-Malawi-With-H_2022_Clinical-Cases-in-Tropical-.json


Đang xử lý PDF (Vertex):  58%|█████▊    | 54/93 [22:46<17:18, 26.62s/it]


Đang xử lý: 53---A-24-Year-Old-Woman-from-Uganda-With-Fe_2022_Clinical-Cases-in-Tropical.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  58%|█████▊    | 54/93 [22:46<17:18, 26.62s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  58%|█████▊    | 54/93 [23:10<17:18, 26.62s/it]

==> THÀNH CÔNG: Đã lưu 53---A-24-Year-Old-Woman-from-Uganda-With-Fe_2022_Clinical-Cases-in-Tropical.json


Đang xử lý PDF (Vertex):  59%|█████▉    | 55/93 [23:11<16:39, 26.31s/it]


Đang xử lý: 58---A-25-Year-Old-Woman-from-Egypt-With-Severe-Ch_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  59%|█████▉    | 55/93 [23:12<16:39, 26.31s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  59%|█████▉    | 55/93 [23:39<16:39, 26.31s/it]

==> THÀNH CÔNG: Đã lưu 58---A-25-Year-Old-Woman-from-Egypt-With-Severe-Ch_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  60%|██████    | 56/93 [23:40<16:39, 27.02s/it]


Đang xử lý: 61---A-48-Year-Old-Woman-from-Thailand-With-Fever-_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  60%|██████    | 56/93 [23:41<16:39, 27.02s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  60%|██████    | 56/93 [24:05<16:39, 27.02s/it]

==> THÀNH CÔNG: Đã lưu 61---A-48-Year-Old-Woman-from-Thailand-With-Fever-_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  61%|██████▏   | 57/93 [24:06<16:03, 26.77s/it]


Đang xử lý: 62---A-28-Year-Old-Man-from-Ghana-With-a-Chron_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  61%|██████▏   | 57/93 [24:07<16:03, 26.77s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  61%|██████▏   | 57/93 [24:35<16:03, 26.77s/it]

==> THÀNH CÔNG: Đã lưu 62---A-28-Year-Old-Man-from-Ghana-With-a-Chron_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  62%|██████▏   | 58/93 [24:36<16:12, 27.77s/it]


Đang xử lý: 59---A-24-Year-Old-Man-from-Malawi-With-Skin-Le_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  62%|██████▏   | 58/93 [24:37<16:12, 27.77s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  62%|██████▏   | 58/93 [25:03<16:12, 27.77s/it]

==> THÀNH CÔNG: Đã lưu 59---A-24-Year-Old-Man-from-Malawi-With-Skin-Le_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  63%|██████▎   | 59/93 [25:04<15:43, 27.74s/it]


Đang xử lý: 60---A-6-Year-Old-Boy-from-Malawi-With-Proptos_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  63%|██████▎   | 59/93 [25:05<15:43, 27.74s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  63%|██████▎   | 59/93 [25:23<15:43, 27.74s/it]

==> THÀNH CÔNG: Đã lưu 60---A-6-Year-Old-Boy-from-Malawi-With-Proptos_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  65%|██████▍   | 60/93 [25:24<13:57, 25.38s/it]


Đang xử lý: 56---A-21-Year-Old-Pregnant-Woman-from-The-Ga_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  65%|██████▍   | 60/93 [25:25<13:57, 25.38s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  65%|██████▍   | 60/93 [25:51<13:57, 25.38s/it]

==> THÀNH CÔNG: Đã lưu 56---A-21-Year-Old-Pregnant-Woman-from-The-Ga_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  66%|██████▌   | 61/93 [25:52<13:52, 26.03s/it]


Đang xử lý: 63---A-38-Year-Old-European-Expatriate-Living-in-M_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  66%|██████▌   | 61/93 [25:52<13:52, 26.03s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  66%|██████▌   | 61/93 [26:14<13:52, 26.03s/it]

==> THÀNH CÔNG: Đã lưu 63---A-38-Year-Old-European-Expatriate-Living-in-M_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  67%|██████▋   | 62/93 [26:15<13:02, 25.25s/it]


Đang xử lý: 64---A-40-Year-Old-Woman-from-Thailand-and-Her-Bro_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  67%|██████▋   | 62/93 [26:19<13:02, 25.25s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  67%|██████▋   | 62/93 [26:47<13:02, 25.25s/it]

==> THÀNH CÔNG: Đã lưu 64---A-40-Year-Old-Woman-from-Thailand-and-Her-Bro_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  68%|██████▊   | 63/93 [26:48<13:50, 27.70s/it]


Đang xử lý: 66---A-32-Year-Old-Man-from-Malawi-With-Pain-in-the-_2022_Clinical-Cases-in-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  68%|██████▊   | 63/93 [26:49<13:50, 27.70s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  68%|██████▊   | 63/93 [27:07<13:50, 27.70s/it]

==> THÀNH CÔNG: Đã lưu 66---A-32-Year-Old-Man-from-Malawi-With-Pain-in-the-_2022_Clinical-Cases-in-.json


Đang xử lý PDF (Vertex):  69%|██████▉   | 64/93 [27:08<12:09, 25.16s/it]


Đang xử lý: 67---A-24-Year-Old-Woman-from-the-Peruvian-Andes-_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  69%|██████▉   | 64/93 [27:08<12:09, 25.16s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  69%|██████▉   | 64/93 [27:33<12:09, 25.16s/it]

==> THÀNH CÔNG: Đã lưu 67---A-24-Year-Old-Woman-from-the-Peruvian-Andes-_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  70%|██████▉   | 65/93 [27:34<11:54, 25.51s/it]


Đang xử lý: 70---A-58-Year-Old-Woman-from-Sri-Lanka-With-Fev_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  70%|██████▉   | 65/93 [27:35<11:54, 25.51s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  70%|██████▉   | 65/93 [28:04<11:54, 25.51s/it]

==> THÀNH CÔNG: Đã lưu 70---A-58-Year-Old-Woman-from-Sri-Lanka-With-Fev_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  71%|███████   | 66/93 [28:05<12:11, 27.08s/it]


Đang xử lý: 65---A-4-Year-Old-Girl-from-Bolivia-With-a-Dar_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  71%|███████   | 66/93 [28:05<12:11, 27.08s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  71%|███████   | 66/93 [28:28<12:11, 27.08s/it]

==> THÀNH CÔNG: Đã lưu 65---A-4-Year-Old-Girl-from-Bolivia-With-a-Dar_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  72%|███████▏  | 67/93 [28:29<11:23, 26.27s/it]


Đang xử lý: 73---A-21-Year-Old-Male-Migrant-from-Rural-Mali-_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  72%|███████▏  | 67/93 [28:30<11:23, 26.27s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  72%|███████▏  | 67/93 [29:00<11:23, 26.27s/it]

==> THÀNH CÔNG: Đã lưu 73---A-21-Year-Old-Male-Migrant-from-Rural-Mali-_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  73%|███████▎  | 68/93 [29:01<11:38, 27.94s/it]


Đang xử lý: 68---A-31-Year-Old-Woman-from-Malawi-With-a-Gene_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  73%|███████▎  | 68/93 [29:02<11:38, 27.94s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  73%|███████▎  | 68/93 [29:24<11:38, 27.94s/it]

==> THÀNH CÔNG: Đã lưu 68---A-31-Year-Old-Woman-from-Malawi-With-a-Gene_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  74%|███████▍  | 69/93 [29:25<10:41, 26.73s/it]


Đang xử lý: 69---A-22-Year-Old-Male-Farmer-from-Rural-Ethiop_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  74%|███████▍  | 69/93 [29:25<10:41, 26.73s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  74%|███████▍  | 69/93 [29:45<10:41, 26.73s/it]

==> THÀNH CÔNG: Đã lưu 69---A-22-Year-Old-Male-Farmer-from-Rural-Ethiop_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  75%|███████▌  | 70/93 [29:46<09:38, 25.14s/it]


Đang xử lý: 74---A-28-Year-Old-Woman-from-Sierra-Leone-With-_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  75%|███████▌  | 70/93 [29:47<09:38, 25.14s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  75%|███████▌  | 70/93 [30:06<09:38, 25.14s/it]

==> THÀNH CÔNG: Đã lưu 74---A-28-Year-Old-Woman-from-Sierra-Leone-With-_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  76%|███████▋  | 71/93 [30:07<08:43, 23.78s/it]


Đang xử lý: 76---A-55-Year-Old-Woman-from-Turkey-With-Feve_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  76%|███████▋  | 71/93 [30:07<08:43, 23.78s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  76%|███████▋  | 71/93 [30:27<08:43, 23.78s/it]

==> THÀNH CÔNG: Đã lưu 76---A-55-Year-Old-Woman-from-Turkey-With-Feve_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  77%|███████▋  | 72/93 [30:28<08:04, 23.06s/it]


Đang xử lý: 77---A-51-Year-Old-Female-Traveller-Returning-from-Cen_2022_Clinical-Cases-i.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  77%|███████▋  | 72/93 [30:29<08:04, 23.06s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  77%|███████▋  | 72/93 [30:55<08:04, 23.06s/it]

==> THÀNH CÔNG: Đã lưu 77---A-51-Year-Old-Female-Traveller-Returning-from-Cen_2022_Clinical-Cases-i.json


Đang xử lý PDF (Vertex):  78%|███████▊  | 73/93 [30:56<08:10, 24.51s/it]


Đang xử lý: 71---A-71-Year-Old-Man-from-Japan-With-Eosinophili_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  78%|███████▊  | 73/93 [30:57<08:10, 24.51s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  78%|███████▊  | 73/93 [31:21<08:10, 24.51s/it]

==> THÀNH CÔNG: Đã lưu 71---A-71-Year-Old-Man-from-Japan-With-Eosinophili_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  80%|███████▉  | 74/93 [31:22<07:50, 24.77s/it]


Đang xử lý: 72---A-4-Year-Old-Boy-from-Mozambique-With-Sever_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  80%|███████▉  | 74/93 [31:22<07:50, 24.77s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  80%|███████▉  | 74/93 [31:47<07:50, 24.77s/it]

==> THÀNH CÔNG: Đã lưu 72---A-4-Year-Old-Boy-from-Mozambique-With-Sever_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  81%|████████  | 75/93 [31:48<07:36, 25.34s/it]


Đang xử lý: 75---A-25-Year-Old-Woman-from-Zambia-With-a-N_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  81%|████████  | 75/93 [31:49<07:36, 25.34s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  81%|████████  | 75/93 [32:15<07:36, 25.34s/it]

==> THÀNH CÔNG: Đã lưu 75---A-25-Year-Old-Woman-from-Zambia-With-a-N_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  82%|████████▏ | 76/93 [32:16<07:23, 26.11s/it]


Đang xử lý: 78---A-42-Year-Old-British-Man-Living-in-Malawi_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  82%|████████▏ | 76/93 [32:17<07:23, 26.11s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  82%|████████▏ | 76/93 [32:44<07:23, 26.11s/it]

==> THÀNH CÔNG: Đã lưu 78---A-42-Year-Old-British-Man-Living-in-Malawi_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  83%|████████▎ | 77/93 [32:45<07:12, 27.02s/it]


Đang xử lý: 81---A-33-Year-Old-Refugee-from-Afghanistan-With-_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  83%|████████▎ | 77/93 [32:47<07:12, 27.02s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  83%|████████▎ | 77/93 [33:06<07:12, 27.02s/it]

==> THÀNH CÔNG: Đã lưu 81---A-33-Year-Old-Refugee-from-Afghanistan-With-_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  84%|████████▍ | 78/93 [33:07<06:22, 25.49s/it]


Đang xử lý: 82---A-31-Year-Old-Man-from-Guatemala-With-Acute-_2022_Clinical-Cases-in-Tro.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  84%|████████▍ | 78/93 [33:08<06:22, 25.49s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  84%|████████▍ | 78/93 [33:26<06:22, 25.49s/it]

==> THÀNH CÔNG: Đã lưu 82---A-31-Year-Old-Man-from-Guatemala-With-Acute-_2022_Clinical-Cases-in-Tro.json


Đang xử lý PDF (Vertex):  85%|████████▍ | 79/93 [33:27<05:31, 23.67s/it]


Đang xử lý: 80---A-62-Year-Old-Man-from-Thailand-With-a_2022_Clinical-Cases-in-Tropical-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  85%|████████▍ | 79/93 [33:27<05:31, 23.67s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  85%|████████▍ | 79/93 [33:51<05:31, 23.67s/it]

==> THÀNH CÔNG: Đã lưu 80---A-62-Year-Old-Man-from-Thailand-With-a_2022_Clinical-Cases-in-Tropical-.json


Đang xử lý PDF (Vertex):  86%|████████▌ | 80/93 [33:52<05:15, 24.24s/it]


Đang xử lý: 79---A-34-Year-Old-Male-Immigrant-from-Peru-With-Ch_2022_Clinical-Cases-in-T.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  86%|████████▌ | 80/93 [33:53<05:15, 24.24s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  86%|████████▌ | 80/93 [34:15<05:15, 24.24s/it]

==> THÀNH CÔNG: Đã lưu 79---A-34-Year-Old-Male-Immigrant-from-Peru-With-Ch_2022_Clinical-Cases-in-T.json


Đang xử lý PDF (Vertex):  87%|████████▋ | 81/93 [34:16<04:50, 24.25s/it]


Đang xử lý: 83---An-18-Year-Old-Man-from-India-With-a-Pale-P_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  87%|████████▋ | 81/93 [34:17<04:50, 24.25s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  87%|████████▋ | 81/93 [34:37<04:50, 24.25s/it]

==> THÀNH CÔNG: Đã lưu 83---An-18-Year-Old-Man-from-India-With-a-Pale-P_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  88%|████████▊ | 82/93 [34:38<04:17, 23.37s/it]


Đang xử lý: 84---A-64-Year-Old-Japanese-Man-With-Generalize_2022_Clinical-Cases-in-Tropi.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  88%|████████▊ | 82/93 [34:40<04:17, 23.37s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  88%|████████▊ | 82/93 [35:00<04:17, 23.37s/it]

==> THÀNH CÔNG: Đã lưu 84---A-64-Year-Old-Japanese-Man-With-Generalize_2022_Clinical-Cases-in-Tropi.json


Đang xử lý PDF (Vertex):  89%|████████▉ | 83/93 [35:01<03:54, 23.49s/it]


Đang xử lý: 87---A-27-Year-Old-Male-Traveller-Returning-from-the_2022_Clinical-Cases-in-.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  89%|████████▉ | 83/93 [35:02<03:54, 23.49s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  89%|████████▉ | 83/93 [35:28<03:54, 23.49s/it]

==> THÀNH CÔNG: Đã lưu 87---A-27-Year-Old-Male-Traveller-Returning-from-the_2022_Clinical-Cases-in-.json


Đang xử lý PDF (Vertex):  90%|█████████ | 84/93 [35:29<03:43, 24.79s/it]


Đang xử lý: 88---A-74-Year-Old-Man-from-Japan-With-Fever--_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  90%|█████████ | 84/93 [35:30<03:43, 24.79s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  90%|█████████ | 84/93 [35:52<03:43, 24.79s/it]

==> THÀNH CÔNG: Đã lưu 88---A-74-Year-Old-Man-from-Japan-With-Fever--_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  91%|█████████▏| 85/93 [35:53<03:15, 24.47s/it]


Đang xử lý: 85---A-55-Year-Old-Female-Pig-Farmer-from-Vietnam-_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  91%|█████████▏| 85/93 [35:54<03:15, 24.47s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  91%|█████████▏| 85/93 [36:22<03:15, 24.47s/it]

==> THÀNH CÔNG: Đã lưu 85---A-55-Year-Old-Female-Pig-Farmer-from-Vietnam-_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  92%|█████████▏| 86/93 [36:23<03:02, 26.11s/it]


Đang xử lý: 86---A-14-Year-Old-Girl-in-the-Solomon-Islands-W_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  92%|█████████▏| 86/93 [36:24<03:02, 26.11s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  92%|█████████▏| 86/93 [36:42<03:02, 26.11s/it]

==> THÀNH CÔNG: Đã lưu 86---A-14-Year-Old-Girl-in-the-Solomon-Islands-W_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  94%|█████████▎| 87/93 [36:43<02:25, 24.19s/it]


Đang xử lý: 91---A-20-Year-Old-Male-from-India-With-Fever-_2022_Clinical-Cases-in-Tropic.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  94%|█████████▎| 87/93 [36:43<02:25, 24.19s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  94%|█████████▎| 87/93 [37:07<02:25, 24.19s/it]

==> THÀNH CÔNG: Đã lưu 91---A-20-Year-Old-Male-from-India-With-Fever-_2022_Clinical-Cases-in-Tropic.json


Đang xử lý PDF (Vertex):  95%|█████████▍| 88/93 [37:08<02:01, 24.39s/it]


Đang xử lý: 90---A-55-Year-Old-Couple-Both-Returning-from-Chile-a_2022_Clinical-Cases-in.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  95%|█████████▍| 88/93 [37:08<02:01, 24.39s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  95%|█████████▍| 88/93 [37:31<02:01, 24.39s/it]

==> THÀNH CÔNG: Đã lưu 90---A-55-Year-Old-Couple-Both-Returning-from-Chile-a_2022_Clinical-Cases-in.json


Đang xử lý PDF (Vertex):  96%|█████████▌| 89/93 [37:32<01:37, 24.40s/it]


Đang xử lý: 89---A-30-Year-Old-Woman-from-Bolivia-With-Ex_2022_Clinical-Cases-in-Tropica.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  96%|█████████▌| 89/93 [37:33<01:37, 24.40s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  96%|█████████▌| 89/93 [38:03<01:37, 24.40s/it]

==> THÀNH CÔNG: Đã lưu 89---A-30-Year-Old-Woman-from-Bolivia-With-Ex_2022_Clinical-Cases-in-Tropica.json


Đang xử lý PDF (Vertex):  97%|█████████▋| 90/93 [38:04<01:20, 26.74s/it]


Đang xử lý: 93---A-35-Year-Old-Male-Logger-from-Peru-With-Fe_2022_Clinical-Cases-in-Trop.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  97%|█████████▋| 90/93 [38:05<01:20, 26.74s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  97%|█████████▋| 90/93 [38:31<01:20, 26.74s/it]

==> THÀNH CÔNG: Đã lưu 93---A-35-Year-Old-Male-Logger-from-Peru-With-Fe_2022_Clinical-Cases-in-Trop.json


Đang xử lý PDF (Vertex):  98%|█████████▊| 91/93 [38:32<00:53, 26.96s/it]


Đang xử lý: 92---A-42-Year-Old-Traveller-Returning-from-Thaila_2022_Clinical-Cases-in-Tr.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  98%|█████████▊| 91/93 [38:32<00:53, 26.96s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  98%|█████████▊| 91/93 [38:54<00:53, 26.96s/it]

==> THÀNH CÔNG: Đã lưu 92---A-42-Year-Old-Traveller-Returning-from-Thaila_2022_Clinical-Cases-in-Tr.json


Đang xử lý PDF (Vertex):  99%|█████████▉| 92/93 [38:55<00:26, 26.03s/it]


Đang xử lý: 94---A-20-Year-Old-Woman-from-the-Democratic-Republic-_2022_Clinical-Cases-i.pdf
...Đang đọc file PDF...


Đang xử lý PDF (Vertex):  99%|█████████▉| 92/93 [38:56<00:26, 26.03s/it]

...Đang trích xuất (Gemini đang đọc)...


Đang xử lý PDF (Vertex):  99%|█████████▉| 92/93 [39:19<00:26, 26.03s/it]

==> THÀNH CÔNG: Đã lưu 94---A-20-Year-Old-Woman-from-the-Democratic-Republic-_2022_Clinical-Cases-i.json


Đang xử lý PDF (Vertex): 100%|██████████| 93/93 [39:20<00:00, 25.38s/it]
