In [1]:
import requests
import pandas as pd
import time
import json
from pathlib import Path
from typing import Dict, Any, List
from datetime import datetime
import socket
from urllib.parse import urlparse

In [None]:
# Base URLs of the HTTP services used by this notebook.
API_BASE_URL = "http://localhost:8000"
AIRFLOW_URL = "http://localhost:8080"

# Hosts and ports of the backing services.
POSTGRES_HOST = "localhost"
POSTGRES_PORT = 5432  # or 5433 to connect to airflow-db instead
REDIS_HOST = "localhost"
REDIS_PORT = 6379
OPENSEARCH_HOST = "http://localhost:9200"
OLLAMA_HOST = "http://localhost:11434"

# File locations. These have to be defined: check_pdf_files(), load_questions(),
# save_results() and main() read PDF_DIR / QUESTION_FILE / OUTPUT_FILE and raise
# NameError when the assignments are left commented out.
DATA_DIR = Path("data")
PDF_DIR = DATA_DIR / "pdf"
QUESTION_FILE = DATA_DIR / "q.csv"
OUTPUT_FILE = DATA_DIR / "answers_output.csv"

# RAG request settings.
TOP_K = 5  # sent as "top_k" in the /api/v1/ask payload
USE_HYBRID = True  # sent as "use_hybrid" in the /api/v1/ask payload
TIMEOUT = 60  # per-question HTTP timeout passed to requests.post

In [3]:
def print_header(text: str):
    """Print ``text`` as a banner framed by two 80-character ``=`` rules.

    A blank line is emitted before the top rule and the text is indented by
    two spaces.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(f"  {text}")
    print(rule)

def print_step(step_num: int, text: str):
    """Print a numbered step heading between two divider lines.

    Args:
        step_num: Step number shown in the heading.
        text: Short description of the step.
    """
    divider = "‚îÄ" * 80
    print(f"\n{divider}")
    print(f"üìå B∆Ø·ªöC {step_num}: {text}")
    print(f"{divider}")

def check_system_health() -> bool:
    """Check that every service the pipeline depends on is reachable.

    The API is probed over HTTP at ``{API_BASE_URL}/health`` and the
    ``status`` field of its JSON reply is reported. Postgres, Redis,
    OpenSearch and Ollama are only probed with a TCP connect to their
    host/port, so "healthy" for those means "the port accepts connections".

    Returns:
        True when every service reports an accepted status; False otherwise
        (a hint to start the stack is printed first).
    """
    print_step(0, "Ki·ªÉm tra h·ªá th·ªëng")

    services = {
        "api": f"{API_BASE_URL}/health",
        "postgres": f"postgresql://{POSTGRES_HOST}:{POSTGRES_PORT}",
        "redis": f"redis://{REDIS_HOST}:{REDIS_PORT}",
        "opensearch": f"{OPENSEARCH_HOST}",
        "ollama": f"{OLLAMA_HOST}",
    }
    # Statuses that count as "service is up".
    ok_statuses = ("healthy", "ready", "connected")

    all_healthy = True
    for service, url in services.items():
        try:
            if service == "api":
                response = requests.get(url, timeout=5)
                status = response.json().get("status", "unknown")
            else:
                # Only check that the port is open.
                parsed = urlparse(url)
                # The context manager closes the socket even when connect()
                # raises, so a refused/timed-out probe no longer leaks it.
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                    sock.settimeout(2)
                    sock.connect((parsed.hostname, parsed.port))
                status = "healthy"

            is_ok = status in ok_statuses
            icon = "‚úÖ" if is_ok else "‚ùå"
            print(f"  {icon} {service}: {status}")
            if not is_ok:
                all_healthy = False
        except Exception as e:
            # Any failure (connection refused, timeout, bad JSON, ...) marks
            # the service as unhealthy but lets the remaining probes run.
            print(f"  ‚ùå {service}: {e}")
            all_healthy = False

    if not all_healthy:
        print("\n‚ö†Ô∏è  C·∫¢NH B√ÅO: M·ªôt s·ªë services ch∆∞a s·∫µn s√†ng!")
        print("Vui l√≤ng ch·∫°y: docker compose up -d")
        return False

    print("\n‚úÖ H·ªá th·ªëng ho·∫°t ƒë·ªông b√¨nh th∆∞·ªùng!")
    return True

In [4]:
def check_pdf_files() -> List[Path]:
    """List the ``*.pdf`` files directly inside ``PDF_DIR``.

    The directory is created (with parents) when it does not exist yet. Each
    file found is printed with its size in MB.

    Returns:
        The PDF paths found, or an empty list when there are none.
    """
    print_step(1, "Ki·ªÉm tra PDF files")

    if not PDF_DIR.exists():
        print(f"\n‚ö†Ô∏è  Th∆∞ m·ª•c kh√¥ng t·ªìn t·∫°i: {PDF_DIR}")
        PDF_DIR.mkdir(parents=True, exist_ok=True)
        print(f"‚úÖ ƒê√£ t·∫°o th∆∞ m·ª•c: {PDF_DIR}")

    found = list(PDF_DIR.glob("*.pdf"))

    # Guard clause: nothing to list, tell the user where to put the files.
    if not found:
        print(f"\n‚ö†Ô∏è  Ch∆∞a c√≥ PDF n√†o trong: {PDF_DIR}")
        print("Vui l√≤ng copy PDF files v√†o folder n√†y!")
        return []

    bytes_per_mb = 1024 * 1024
    print(f"\n‚úÖ T√¨m th·∫•y {len(found)} file(s) PDF:")
    for position, pdf in enumerate(found, start=1):
        megabytes = pdf.stat().st_size / bytes_per_mb
        print(f"  {position}. {pdf.name} ({megabytes:.2f} MB)")

    return found

In [5]:
def trigger_pdf_processing():
    """Print instructions for triggering the Airflow PDF-ingestion DAG.

    Nothing is triggered from here: the user is shown the Airflow UI and CLI
    options, then the function blocks on ``input()`` until ENTER is pressed.
    """
    print_step(2, "X·ª≠ l√Ω PDF files")

    instructions = (
        "\nüìù ƒê·ªÉ x·ª≠ l√Ω PDF, vui l√≤ng ch·ªçn m·ªôt trong hai c√°ch:",
        "\n1Ô∏è‚É£  S·ª≠ d·ª•ng Airflow UI:",
        f"   - M·ªü: {AIRFLOW_URL}",
        "   - Login: admin / admin",
        "   - T√¨m DAG: 'pdf_ingestion_dag'",
        "   - Click n√∫t 'Trigger DAG' (‚ñ∂Ô∏è)",
        "\n2Ô∏è‚É£  S·ª≠ d·ª•ng CLI:",
        "   docker exec rag-mcq-airflow airflow dags trigger pdf_ingestion_dag",
        "\n‚è±Ô∏è  Th·ªùi gian x·ª≠ l√Ω: ~2-5 ph√∫t/PDF",
        "\nüìä Quy tr√¨nh:",
        "   ‚Üí Parse PDF (Docling + GROBID)",
        "   ‚Üí Chunk documents (section-aware)",
        "   ‚Üí Generate embeddings (768D)",
        "   ‚Üí Index to OpenSearch",
    )
    for line in instructions:
        print(line)

    # Wait for the user to confirm the DAG run has finished.
    input("\n‚è∏Ô∏è  Nh·∫•n ENTER sau khi ƒë√£ trigger DAG v√† ƒë·ª£i x·ª≠ l√Ω xong...")

In [6]:
def check_indexed_documents() -> int:
    """Report how many document chunks are indexed in OpenSearch.

    Calls the ``_count`` endpoint of the ``mcq-documents`` index on
    ``OPENSEARCH_HOST`` and prints whether the system is ready to answer.

    Returns:
        The ``count`` field of the response, or 0 when the request fails or
        OpenSearch answers with a non-200 status.
    """
    print_step(3, "Ki·ªÉm tra documents ƒë√£ index")

    try:
        # NOTE(review): credentials are hard-coded; consider reading them
        # from the environment before sharing this notebook.
        opensearch_auth = ("admin", "admin")
        # Use the configured host instead of a second hard-coded copy of the
        # URL, so changing OPENSEARCH_HOST affects this check as well.
        count_url = f"{OPENSEARCH_HOST.rstrip('/')}/mcq-documents/_count"
        response = requests.get(
            count_url,
            auth=opensearch_auth,
            verify=False,
            timeout=10,
        )

        if response.status_code != 200:
            print(f"‚ùå L·ªói khi query OpenSearch: {response.status_code}")
            return 0

        count = response.json().get("count", 0)
        print(f"\nüìÑ S·ªë l∆∞·ª£ng document chunks: {count}")

        if count > 0:
            print("‚úÖ H·ªá th·ªëng ƒë√£ s·∫µn s√†ng ƒë·ªÉ tr·∫£ l·ªùi c√¢u h·ªèi!")
        else:
            print("‚ö†Ô∏è  Ch∆∞a c√≥ documents n√†o. Vui l√≤ng ch·∫°y Airflow DAG tr∆∞·ªõc!")

        return count

    except Exception as e:
        # Connection problems and malformed responses are reported as
        # "no documents" rather than propagated.
        print(f"‚ùå Kh√¥ng th·ªÉ k·∫øt n·ªëi OpenSearch: {e}")
        return 0

In [7]:
def load_questions() -> pd.DataFrame:
    """Read the questions from ``QUESTION_FILE``.

    The CSV must contain the columns ``Question``, ``A``, ``B``, ``C`` and
    ``D``. A preview of the first question is printed on success.

    Returns:
        The loaded DataFrame, or None when the file is missing, a required
        column is absent, or reading/previewing raises.
    """
    print_step(4, "ƒê·ªçc c√¢u h·ªèi t·ª´ CSV")

    if not QUESTION_FILE.exists():
        for line in (
            f"\n‚ùå File kh√¥ng t·ªìn t·∫°i: {QUESTION_FILE}",
            "Vui l√≤ng t·∫°o file question.csv v·ªõi format:",
            "  Question,A,B,C,D,source_folder",
        ):
            print(line)
        return None

    try:
        questions = pd.read_csv(QUESTION_FILE)
        print(f"\n‚úÖ ƒê·ªçc th√†nh c√¥ng {len(questions)} c√¢u h·ªèi")

        # Validate the required columns.
        absent = [
            name for name in ("Question", "A", "B", "C", "D") if name not in questions.columns
        ]
        if absent:
            print(f"‚ùå Thi·∫øu columns: {absent}")
            return None

        # Show a sample (first 100 characters of the first question).
        print("\nüìù V√≠ d·ª• c√¢u h·ªèi ƒë·∫ßu ti√™n:")
        first_question = questions.iloc[0]["Question"]
        print(f"   Q: {first_question[:100]}...")

        return questions

    except Exception as e:
        print(f"‚ùå L·ªói khi ƒë·ªçc CSV: {e}")
        return None

In [8]:
def ask_single_question(
    question: str, options: Dict[str, str], source_folder: str = None
) -> Dict[str, Any]:
    """Send one multiple-choice question to the ``/api/v1/ask`` endpoint.

    Args:
        question: Question text.
        options: Mapping of option label to option text.
        source_folder: Optional folder filter; only added to the payload when
            it is truthy and not a pandas missing value.

    Returns:
        The decoded JSON reply, or a dict with ``error`` set and
        ``predicted_option`` set to None when the request fails.
    """
    request_body = dict(
        question=question,
        options=options,
        top_k=TOP_K,
        use_hybrid=USE_HYBRID,
    )

    # NaN is truthy, hence the additional notna() check.
    if source_folder and pd.notna(source_folder):
        request_body["source_folder"] = source_folder

    try:
        reply = requests.post(
            f"{API_BASE_URL}/api/v1/ask",
            json=request_body,
            timeout=TIMEOUT,
        )
        reply.raise_for_status()
        return reply.json()

    except requests.exceptions.Timeout:
        print(f"    ‚è±Ô∏è  Timeout (>{TIMEOUT}s)")
        return {"error": "timeout", "predicted_option": None}

    except Exception as exc:
        message = str(exc)
        print(f"    ‚ùå L·ªói: {message[:100]}")
        return {"error": message, "predicted_option": None}

In [9]:
def answer_all_questions(df: pd.DataFrame) -> pd.DataFrame:
    """Answer every question in ``df`` through ``ask_single_question``.

    Args:
        df: Questions with the columns ``Question``, ``A``, ``B``, ``C``,
            ``D`` and optionally ``source_folder``.

    Returns:
        One row per question with the predicted answer, confidence,
        truncated reasoning, timing and error information. The frame is empty
        when ``df`` has no rows.
    """
    print_step(5, "Tr·∫£ l·ªùi c√¢u h·ªèi")

    results = []
    total = len(df)

    print(f"\nüéØ B·∫Øt ƒë·∫ßu tr·∫£ l·ªùi {total} c√¢u h·ªèi...\n")

    start_time = time.time()

    # Number questions by position. The labels yielded by iterrows() are the
    # frame's index, which is not guaranteed to be 0..n-1 (or even numeric),
    # so `label + 1` could give wrong numbers or raise TypeError.
    for q_num, (_, row) in enumerate(df.iterrows(), start=1):
        # str() keeps the preview from crashing on a missing/non-string cell.
        question_preview = str(row["Question"])[:60]

        # Progress
        progress = (q_num / total) * 100
        print(f"[{q_num}/{total}] ({progress:.1f}%) ", end="")
        print(f"Q: {question_preview}...")

        # Prepare the options
        options = {"A": str(row["A"]), "B": str(row["B"]), "C": str(row["C"]), "D": str(row["D"])}

        # Take source_folder when the column exists
        source_folder = row.get("source_folder", None)

        # Call the API
        q_start = time.time()
        answer_data = ask_single_question(
            question=row["Question"], options=options, source_folder=source_folder
        )
        q_time = time.time() - q_start

        # Parse the reply
        predicted = answer_data.get("predicted_option", "N/A")
        confidence = answer_data.get("confidence", "unknown")
        reasoning = answer_data.get("reasoning", "")
        error = answer_data.get("error", None)

        # Display the result
        conf_icon = {"high": "üü¢", "medium": "üü°", "low": "üî¥"}.get(confidence, "‚ö™")

        print(f"    ‚Üí ƒê√°p √°n: {predicted} {conf_icon} ({q_time:.1f}s)")

        # Record the result
        result = {
            "question_number": q_num,
            "question": row["Question"],
            "option_A": row["A"],
            "option_B": row["B"],
            "option_C": row["C"],
            "option_D": row["D"],
            "source_folder": source_folder if pd.notna(source_folder) else "",
            "predicted_answer": predicted,
            "confidence": confidence,
            "reasoning": reasoning[:500] if reasoning else "",  # cap the length
            "processing_time_seconds": round(q_time, 2),
            "error": error if error else "",
            "timestamp": datetime.now().isoformat(),
        }

        # Add timing details when the API returned them
        if "timing" in answer_data:
            timing = answer_data["timing"]
            result["retrieval_time_ms"] = timing.get("retrieval_ms", 0)
            result["generation_time_ms"] = timing.get("generation_ms", 0)

        results.append(result)

        # Blank line after every fifth question
        if q_num % 5 == 0:
            print()

    total_time = time.time() - start_time
    # Avoid ZeroDivisionError when there were no questions at all.
    average_time = total_time / total if total else 0.0

    print("\n" + "‚îÄ" * 80)
    print(f"‚úÖ Ho√†n th√†nh! T·ªïng th·ªùi gian: {total_time:.1f}s")
    print(f"‚è±Ô∏è  Trung b√¨nh: {average_time:.1f}s/c√¢u")
    print("‚îÄ" * 80)

    return pd.DataFrame(results)

In [10]:
def save_results(results_df: pd.DataFrame):
    """Write the results to ``OUTPUT_FILE`` as CSV and print statistics.

    After saving (UTF-8 with BOM, no index column) the confidence
    distribution and the questions whose ``error`` column is non-empty are
    printed. Any exception is reported and swallowed.
    """
    print_step(6, "L∆∞u k·∫øt qu·∫£")

    try:
        results_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")

        row_count = len(results_df)
        print(f"\n‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o: {OUTPUT_FILE}")
        print(f"üìä T·ªïng s·ªë c√¢u: {row_count}")

        # Confidence distribution
        tally = results_df["confidence"].value_counts()
        print("\nüìà Th·ªëng k√™ ƒë·ªô tin c·∫≠y:")
        for level, occurrences in tally.items():
            share = (occurrences / len(results_df)) * 100
            print(f"  ‚Ä¢ {level}: {occurrences} c√¢u ({share:.1f}%)")

        # Questions that ended with an error
        failed = results_df[results_df["error"] != ""]
        if len(failed):
            print(f"\n‚ö†Ô∏è  C√≥ {len(failed)} c√¢u b·ªã l·ªói:")
            for _, record in failed.iterrows():
                print(f"  ‚Ä¢ C√¢u {record['question_number']}: {record['error'][:50]}")

    except Exception as e:
        print(f"‚ùå L·ªói khi l∆∞u file: {e}")

In [11]:
def display_summary(results_df: pd.DataFrame):
    """Print a summary of the answering run.

    Shows the total / successful / failed counts, up to five high-confidence
    answers and up to three low-confidence questions.
    """
    print_header("üìä T√ìM T·∫ÆT K·∫æT QU·∫¢")

    total = len(results_df)
    no_error = results_df["error"] == ""
    successful = len(results_df[no_error])

    print(f"\n‚úÖ T·ªïng s·ªë c√¢u: {total}")
    print(f"‚úÖ Tr·∫£ l·ªùi th√†nh c√¥ng: {successful}")
    print(f"‚ùå L·ªói: {total - successful}")

    confidence = results_df["confidence"]

    # First five high-confidence answers
    high_conf = results_df[confidence == "high"].head(5)
    if len(high_conf):
        print(f"\nüü¢ Top {len(high_conf)} c√¢u c√≥ ƒë·ªô tin c·∫≠y cao:")
        for _, record in high_conf.iterrows():
            print(
                f"  {record['question_number']}. ƒê√°p √°n {record['predicted_answer']}: {record['question'][:60]}..."
            )

    # Low-confidence questions (count of all, listing of the first three)
    low_conf = results_df[confidence == "low"]
    if len(low_conf):
        print(f"\nüî¥ {len(low_conf)} c√¢u c√≥ ƒë·ªô tin c·∫≠y th·∫•p:")
        for _, record in low_conf.head(3).iterrows():
            print(f"  {record['question_number']}. {record['question'][:60]}...")

    print("\n" + "=" * 80)

In [12]:
def main():
    """Run the whole pipeline, stopping early when a stage is not ready.

    Stages: health check, PDF discovery, manual DAG trigger prompt, index
    check, question loading, answering, saving and summary.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"

    print_header("üöÄ RAG MCQ SYSTEM - AUTOMATIC ANSWERING")
    print(f"\nüìÖ B·∫Øt ƒë·∫ßu: {datetime.now().strftime(stamp_format)}")

    # Step 0: the services must be up
    system_ready = check_system_health()
    if not system_ready:
        print("\n‚ùå H·ªá th·ªëng ch∆∞a s·∫µn s√†ng. Vui l√≤ng kh·ªüi ƒë·ªông services tr∆∞·ªõc!")
        return

    # Steps 1-2: with PDFs present, walk the user through the DAG trigger;
    # without any, ask whether to carry on with what is already indexed.
    pdf_files = check_pdf_files()
    if pdf_files:
        trigger_pdf_processing()
    else:
        print("\n‚ö†Ô∏è  Kh√¥ng c√≥ PDF n√†o ƒë·ªÉ x·ª≠ l√Ω!")
        proceed = input("B·∫°n c√≥ mu·ªën ti·∫øp t·ª•c tr·∫£ l·ªùi c√¢u h·ªèi kh√¥ng? (y/n): ")
        if proceed.lower() != "y":
            return

    # Step 3: there must be indexed documents
    if check_indexed_documents() == 0:
        print("\n‚ùå Kh√¥ng c√≥ documents n√†o trong h·ªá th·ªëng!")
        print("Kh√¥ng th·ªÉ tr·∫£ l·ªùi c√¢u h·ªèi. Vui l√≤ng x·ª≠ l√Ω PDF tr∆∞·ªõc!")
        return

    # Step 4: load the questions
    questions_df = load_questions()
    if questions_df is None:
        return

    # Steps 5-6: answer, save, summarise
    results_df = answer_all_questions(questions_df)
    save_results(results_df)
    display_summary(results_df)

    print_header("‚úÖ HO√ÄN T·∫§T!")
    print(f"\nüìÅ K·∫øt qu·∫£ ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {OUTPUT_FILE.absolute()}")
    print(f"üìÖ K·∫øt th√∫c: {datetime.now().strftime(stamp_format)}")
    print("\n" + "=" * 80 + "\n")

In [13]:
# if __name__ == "__main__":
#     try:
#         main()
#     except KeyboardInterrupt:
#         print("\n\n‚ö†Ô∏è  ƒê√£ d·ª´ng b·ªüi ng∆∞·ªùi d√πng (Ctrl+C)")
#     except Exception as e:
#         print(f"\n\n‚ùå L·ªói kh√¥ng mong ƒë·ª£i: {e}")
#         import traceback

#         traceback.print_exc()

In [14]:
from pathlib import Path
from src.config import get_settings

# Resolve the PDF directory from the project settings.
settings = get_settings()
pdf_dir = Path(settings.data.pdf_dir)

# Collect the PDFs directly inside that directory and show their names.
pdf_files = [*pdf_dir.glob("*.pdf")]
print("PDF files found:", [pdf.name for pdf in pdf_files])

PDF files found: ['Public_251.pdf', 'Public_272.pdf', 'Public_264.pdf', 'Public_260.pdf', 'Public_061.pdf']


In [15]:
from src.services.factories import make_pdf_parser
from src.db.session import get_db_context
from src.models.document import Document

# Build the parser once; it is reused for every file.
parser = make_pdf_parser()

with get_db_context() as db:
    for pdf_path in pdf_files:
        parsed = parser.parse_pdf(pdf_path)
        doc_id = Path(pdf_path).stem
        metadata = parsed["metadata"]
        # Assemble a trial Document; nothing in this cell adds it to the
        # session or commits it.
        doc_fields = dict(
            doc_id=doc_id,
            filename=metadata["file_name"],
            file_path=pdf_path,
            title=parsed["title"],
            full_text=parsed["full_text"],
            raw_content=parsed["full_text"],
            page_count=metadata["page_count"],
            sections=parsed["sections"],
            tables=parsed["tables"],
            doc_metadata=metadata,
            source_folder=Path(pdf_path).parent.name,
            processing_status="completed",
        )
        doc = Document(**doc_fields)
        title_preview = doc.title[:50]
        print(f"Parsed document: {doc_id}, title: {title_preview}...")

  from .autonotebook import tqdm as notebook_tqdm
2025-10-31 00:48:09,149 - INFO - Parsing PDF: data/pdf/Public_251.pdf
2025-10-31 00:48:09,151 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-31 00:48:09,195 - INFO - Going to convert document batch...
2025-10-31 00:48:09,195 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 75463f421d05cb4304e1f714cf00d35d
2025-10-31 00:48:09,200 - INFO - Loading plugin 'docling_defaults'
2025-10-31 00:48:09,202 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-31 00:48:09,207 - INFO - Loading plugin 'docling_defaults'
2025-10-31 00:48:09,210 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-10-31 00:48:09,214 - INFO - Accelerator device: 'mps'
2025-10-31 00:48:11,594 - INFO - Accelerator device: 'mps'
2025-10-31 00:48:12,218 - INFO - Processing document Public_251.pdf
2025-10-31 00:48:15,147 - INFO - Finished converting document Public_251.pdf i

Parsed document: Public_251, title: Public_251...


2025-10-31 00:48:19,141 - INFO - Finished converting document Public_272.pdf in 3.76 sec.
2025-10-31 00:48:19,150 - INFO - Successfully parsed PDF: Public_272, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_272', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2132697822789820610, filename='Public_272.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10'), RefItem(cref='#/texts/11'), RefItem(cref='#/texts/12'), RefItem(cref

Parsed document: Public_272, title: Public_272...


2025-10-31 00:48:21,283 - INFO - Finished converting document Public_264.pdf in 2.13 sec.
2025-10-31 00:48:21,293 - INFO - Successfully parsed PDF: Public_264, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_264', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=5119675035389586591, filename='Public_264.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5'), RefItem(cref='#/texts/6'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10'), RefItem(cref='#/texts/11'), RefItem(cref=

Parsed document: Public_264, title: Public_264...


2025-10-31 00:48:23,864 - INFO - Finished converting document Public_260.pdf in 2.57 sec.
2025-10-31 00:48:23,875 - INFO - Successfully parsed PDF: Public_260, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_260', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=11617410907304067118, filename='Public_260.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4'), RefItem(cref='#/tables/0'), RefItem(cref='#/texts/5'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/7'), RefItem(cref='#/pictures/1'), RefItem(cref='#/texts/8'), RefItem(cr

Parsed document: Public_260, title: Public_260...


2025-10-31 00:48:24,992 - INFO - Finished converting document Public_061.pdf in 1.12 sec.
2025-10-31 00:48:24,998 - INFO - Successfully parsed PDF: Public_061, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_061', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=11381279440607448659, filename='Public_061.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5'), RefItem(cref='#/texts/6'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10'), RefItem(cref=

Parsed document: Public_061, title: Public_061...


In [16]:
from pprint import pprint

# Parse each PDF again and dump the complete parsed dict for inspection.
# NOTE(review): this prints the whole document text for every file; consider
# printing only selected keys to keep the cell output small.
for pdf_path in pdf_files:
    parsed = parser.parse_pdf(pdf_path)
    pprint(parsed)  # pretty-print the entire dict

2025-10-31 00:48:25,017 - INFO - Parsing PDF: data/pdf/Public_251.pdf
2025-10-31 00:48:25,018 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-31 00:48:25,022 - INFO - Going to convert document batch...
2025-10-31 00:48:25,022 - INFO - Processing document Public_251.pdf
2025-10-31 00:48:27,111 - INFO - Finished converting document Public_251.pdf in 2.09 sec.
2025-10-31 00:48:27,117 - INFO - Successfully parsed PDF: Public_251, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_251', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=828940394524416805, filename='Public_251.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), Ref

{'full_text': '<!-- image -->\n'
              '\n'
              '## VIETTEL AI RACE\n'
              '\n'
              '## ACK Flood Attack l√† g√¨? ƒêi·ªÉm kh√°c bi·ªát g√¨ so v·ªõi c√°c lo·∫°i '
              't·∫•n c√¥ng DDoS kh√°c?\n'
              '\n'
              'Public 251\n'
              '\n'
              'L ·∫ß n ban h√†nh: 1\n'
              '\n'
              'C√°c cu·ªôc t·∫•n c√¥ng DDoS (Distributed Denial of Service) ng√†y '
              'c√†ng tr·ªü n√™n ph·ªï bi·∫øn v√† tinh vi, g√¢y ra nhi·ªÅu thi·ªát h·∫°i nghi√™m '
              'tr·ªçng cho h·ªá th·ªëng m·∫°ng v√† d·ªãch v·ª• tr·ª±c tuy·∫øn. M·ªôt trong nh·ªØng '
              'h√¨nh th·ª©c t·∫•n c√¥ng DDoS ƒë·∫∑c bi·ªát nguy hi·ªÉm l√† t·∫•n c√¥ng ACK '
              'Flood. V·∫≠y ACK Flood Attack l√† g√¨ v√† ƒëi·ªÉm kh√°c bi·ªát c·ªßa n√≥ so '
              'v·ªõi c√°c lo·∫°i t·∫•n c√¥ng DDoS kh√°c ra sao?\n'
              '\n'
              '## 1. ACK Flood Attack l√† g√¨?\n'
              '\n'
         

2025-10-31 00:48:30,111 - INFO - Finished converting document Public_272.pdf in 2.98 sec.
2025-10-31 00:48:30,117 - INFO - Successfully parsed PDF: Public_272, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_272', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2132697822789820610, filename='Public_272.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10'), RefItem(cref='#/texts/11'), RefItem(cref='#/texts/12'), RefItem(cref

{'full_text': '<!-- image -->\n'
              '\n'
              '## VIETTEL AI RACE\n'
              '\n'
              '## C√°c nh√≥m ki ·ªÉ m so√°t\n'
              '\n'
              '## 1. C√°c nh√≥m ki ·ªÉ m so√°t\n'
              '\n'
              'Trong an to√†n th√¥ng tin, c√°c bi·ªán ph√°p ki·ªÉm so√°t ( controls ) '
              'ƒë∆∞·ª£c chia th√†nh ba nh√≥m ch√≠nh:\n'
              '\n'
              '- Ki·ªÉm so√°t h√†nh ch√≠nh/Qu·∫£n l√Ω (Administrative/Managerial '
              'controls)\n'
              '- Ki·ªÉm so√°t k·ªπ thu·∫≠t (Technical controls)\n'
              '- Ki·ªÉm so√°t v·∫≠t l√Ω/V·∫≠n h√†nh (Physical/Operational controls)\n'
              '\n'
              '## 1.1 Ki ·ªÉ m so√°t h√†nh ch√≠nh/Qu ·∫£ n l√Ω\n'
              '\n'
              'Nh√≥m n√†y x·ª≠ l√Ω y·∫øu t·ªë con ng∆∞·ªùi trong an to√†n th√¥ng tin. Bao '
              'g·ªìm ch√≠nh s√°ch v√† quy tr√¨nh quy ƒë·ªãnh c√°ch t·ªï ch·ª©c qu·∫£n l√Ω d·ªØ '
              'li·ªáu, c≈©ng nh∆∞ tr

2025-10-31 00:48:31,840 - INFO - Finished converting document Public_264.pdf in 1.71 sec.
2025-10-31 00:48:31,853 - INFO - Successfully parsed PDF: Public_264, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_264', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=5119675035389586591, filename='Public_264.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5'), RefItem(cref='#/texts/6'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10'), RefItem(cref='#/texts/11'), RefItem(cref=

{'full_text': '<!-- image -->\n'
              '\n'
              'Public 264\n'
              '\n'
              'L ·∫ß n ban h√†nh: 1\n'
              '\n'
              '## VIETTEL AI RACE\n'
              '\n'
              '## LOGIC BOMD L√Ä G√å? C√ÅCH PH√íNG CH ·ªê NG LOGIC BOMB HI ·ªÜ U QU ·∫¢\n'
              '\n'
              'Logic bomb l√† m·ªôt lo·∫°i m√£ ƒë·ªôc c√≥ th·ªÉ b·ªã k√≠ch ho·∫°t khi nh·ªØng '
              'ƒëi·ªÅu ki·ªán nh·∫•t ƒë·ªãnh ƒë∆∞·ª£c th·ªèa m√£n. ƒê√¢y l√† m·ªôt trong nh·ªØng m·ªëi '
              'ƒëe d·ªça l·ªõn ƒë·ªëi v·ªõi an ninh th√¥ng tin, ƒë·∫∑c bi·ªát khi ch√∫ng c√≥ kh·∫£ '
              'nƒÉng g√¢y ra thi·ªát h·∫°i nghi√™m tr·ªçng cho h·ªá th·ªëng m√°y t√≠nh v√† d·ªØ '
              'li·ªáu.\n'
              '\n'
              '## 1. Logic bomb l√† g√¨?\n'
              '\n'
              'Logic bomb l√† m·ªôt ƒëo·∫°n m√£ ƒë·ªôc h·∫°i ƒë∆∞·ª£c nh√∫ng v√†o trong m·ªôt ·ª©ng '
              'd·ª•ng ho·∫∑c ch∆∞∆°ng tr√¨nh ph·∫ßn m·ªÅm. Khi nh·

2025-10-31 00:48:34,260 - INFO - Finished converting document Public_260.pdf in 2.40 sec.
2025-10-31 00:48:34,269 - INFO - Successfully parsed PDF: Public_260, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_260', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=11617410907304067118, filename='Public_260.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4'), RefItem(cref='#/tables/0'), RefItem(cref='#/texts/5'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/7'), RefItem(cref='#/pictures/1'), RefItem(cref='#/texts/8'), RefItem(cr

{'full_text': '<!-- image -->\n'
              '\n'
              '## VIETTEL AI RACE\n'
              '\n'
              '## Ph√°t tri ·ªÉn ch√≠nh s√°ch trong ƒë·∫£ m b ·∫£ o th√¥ng tin\n'
              '\n'
              '## 1. C√°c y ·∫ø u t ·ªë c ·ªß a ch√≠nh s√°ch nh ·∫≠ n th ·ª©c &amp; ƒë√†o t·∫° '
              'o\n'
              '\n'
              '## 1.1 T ·ªï ng quan\n'
              '\n'
              'ƒê·ªëi v·ªõi m·ªói r·ªßi ro v√† m·ªëi ƒëe d·ªça ƒë√£ x√°c ƒë·ªãnh trong Mi·ªÅn Ng∆∞·ªùi '
              'D√πng v√† Mi·ªÅn M√°y Tr·∫°m, h√£y x√°c ƒë·ªãnh m·ªôt ki·ªÉm so√°t b·∫£o m·∫≠t ho·∫∑c '
              'bi·ªán ph√°p ƒë·ªëi ph√≥ b·∫£o m·∫≠t c√≥ th·ªÉ gi√∫p gi·∫£m thi·ªÉu r·ªßi ro ho·∫∑c '
              'm·ªëi ƒëe d·ªça.\n'
              '\n'
              '| Domain             | R·ªßi ro &M·ªëi ƒëe '
              'd·ªça                                                                                                 '
              '| Chi·∫øn l∆∞·ª£c/T√†i li·ªáu Gi·∫£m thi

2025-10-31 00:48:35,229 - INFO - Finished converting document Public_061.pdf in 0.95 sec.
2025-10-31 00:48:35,235 - INFO - Successfully parsed PDF: Public_061, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_061', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=11381279440607448659, filename='Public_061.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(self_ref='#/body', parent=None, children=[RefItem(cref='#/pictures/0'), RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/groups/0'), RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5'), RefItem(cref='#/texts/6'), RefItem(cref='#/texts/7'), RefItem(cref='#/texts/8'), RefItem(cref='#/texts/9'), RefItem(cref='#/texts/10'), RefItem(cref=

{'full_text': '<!-- image -->\n'
              '\n'
              '## VIETTEL AI RACE\n'
              '\n'
              '## KH·∫¢O S√ÅT ƒê·∫∂C T·∫¢ V√Ä M√É NGU·ªíN\n'
              '\n'
              'TD061\n'
              '\n'
              'L·∫ßn ban h√†nh: 1\n'
              '\n'
              'Ch∆∞∆°ng n√†y gi·ªõi thi·ªáu c√°c k·ªπ thu·∫≠t kh·∫£o s√°t ƒë·∫∑c t·∫£ v√† m√£ ngu·ªìn. '
              'M√£ ngu·ªìn ƒë∆∞·ª£c ph√°t tri·ªÉn d·ª±a tr√™n ƒë·∫∑c t·∫£ v√† v√¨ th·∫ø vi·ªác kh·∫£o '
              's√°t ƒë·∫∑c t·∫£ c·∫ßn ƒë∆∞·ª£c ti·∫øn h√†nh tr∆∞·ªõc khi ph√°t tri·ªÉn m√£ ngu·ªìn ƒë·ªÉ '
              'tr√°nh c√°c r·ªßi ro v·ªÅ c√°c l·ªói c√≥ th·ªÉ c√≥ trong ƒë·∫∑c t·∫£ v·ªÅ s·∫£n ph·∫©m '
              'ph·∫ßn m·ªÅm. V√¨ ƒë·∫∑c t·∫£ kh√¥ng th·ªÉ th·ª±c thi ƒë∆∞·ª£c tr√™n m√°y n√™n ch√∫ng '
              'ta ch·ªâ c√≥ th·ªÉ ph√°t hi·ªán c√°c l·ªói b·∫±ng c√°c k·ªπ thu·∫≠t kh·∫£o s√°t ƒë·∫∑c '
              't·∫£. Tuy nhi√™n, m√£ ngu·ªìn th√¨ c√≥ th·ªÉ th·ª±c thi ƒë∆∞·ª£c v√† c√

In [17]:
from src.services.factories import make_text_chunker

# Split every stored document into chunks and collect them all in `chunks`,
# which later cells read.
#
# NOTE(review): the recorded run of this cell failed with
# `psycopg2.errors.UndefinedTable: relation "documents" does not exist`,
# i.e. the `documents` table had not been created in the database that `db`
# is connected to. Create the schema / run the ingestion step first.
chunker = make_text_chunker()

# Initialise before the loop so that `chunks` is always defined (even when the
# table is empty) and is not overwritten by each successive document.
chunks = []
for doc in db.query(Document).all():
    doc_data = {
        "title": doc.title,
        "full_text": doc.full_text,
        "sections": doc.sections or [],  # the column may be NULL; fall back to an empty list
    }
    doc_chunks = chunker.chunk_document(doc_data, doc.doc_id)
    chunks.extend(doc_chunks)
    print(f"Document {doc.doc_id} -> {len(doc_chunks)} chunks")

print(f"Total: {len(chunks)} chunks")


2025-10-31 00:48:35,775 INFO sqlalchemy.engine.Engine select pg_catalog.version()


2025-10-31 00:48:35,775 - INFO - select pg_catalog.version()


2025-10-31 00:48:35,776 INFO sqlalchemy.engine.Engine [raw sql] {}


2025-10-31 00:48:35,776 - INFO - [raw sql] {}


2025-10-31 00:48:35,778 INFO sqlalchemy.engine.Engine select current_schema()


2025-10-31 00:48:35,778 - INFO - select current_schema()


2025-10-31 00:48:35,779 INFO sqlalchemy.engine.Engine [raw sql] {}


2025-10-31 00:48:35,779 - INFO - [raw sql] {}


2025-10-31 00:48:35,783 INFO sqlalchemy.engine.Engine show standard_conforming_strings


2025-10-31 00:48:35,783 - INFO - show standard_conforming_strings


2025-10-31 00:48:35,784 INFO sqlalchemy.engine.Engine [raw sql] {}


2025-10-31 00:48:35,784 - INFO - [raw sql] {}


2025-10-31 00:48:35,785 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-31 00:48:35,785 - INFO - BEGIN (implicit)


2025-10-31 00:48:35,788 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents


2025-10-31 00:48:35,788 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents


2025-10-31 00:48:35,789 INFO sqlalchemy.engine.Engine [generated in 0.00083s] {}


2025-10-31 00:48:35,789 - INFO - [generated in 0.00083s] {}


ProgrammingError: (psycopg2.errors.UndefinedTable) relation "documents" does not exist
LINE 2: FROM documents
             ^

[SQL: SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
from src.services.factories import make_embeddings_service
import asyncio
from concurrent.futures import ThreadPoolExecutor


def _run_async(coro):
    """Run the coroutine ``coro`` to completion and return its result.

    ``asyncio.run`` raises ``RuntimeError`` when an event loop is already
    running in the current thread, which is the normal situation inside a
    Jupyter kernel. In that case the coroutine is driven by ``asyncio.run`` on
    a short-lived worker thread, which has no running loop of its own.
    Defined in this cell so the cell is self-contained.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread (e.g. executed as a plain script).
        return asyncio.run(coro)
    with ThreadPoolExecutor(max_workers=1) as pool:
        # .result() blocks until the coroutine finishes and re-raises its exception, if any.
        return pool.submit(asyncio.run, coro).result()


# Embed the text of every chunk produced by the chunking cell.
emb_service = make_embeddings_service()
texts = [chunk["chunk_text"] for chunk in chunks]
embeddings = _run_async(emb_service.embed_texts(texts))

# Guard the summary line: indexing embeddings[0] would fail when nothing was embedded.
if len(embeddings) > 0:
    print(f"Generated {len(embeddings)} embeddings, first vector len = {len(embeddings[0])}")
else:
    print("Generated 0 embeddings (no chunks to embed)")


In [None]:
from src.services.factories import make_opensearch_client
import asyncio
from concurrent.futures import ThreadPoolExecutor


def _run_async(coro):
    """Run the coroutine ``coro`` to completion and return its result.

    ``asyncio.run`` raises ``RuntimeError`` when an event loop is already
    running in the current thread, which is the normal situation inside a
    Jupyter kernel. In that case the coroutine is driven by ``asyncio.run`` on
    a short-lived worker thread, which has no running loop of its own.
    Defined in this cell so the cell is self-contained.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread (e.g. executed as a plain script).
        return asyncio.run(coro)
    with ThreadPoolExecutor(max_workers=1) as pool:
        # .result() blocks until the coroutine finishes and re-raises its exception, if any.
        return pool.submit(asyncio.run, coro).result()


# Create the index, sized to the configured embedding dimension.
opensearch = make_opensearch_client()
_run_async(opensearch.create_index(dimensions=settings.embeddings.dimensions))

# Smoke test: index a single chunk.
assert len(chunks) > 0 and len(embeddings) > 0, (
    "No chunks/embeddings available - run the chunking and embedding cells first"
)
doc_to_index = {
    "chunk_id": chunks[0]["chunk_id"],
    "document_id": "test_doc",  # fixed placeholder id used for this trial document
    "chunk_text": chunks[0]["chunk_text"],
    # NOTE(review): .tolist() assumes each embedding is array-like (e.g. numpy) - confirm.
    "embedding": embeddings[0].tolist(),
}
result = _run_async(opensearch.index_chunks_bulk([doc_to_index]))
print("Index result:", result)
