docker exec rag-mcq-airflow airflow dags unpause pdf_ingestion_dag

docker exec rag-mcq-airflow airflow dags trigger pdf_ingestion_dag

In [1]:
import os

# Ghi đè connection string để dùng localhost thay vì postgres
os.environ['POSTGRES_HOST'] = 'localhost'

In [16]:
!curl -X DELETE "http://localhost:9200/mcq-documents" -u admin:admin

{"acknowledged":true}

### Bước 0

In [17]:
from src.models.document import Base 
from src.db.session import engine

In [24]:
from sqlalchemy import text

with engine.connect() as conn:
    conn.execute(text("""
        DO
        $$
        DECLARE
            r RECORD;
        BEGIN
            FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
                EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(r.tablename) || ' CASCADE';
            END LOOP;
        END
        $$;
    """))
    conn.commit()

print("✅ Dropped all tables successfully — fresh start!")

2025-11-02 22:01:15,834 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-11-02 22:01:15,834 - INFO - BEGIN (implicit)


2025-11-02 22:01:15,834 INFO sqlalchemy.engine.Engine 
        DO
        $$
        DECLARE
            r RECORD;
        BEGIN
            FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
                EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(r.tablename) || ' CASCADE';
            END LOOP;
        END
        $$;
    


2025-11-02 22:01:15,834 - INFO - 
        DO
        $$
        DECLARE
            r RECORD;
        BEGIN
            FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
                EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(r.tablename) || ' CASCADE';
            END LOOP;
        END
        $$;
    


2025-11-02 22:01:15,835 INFO sqlalchemy.engine.Engine [cached since 3.989e+04s ago] {}


2025-11-02 22:01:15,835 - INFO - [cached since 3.989e+04s ago] {}


2025-11-02 22:01:15,837 INFO sqlalchemy.engine.Engine COMMIT


2025-11-02 22:01:15,837 - INFO - COMMIT


✅ Dropped all tables successfully — fresh start!


In [5]:
Base.metadata.create_all(bind=engine)

2025-11-02 10:56:23,334 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-11-02 10:56:23,336 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2025-11-02 10:56:23,337 INFO sqlalchemy.engine.Engine [generated in 0.00039s] {'table_name': 'documents', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2025-11-02 10:56:23,338 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg

### Bước 1

In [6]:
from pathlib import Path

In [7]:
API_BASE_URL = "http://localhost:8000"
AIRFLOW_URL = "http://localhost:8080"

# Host & port các service
POSTGRES_HOST = "localhost"
POSTGRES_PORT = 5432          # hoặc 5433 nếu muốn connect airflow-db
REDIS_HOST = "localhost"
REDIS_PORT = 6379
OPENSEARCH_HOST = "http://localhost:9200"
OLLAMA_HOST = "http://localhost:11434"

# Cấu hình RAG
TOP_K = 5
USE_HYBRID = True
TIMEOUT = 60

In [8]:
from src.config import get_settings

settings = get_settings()
pdf_dir = Path(settings.data.pdf_dir)

pdf_files = list(pdf_dir.glob("*.pdf"))
pdf_paths = [str(f) for f in pdf_files]

print(f"Found {len(pdf_paths)} PDF files:")
pdf_paths

Found 5 PDF files:


['data/pdf/Public_251.pdf',
 'data/pdf/Public_272.pdf',
 'data/pdf/Public_264.pdf',
 'data/pdf/Public_260.pdf',
 'data/pdf/Public_061.pdf']

### Bước 2 

In [9]:
def sanitize_metadata(metadata: dict):
    safe = {}
    for k, v in metadata.items():
        if callable(v):
            try:
                safe[k] = v()  # gọi method nếu cần
            except Exception:
                safe[k] = None
        else:
            safe[k] = v
    return safe

In [10]:
from src.db.session import get_db_context
from src.models.document import Document
from src.services.factories import make_pdf_parser
from pathlib import Path

def extract_pdfs_debug(pdf_paths: list):
    parser = make_pdf_parser()
    processed = 0
    errors = 0
    doc_ids = []

    with get_db_context() as db:
        for pdf_path in pdf_paths:
            doc_id = Path(pdf_path).stem
            try:
                print(f"\n➡ Processing PDF: {pdf_path} (doc_id={doc_id})")

                existing = db.query(Document).filter(Document.doc_id == doc_id).first()
                if existing:
                    print(f"⏩ Document {doc_id} already exists, skipping")
                    continue

                parsed = parser.parse_pdf(pdf_path)
                safe_metadata = sanitize_metadata(parsed["metadata"])

                document = Document(
                    doc_id=doc_id,
                    filename=safe_metadata.get("file_name", ""),
                    file_path=pdf_path,
                    title=parsed.get("title", ""),
                    full_text=parsed.get("full_text", ""),
                    raw_content=parsed.get("full_text", ""),
                    page_count=safe_metadata.get("page_count", 0),
                    sections=parsed.get("sections", {}),
                    tables=parsed.get("tables", {}),
                    doc_metadata=safe_metadata,
                    source_folder=Path(pdf_path).parent.name,
                    processing_status="completed",
                )

                db.add(document)
                # commit sẽ được handle bởi get_db_context()
                doc_ids.append(doc_id)
                processed += 1
                print(f"✅ Successfully processed: {doc_id}")

            except Exception as e:
                print(f"❌ Error processing {pdf_path}: {e}")
                errors += 1
                continue

    print(f"\nSummary: processed={processed}, errors={errors}, doc_ids={doc_ids}")
    return {"processed": processed, "errors": errors, "doc_ids": doc_ids}

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
extract_result = extract_pdfs_debug(pdf_paths)


➡ Processing PDF: data/pdf/Public_251.pdf (doc_id=Public_251)
2025-11-02 10:56:28,030 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-11-02 10:56:28,030 - INFO - BEGIN (implicit)


2025-11-02 10:56:28,031 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:28,031 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:28,032 INFO sqlalchemy.engine.Engine [generated in 0.00042s] {'doc_id_1': 'Public_251', 'param_1': 1}


2025-11-02 10:56:28,032 - INFO - [generated in 0.00042s] {'doc_id_1': 'Public_251', 'param_1': 1}
2025-11-02 10:56:28,034 - INFO - Parsing PDF: data/pdf/Public_251.pdf
2025-11-02 10:56:28,035 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-02 10:56:28,072 - INFO - Going to convert document batch...
2025-11-02 10:56:28,072 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 75463f421d05cb4304e1f714cf00d35d
2025-11-02 10:56:28,077 - INFO - Loading plugin 'docling_defaults'
2025-11-02 10:56:28,078 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-02 10:56:28,085 - INFO - Loading plugin 'docling_defaults'
2025-11-02 10:56:28,087 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-02 10:56:28,090 - INFO - Accelerator device: 'mps'
2025-11-02 10:56:30,344 - INFO - Accelerator device: 'mps'
2025-11-02 10:56:30,910 - INFO - Processing document Public_251.pdf
2025-11-02 10:56:33,286 - INFO

✅ Successfully processed: Public_251

➡ Processing PDF: data/pdf/Public_272.pdf (doc_id=Public_272)
2025-11-02 10:56:33,308 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents

2025-11-02 10:56:33,308 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:33,309 INFO sqlalchemy.engine.Engine [cached since 5.278s ago] {'doc_id_1': 'Public_272', 'param_1': 1}


2025-11-02 10:56:33,309 - INFO - [cached since 5.278s ago] {'doc_id_1': 'Public_272', 'param_1': 1}
2025-11-02 10:56:33,313 - INFO - Parsing PDF: data/pdf/Public_272.pdf
2025-11-02 10:56:33,315 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-02 10:56:33,317 - INFO - Going to convert document batch...
2025-11-02 10:56:33,317 - INFO - Processing document Public_272.pdf
2025-11-02 10:56:36,961 - INFO - Finished converting document Public_272.pdf in 3.65 sec.
2025-11-02 10:56:36,968 - INFO - Successfully parsed PDF: Public_272, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_272', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=2132697822789820610, filename='Public_272.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(s

✅ Successfully processed: Public_272

➡ Processing PDF: data/pdf/Public_264.pdf (doc_id=Public_264)
2025-11-02 10:56:36,969 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents

2025-11-02 10:56:36,969 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:36,970 INFO sqlalchemy.engine.Engine [cached since 8.939s ago] {'doc_id_1': 'Public_264', 'param_1': 1}


2025-11-02 10:56:36,970 - INFO - [cached since 8.939s ago] {'doc_id_1': 'Public_264', 'param_1': 1}
2025-11-02 10:56:36,974 - INFO - Parsing PDF: data/pdf/Public_264.pdf
2025-11-02 10:56:36,975 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-02 10:56:36,978 - INFO - Going to convert document batch...
2025-11-02 10:56:36,978 - INFO - Processing document Public_264.pdf
2025-11-02 10:56:38,770 - INFO - Finished converting document Public_264.pdf in 1.80 sec.
2025-11-02 10:56:38,782 - INFO - Successfully parsed PDF: Public_264, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_264', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=5119675035389586591, filename='Public_264.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(s

✅ Successfully processed: Public_264

➡ Processing PDF: data/pdf/Public_260.pdf (doc_id=Public_260)
2025-11-02 10:56:38,788 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents

2025-11-02 10:56:38,788 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:38,788 INFO sqlalchemy.engine.Engine [cached since 10.76s ago] {'doc_id_1': 'Public_260', 'param_1': 1}


2025-11-02 10:56:38,788 - INFO - [cached since 10.76s ago] {'doc_id_1': 'Public_260', 'param_1': 1}
2025-11-02 10:56:38,791 - INFO - Parsing PDF: data/pdf/Public_260.pdf
2025-11-02 10:56:38,792 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-02 10:56:38,795 - INFO - Going to convert document batch...
2025-11-02 10:56:38,795 - INFO - Processing document Public_260.pdf
2025-11-02 10:56:41,100 - INFO - Finished converting document Public_260.pdf in 2.31 sec.
2025-11-02 10:56:41,109 - INFO - Successfully parsed PDF: Public_260, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_260', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=11617410907304067118, filename='Public_260.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(

✅ Successfully processed: Public_260

➡ Processing PDF: data/pdf/Public_061.pdf (doc_id=Public_061)
2025-11-02 10:56:41,111 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents

2025-11-02 10:56:41,111 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:41,111 INFO sqlalchemy.engine.Engine [cached since 13.08s ago] {'doc_id_1': 'Public_061', 'param_1': 1}


2025-11-02 10:56:41,111 - INFO - [cached since 13.08s ago] {'doc_id_1': 'Public_061', 'param_1': 1}
2025-11-02 10:56:41,113 - INFO - Parsing PDF: data/pdf/Public_061.pdf
2025-11-02 10:56:41,114 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-02 10:56:41,116 - INFO - Going to convert document batch...
2025-11-02 10:56:41,117 - INFO - Processing document Public_061.pdf
2025-11-02 10:56:42,135 - INFO - Finished converting document Public_061.pdf in 1.02 sec.
2025-11-02 10:56:42,140 - INFO - Successfully parsed PDF: Public_061, pages: <bound method DoclingDocument.num_pages of DoclingDocument(schema_name='DoclingDocument', version='1.7.0', name='Public_061', origin=DocumentOrigin(mimetype='application/pdf', binary_hash=11381279440607448659, filename='Public_061.pdf', uri=None), furniture=GroupItem(self_ref='#/furniture', parent=None, children=[], content_layer=<ContentLayer.FURNITURE: 'furniture'>, name='_root_', label=<GroupLabel.UNSPECIFIED: 'unspecified'>), body=GroupItem(

✅ Successfully processed: Public_061
2025-11-02 10:56:42,146 INFO sqlalchemy.engine.Engine INSERT INTO documents (doc_id, filename, file_path, title, full_text, raw_content, page_count, file_size, sections, tables, doc_metadata, source_folder, processing_status, error_message, created_at, updated_at, processed_at) SELECT p0::VARCHAR, p1::V ... 2046 characters truncated ...  p13, p14, p15, p16, sen_counter) ORDER BY sen_counter RETURNING documents.id, documents.id AS id__1


2025-11-02 10:56:42,146 - INFO - INSERT INTO documents (doc_id, filename, file_path, title, full_text, raw_content, page_count, file_size, sections, tables, doc_metadata, source_folder, processing_status, error_message, created_at, updated_at, processed_at) SELECT p0::VARCHAR, p1::V ... 2046 characters truncated ...  p13, p14, p15, p16, sen_counter) ORDER BY sen_counter RETURNING documents.id, documents.id AS id__1


2025-11-02 10:56:42,147 INFO sqlalchemy.engine.Engine [generated in 0.00055s (insertmanyvalues) 1/1 (ordered)] {'doc_id__0': 'Public_251', 'filename__0': 'Public_251.pdf', 'tables__0': '[{"table_id": "table_0", "data": [["table_cells", "[TableCell(bbox=BoundingBox(l=86.904, t=317.3159999999999, r=131.914, b=328.217, coord_origin=<Coo ... (14285 characters truncated) ... 01b0\\u1ee3ng\', column_header=False, row_header=False, row_section=False, fillable=False)]"], ["num_rows", "3"], ["num_cols", "6"]], "caption": ""}]', 'raw_content__0': '<!-- image -->\n\n## VIETTEL AI RACE\n\n## ACK Flood Attack là gì? Điểm khác biệt gì so với các loại tấn công DDoS khác?\n\nPublic 251\n\nL ầ n ban h ... (9400 characters truncated) ... c, doanh nghiệp chủ động hơn trong việc xây dựng các giải pháp phòng chống hiệu quả, bảo vệ hệ thống mạng trước các mối đe dọa ngày càng tinh vi này.', 'source_folder__0': 'pdf', 'file_size__0': 0, 'page_count__0': 5, 'file_path__0': 'data/pdf/Public_251.pdf', 'updated_

2025-11-02 10:56:42,147 - INFO - [generated in 0.00055s (insertmanyvalues) 1/1 (ordered)] {'doc_id__0': 'Public_251', 'filename__0': 'Public_251.pdf', 'tables__0': '[{"table_id": "table_0", "data": [["table_cells", "[TableCell(bbox=BoundingBox(l=86.904, t=317.3159999999999, r=131.914, b=328.217, coord_origin=<Coo ... (14285 characters truncated) ... 01b0\\u1ee3ng\', column_header=False, row_header=False, row_section=False, fillable=False)]"], ["num_rows", "3"], ["num_cols", "6"]], "caption": ""}]', 'raw_content__0': '<!-- image -->\n\n## VIETTEL AI RACE\n\n## ACK Flood Attack là gì? Điểm khác biệt gì so với các loại tấn công DDoS khác?\n\nPublic 251\n\nL ầ n ban h ... (9400 characters truncated) ... c, doanh nghiệp chủ động hơn trong việc xây dựng các giải pháp phòng chống hiệu quả, bảo vệ hệ thống mạng trước các mối đe dọa ngày càng tinh vi này.', 'source_folder__0': 'pdf', 'file_size__0': 0, 'page_count__0': 5, 'file_path__0': 'data/pdf/Public_251.pdf', 'updated_at__0': datetime.date

2025-11-02 10:56:42,182 INFO sqlalchemy.engine.Engine COMMIT


2025-11-02 10:56:42,182 - INFO - COMMIT



Summary: processed=5, errors=0, doc_ids=['Public_251', 'Public_272', 'Public_264', 'Public_260', 'Public_061']


### Bước 3

In [12]:
from src.db.session import get_db_context
from src.models.document import Document, DocumentChunk
from src.services.factories import make_text_chunker

def chunk_documents_debug(doc_ids: list = None):
    """
    Chunk documents from DB.
    
    Args:
        doc_ids: list of doc_id to process; if None, will process all documents in DB
    """
    chunker = make_text_chunker()
    total_chunks = 0
    processed = 0
    chunk_ids = []

    with get_db_context() as db:
        # Nếu không truyền doc_ids, lấy tất cả doc_id trong DB
        if not doc_ids:
            doc_ids = [doc.doc_id for doc in db.query(Document).all()]
            print(f"ℹ Found {len(doc_ids)} documents in DB to chunk")

        for doc_id in doc_ids:
            try:
                document = db.query(Document).filter(Document.doc_id == doc_id).first()
                if not document:
                    print(f"⏩ Document {doc_id} not found, skipping")
                    continue

                doc_data = {
                    "title": document.title,
                    "full_text": document.full_text,
                    "sections": document.sections or [],
                }

                chunks = chunker.chunk_document(doc_data, doc_id)

                if not chunks:
                    print(f"⚠ No chunks created for document {doc_id}")
                    continue

                for chunk in chunks:
                    db_chunk = DocumentChunk(
                        chunk_id=chunk["chunk_id"],
                        document_id=document.id,
                        document_file_name=document.doc_id, 
                        document_title=document.title,
                        chunk_text=chunk["chunk_text"],
                        chunk_index=chunk["chunk_index"],
                        section_name=chunk.get("section_name"),
                        chunk_type=chunk.get("chunk_type", "text"),
                        word_count=chunk.get("word_count", 0),
                        char_count=chunk.get("char_count", 0),
                        chunk_metadata=chunk,
                        embedding_status="pending",
                        indexed_in_opensearch="pending",
                    )
                    db.add(db_chunk)
                    chunk_ids.append(chunk["chunk_id"])

                processed += 1
                total_chunks += len(chunks)
                print(f"✅ Chunked document {doc_id}: {len(chunks)} chunks")

            except Exception as e:
                print(f"❌ Error chunking document {doc_id}: {e}")
                continue

    print(f"\nSummary: processed={processed}, chunks_created={total_chunks}, chunk_ids={len(chunk_ids)}")
    return {"processed": processed, "chunks_created": total_chunks, "chunk_ids": chunk_ids}

In [13]:
chunk_result = chunk_documents_debug()
print(chunk_result)

2025-11-02 10:56:42,213 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-11-02 10:56:42,213 - INFO - BEGIN (implicit)


2025-11-02 10:56:42,213 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents


2025-11-02 10:56:42,213 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents


2025-11-02 10:56:42,214 INFO sqlalchemy.engine.Engine [generated in 0.00045s] {}


2025-11-02 10:56:42,214 - INFO - [generated in 0.00045s] {}


ℹ Found 5 documents in DB to chunk
2025-11-02 10:56:42,218 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,218 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,219 INFO sqlalchemy.engine.Engine [cached since 14.19s ago] {'doc_id_1': 'Public_251', 'param_1': 1}


2025-11-02 10:56:42,219 - INFO - [cached since 14.19s ago] {'doc_id_1': 'Public_251', 'param_1': 1}
2025-11-02 10:56:42,221 - INFO - Document Public_251 chunked into 12 chunks from 19 sections


✅ Chunked document Public_251: 12 chunks
2025-11-02 10:56:42,221 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,221 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,222 INFO sqlalchemy.engine.Engine [cached since 14.19s ago] {'doc_id_1': 'Public_272', 'param_1': 1}


2025-11-02 10:56:42,222 - INFO - [cached since 14.19s ago] {'doc_id_1': 'Public_272', 'param_1': 1}
2025-11-02 10:56:42,223 - INFO - Document Public_272 chunked into 7 chunks from 10 sections


✅ Chunked document Public_272: 7 chunks
2025-11-02 10:56:42,224 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,224 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,224 INFO sqlalchemy.engine.Engine [cached since 14.19s ago] {'doc_id_1': 'Public_264', 'param_1': 1}


2025-11-02 10:56:42,224 - INFO - [cached since 14.19s ago] {'doc_id_1': 'Public_264', 'param_1': 1}
2025-11-02 10:56:42,226 - INFO - Document Public_264 chunked into 23 chunks from 28 sections


✅ Chunked document Public_264: 23 chunks
2025-11-02 10:56:42,226 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,226 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,227 INFO sqlalchemy.engine.Engine [cached since 14.2s ago] {'doc_id_1': 'Public_260', 'param_1': 1}


2025-11-02 10:56:42,227 - INFO - [cached since 14.2s ago] {'doc_id_1': 'Public_260', 'param_1': 1}
2025-11-02 10:56:42,229 - INFO - Document Public_260 chunked into 18 chunks from 29 sections


✅ Chunked document Public_260: 18 chunks
2025-11-02 10:56:42,229 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,229 - INFO - SELECT documents.id AS documents_id, documents.doc_id AS documents_doc_id, documents.filename AS documents_filename, documents.file_path AS documents_file_path, documents.title AS documents_title, documents.full_text AS documents_full_text, documents.raw_content AS documents_raw_content, documents.page_count AS documents_page_count, documents.file_size AS documents_file_size, documents.sections AS documents_sections, documents.tables AS documents_tables, documents.doc_metadata AS documents_doc_metadata, documents.source_folder AS documents_source_folder, documents.processing_status AS documents_processing_status, documents.error_message AS documents_error_message, documents.created_at AS documents_created_at, documents.updated_at AS documents_updated_at, documents.processed_at AS documents_processed_at 
FROM documents 
WHERE documents.doc_id = %(doc_id_1)s 
 LIMIT %(param_1)s


2025-11-02 10:56:42,230 INFO sqlalchemy.engine.Engine [cached since 14.2s ago] {'doc_id_1': 'Public_061', 'param_1': 1}


2025-11-02 10:56:42,230 - INFO - [cached since 14.2s ago] {'doc_id_1': 'Public_061', 'param_1': 1}
2025-11-02 10:56:42,233 - INFO - Document Public_061 chunked into 15 chunks from 21 sections


✅ Chunked document Public_061: 15 chunks
2025-11-02 10:56:42,236 INFO sqlalchemy.engine.Engine INSERT INTO document_chunks (chunk_id, document_id, document_file_name, document_title, chunk_text, chunk_index, section_name, chunk_type, word_count, char_count, start_char_index, end_char_index, embedding_status, indexed_in_opensearch, chunk_metada ... 29978 characters truncated ... 15, p16, sen_counter) ORDER BY sen_counter RETURNING document_chunks.id, document_chunks.id AS id__1


2025-11-02 10:56:42,236 - INFO - INSERT INTO document_chunks (chunk_id, document_id, document_file_name, document_title, chunk_text, chunk_index, section_name, chunk_type, word_count, char_count, start_char_index, end_char_index, embedding_status, indexed_in_opensearch, chunk_metada ... 29978 characters truncated ... 15, p16, sen_counter) ORDER BY sen_counter RETURNING document_chunks.id, document_chunks.id AS id__1


2025-11-02 10:56:42,236 INFO sqlalchemy.engine.Engine [generated in 0.00064s (insertmanyvalues) 1/1 (ordered)] {'indexed_in_opensearch__0': 'pending', 'embedding_status__0': 'pending', 'start_char_index__0': None, 'chunk_metadata__0': '{"chunk_text": "# Public_251\\n\\n## ACK Flood Attack l\\u00e0 g\\u00ec? \\u0110i\\u1ec3m kh\\u00e1c bi\\u1ec7t g\\u00ec so v\\u1edbi c\\u00e1c lo\\u ... (957 characters truncated) ... ord_count": 81, "chunk_type": "small_section", "chunk_id": "Public_251_chunk_0000", "chunk_index": 0, "document_id": "Public_251", "char_count": 476}', 'document_file_name__0': 'Public_251', 'chunk_text__0': '# Public_251\n\n## ACK Flood Attack là gì? Điểm khác biệt gì so với các loại tấn công DDoS khác?\n\nPublic 251\n\nL ầ n ban hành: 1\n\nCác cuộc tấn c ... (188 characters truncated) ... ông DDoS đặc biệt nguy hiểm là tấn công ACK Flood. Vậy ACK Flood Attack là gì và điểm khác biệt của nó so với các loại tấn công DDoS khác ra sao?\n\n', 'section_name__0': 'ACK Flood Att

2025-11-02 10:56:42,236 - INFO - [generated in 0.00064s (insertmanyvalues) 1/1 (ordered)] {'indexed_in_opensearch__0': 'pending', 'embedding_status__0': 'pending', 'start_char_index__0': None, 'chunk_metadata__0': '{"chunk_text": "# Public_251\\n\\n## ACK Flood Attack l\\u00e0 g\\u00ec? \\u0110i\\u1ec3m kh\\u00e1c bi\\u1ec7t g\\u00ec so v\\u1edbi c\\u00e1c lo\\u ... (957 characters truncated) ... ord_count": 81, "chunk_type": "small_section", "chunk_id": "Public_251_chunk_0000", "chunk_index": 0, "document_id": "Public_251", "char_count": 476}', 'document_file_name__0': 'Public_251', 'chunk_text__0': '# Public_251\n\n## ACK Flood Attack là gì? Điểm khác biệt gì so với các loại tấn công DDoS khác?\n\nPublic 251\n\nL ầ n ban hành: 1\n\nCác cuộc tấn c ... (188 characters truncated) ... ông DDoS đặc biệt nguy hiểm là tấn công ACK Flood. Vậy ACK Flood Attack là gì và điểm khác biệt của nó so với các loại tấn công DDoS khác ra sao?\n\n', 'section_name__0': 'ACK Flood Attack là gì? Điểm khác 

2025-11-02 10:56:42,260 INFO sqlalchemy.engine.Engine COMMIT


2025-11-02 10:56:42,260 - INFO - COMMIT



Summary: processed=5, chunks_created=75, chunk_ids=75
{'processed': 5, 'chunks_created': 75, 'chunk_ids': ['Public_251_chunk_0000', 'Public_251_chunk_0001', 'Public_251_chunk_0002', 'Public_251_chunk_0003', 'Public_251_chunk_0004', 'Public_251_chunk_0005', 'Public_251_chunk_0006', 'Public_251_chunk_0007', 'Public_251_chunk_0008', 'Public_251_chunk_0009', 'Public_251_chunk_0010', 'Public_251_chunk_0011', 'Public_272_chunk_0000', 'Public_272_chunk_0001', 'Public_272_chunk_0002', 'Public_272_chunk_0003', 'Public_272_chunk_0004', 'Public_272_chunk_0005', 'Public_272_chunk_0006', 'Public_264_chunk_0000', 'Public_264_chunk_0001', 'Public_264_chunk_0002', 'Public_264_chunk_0003', 'Public_264_chunk_0004', 'Public_264_chunk_0005', 'Public_264_chunk_0006', 'Public_264_chunk_0007', 'Public_264_chunk_0008', 'Public_264_chunk_0009', 'Public_264_chunk_0010', 'Public_264_chunk_0011', 'Public_264_chunk_0012', 'Public_264_chunk_0013', 'Public_264_chunk_0014', 'Public_264_chunk_0015', 'Public_264_chunk

### Bước 4

### Bước 5

In [None]:
import os
from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk
from src.services.factories import make_embeddings_service

# --- Lấy thông số từ môi trường ---
OPENSEARCH_URL = os.environ.get("OPENSEARCH_URL", "http://localhost:9200")
OPENSEARCH_INDEX = os.environ.get("OPENSEARCH_INDEX", "mcq-documents")
EMBEDDING_DIM = int(os.environ.get("EMBEDDING_DIM", 768))

# --- Tạo client OpenSearch ---
client = OpenSearch(OPENSEARCH_URL)

# --- Xóa index cũ nếu có ---
client.delete_index(index=OPENSEARCH_INDEX, ignore=[400, 404])

# --- Tạo index mới với mapping đúng cho OpenSearch k-NN ---
client.create_index(
    index=OPENSEARCH_INDEX,
    body={
        "settings": {
            "index": {
                "knn": True,  # Bật k-NN plugin
                "knn.algo_param.ef_search": 100
            }
        },
        "mappings": {
            "properties": {
                "chunk_id": {"type": "keyword"},
                "document_id": {"type": "keyword"},
                "chunk_text": {"type": "text"},
                "embedding": {
                    "type": "knn_vector",  # Đổi từ dense_vector sang knn_vector
                    "dimension": EMBEDDING_DIM,  # Đổi từ dims sang dimension
                    "method": {
                        "name": "hnsw",
                        "space_type": "l2",
                        "engine": "nmslib",
                        "parameters": {
                            "ef_construction": 128,
                            "m": 24
                        }
                    }
                },
                "chunk_index": {"type": "integer"},
                "section_name": {"type": "keyword"},
                "chunk_type": {"type": "keyword"},
            }
        }
    }
)

print(f"✅ Index '{OPENSEARCH_INDEX}' created successfully!")

2025-11-02 10:57:46,627 - INFO - DELETE http://localhost:9200/mcq-documents [status:404 request:0.018s]
2025-11-02 10:57:46,801 - INFO - PUT http://localhost:9200/mcq-documents [status:200 request:0.172s]


✅ Index 'mcq-documents' created successfully!


In [15]:
# --- Hàm generate embeddings và bulk index ---
async def generate_embeddings_and_index(chunk_ids: list):
    embeddings_service = make_embeddings_service()

    with get_db_context() as db:
        chunks = db.query(DocumentChunk).filter(DocumentChunk.chunk_id.in_(chunk_ids)).all()
        if not chunks:
            print("No chunks found")
            return {"processed": 0, "indexed": 0}

        texts = [chunk.chunk_text for chunk in chunks]
        embeddings = await embeddings_service.embed_texts(texts)

        bulk_ops = []
        for chunk, embedding in zip(chunks, embeddings):
            chunk.embedding_status = "completed"
            chunk.embedding = embedding.tolist()
            chunk.chunk_metadata = chunk.chunk_metadata or {}
            chunk.chunk_metadata["embedding"] = embedding.tolist()

            # --- Chuẩn bị dữ liệu index lên OpenSearch ---
            bulk_ops.append({
                "_op_type": "index",
                "_index": OPENSEARCH_INDEX,
                "_id": chunk.chunk_id,
                "_source": {
                    "chunk_id": chunk.chunk_id,
                    "document_id": chunk.document_id,
                    "document_file_name": chunk.document_file_name,
                    "document_title": chunk.document_title,
                    "chunk_index": chunk.chunk_index,
                    "chunk_text": chunk.chunk_text,
                    "section_name": chunk.section_name,
                    "chunk_type": chunk.chunk_type,
                    "embedding": chunk.embedding
                }
            })

        db.commit()  # commit embedding vào DB

        success, _ = bulk(client, bulk_ops, refresh=True)
        print(f"✅ Indexed {success}/{len(chunks)} chunks")

    return {"processed": len(chunks), "indexed": success}

In [16]:
embed_index_result = await generate_embeddings_and_index(chunk_result["chunk_ids"])
print(embed_index_result)

2025-11-01 00:22:12,499 - INFO - Loading embeddings model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
2025-11-01 00:22:12,508 - INFO - Use pytorch device_name: mps
2025-11-01 00:22:12,509 - INFO - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
2025-11-01 00:22:19,971 - INFO - Embeddings model loaded: sentence-transformers/paraphrase-multilingual-mpnet-base-v2, dimensions: 768, device: cpu


2025-11-01 00:22:20,036 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-11-01 00:22:20,036 - INFO - BEGIN (implicit)


2025-11-01 00:22:20,041 INFO sqlalchemy.engine.Engine SELECT document_chunks.id AS document_chunks_id, document_chunks.chunk_id AS document_chunks_chunk_id, document_chunks.document_id AS document_chunks_document_id, document_chunks.chunk_text AS document_chunks_chunk_text, document_chunks.chunk_index AS document_chunks_chunk_index, document_chunks.section_name AS document_chunks_section_name, document_chunks.chunk_type AS document_chunks_chunk_type, document_chunks.word_count AS document_chunks_word_count, document_chunks.char_count AS document_chunks_char_count, document_chunks.start_char_index AS document_chunks_start_char_index, document_chunks.end_char_index AS document_chunks_end_char_index, document_chunks.embedding_status AS document_chunks_embedding_status, document_chunks.indexed_in_opensearch AS document_chunks_indexed_in_opensearch, document_chunks.chunk_metadata AS document_chunks_chunk_metadata, document_chunks.created_at AS document_chunks_created_at, document_chunks.upd

2025-11-01 00:22:20,041 - INFO - SELECT document_chunks.id AS document_chunks_id, document_chunks.chunk_id AS document_chunks_chunk_id, document_chunks.document_id AS document_chunks_document_id, document_chunks.chunk_text AS document_chunks_chunk_text, document_chunks.chunk_index AS document_chunks_chunk_index, document_chunks.section_name AS document_chunks_section_name, document_chunks.chunk_type AS document_chunks_chunk_type, document_chunks.word_count AS document_chunks_word_count, document_chunks.char_count AS document_chunks_char_count, document_chunks.start_char_index AS document_chunks_start_char_index, document_chunks.end_char_index AS document_chunks_end_char_index, document_chunks.embedding_status AS document_chunks_embedding_status, document_chunks.indexed_in_opensearch AS document_chunks_indexed_in_opensearch, document_chunks.chunk_metadata AS document_chunks_chunk_metadata, document_chunks.created_at AS document_chunks_created_at, document_chunks.updated_at AS document_c

2025-11-01 00:22:20,042 INFO sqlalchemy.engine.Engine [generated in 0.00137s] {'chunk_id_1_1': 'Public_251_chunk_0000', 'chunk_id_1_2': 'Public_251_chunk_0001', 'chunk_id_1_3': 'Public_251_chunk_0002', 'chunk_id_1_4': 'Public_251_chunk_0003', 'chunk_id_1_5': 'Public_251_chunk_0004', 'chunk_id_1_6': 'Public_251_chunk_0005', 'chunk_id_1_7': 'Public_251_chunk_0006', 'chunk_id_1_8': 'Public_251_chunk_0007', 'chunk_id_1_9': 'Public_251_chunk_0008', 'chunk_id_1_10': 'Public_251_chunk_0009', 'chunk_id_1_11': 'Public_251_chunk_0010', 'chunk_id_1_12': 'Public_251_chunk_0011', 'chunk_id_1_13': 'Public_272_chunk_0000', 'chunk_id_1_14': 'Public_272_chunk_0001', 'chunk_id_1_15': 'Public_272_chunk_0002', 'chunk_id_1_16': 'Public_272_chunk_0003', 'chunk_id_1_17': 'Public_272_chunk_0004', 'chunk_id_1_18': 'Public_272_chunk_0005', 'chunk_id_1_19': 'Public_272_chunk_0006', 'chunk_id_1_20': 'Public_264_chunk_0000', 'chunk_id_1_21': 'Public_264_chunk_0001', 'chunk_id_1_22': 'Public_264_chunk_0002', 'chunk

2025-11-01 00:22:20,042 - INFO - [generated in 0.00137s] {'chunk_id_1_1': 'Public_251_chunk_0000', 'chunk_id_1_2': 'Public_251_chunk_0001', 'chunk_id_1_3': 'Public_251_chunk_0002', 'chunk_id_1_4': 'Public_251_chunk_0003', 'chunk_id_1_5': 'Public_251_chunk_0004', 'chunk_id_1_6': 'Public_251_chunk_0005', 'chunk_id_1_7': 'Public_251_chunk_0006', 'chunk_id_1_8': 'Public_251_chunk_0007', 'chunk_id_1_9': 'Public_251_chunk_0008', 'chunk_id_1_10': 'Public_251_chunk_0009', 'chunk_id_1_11': 'Public_251_chunk_0010', 'chunk_id_1_12': 'Public_251_chunk_0011', 'chunk_id_1_13': 'Public_272_chunk_0000', 'chunk_id_1_14': 'Public_272_chunk_0001', 'chunk_id_1_15': 'Public_272_chunk_0002', 'chunk_id_1_16': 'Public_272_chunk_0003', 'chunk_id_1_17': 'Public_272_chunk_0004', 'chunk_id_1_18': 'Public_272_chunk_0005', 'chunk_id_1_19': 'Public_272_chunk_0006', 'chunk_id_1_20': 'Public_264_chunk_0000', 'chunk_id_1_21': 'Public_264_chunk_0001', 'chunk_id_1_22': 'Public_264_chunk_0002', 'chunk_id_1_23': 'Public_26

2025-11-01 00:22:24,887 INFO sqlalchemy.engine.Engine UPDATE document_chunks SET embedding_status=%(embedding_status)s, updated_at=%(updated_at)s WHERE document_chunks.id = %(document_chunks_id)s


2025-11-01 00:22:24,887 - INFO - UPDATE document_chunks SET embedding_status=%(embedding_status)s, updated_at=%(updated_at)s WHERE document_chunks.id = %(document_chunks_id)s


2025-11-01 00:22:24,889 INFO sqlalchemy.engine.Engine [generated in 0.00220s] [{'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887818), 'document_chunks_id': 1}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887839), 'document_chunks_id': 2}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887839), 'document_chunks_id': 3}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 4}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 5}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 6}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 7}, {'embedding_status': 'completed'

2025-11-01 00:22:24,889 - INFO - [generated in 0.00220s] [{'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887818), 'document_chunks_id': 1}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887839), 'document_chunks_id': 2}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887839), 'document_chunks_id': 3}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 4}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 5}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 6}, {'embedding_status': 'completed', 'updated_at': datetime.datetime(2025, 10, 31, 17, 22, 24, 887840), 'document_chunks_id': 7}, {'embedding_status': 'completed', 'updated_at': datet

2025-11-01 00:22:24,949 INFO sqlalchemy.engine.Engine COMMIT


2025-11-01 00:22:24,949 - INFO - COMMIT
2025-11-01 00:22:25,896 - INFO - POST http://localhost:9200/_bulk?refresh=true [status:200 request:0.912s]


✅ Indexed 75/75 chunks
{'processed': 75, 'indexed': 75}


### Bước 6

In [17]:
import requests
import pandas as pd
import time
import json
from pathlib import Path
from typing import Dict, Any, List
from datetime import datetime
import socket
from urllib.parse import urlparse

In [17]:
# Đường dẫn files
DATA_DIR = Path("data")
PDF_DIR = DATA_DIR / "pdf"
QUESTION_FILE = DATA_DIR / "q.csv"
OUTPUT_FILE = DATA_DIR / "answers_output.csv"

In [None]:
def print_header(text: str):
    print("\n" + "=" * 80)
    print(f"  {text}")
    print("=" * 80)

def print_step(step_num: int, text: str):
    print(f"\n{'─' * 80}")
    print(f"📌 BƯỚC {step_num}: {text}")
    print(f"{'─' * 80}")
    
def print_step(step_num: int, text: str):
    print(f"\n{'─' * 80}")
    print(f"📌 BƯỚC {step_num}: {text}")
    print(f"{'─' * 80}")

In [16]:
def check_indexed_documents() -> int:
    """Kiểm tra số lượng documents đã được index."""
    print_step(3, "Kiểm tra documents đã index")

    try:
        opensearch_auth = ("admin", "admin")
        response = requests.get(
            "http://localhost:9200/mcq-documents/_count",
            auth=opensearch_auth,
            verify=False,
            timeout=10,
        )

        if response.status_code == 200:
            count = response.json().get("count", 0)
            print(f"\n📄 Số lượng document chunks: {count}")

            if count > 0:
                print("✅ Hệ thống đã sẵn sàng để trả lời câu hỏi!")
            else:
                print("⚠️  Chưa có documents nào. Vui lòng chạy Airflow DAG trước!")

            return count
        else:
            print(f"❌ Lỗi khi query OpenSearch: {response.status_code}")
            return 0

    except Exception as e:
        print(f"❌ Không thể kết nối OpenSearch: {e}")
        return 0

In [None]:
def load_questions() -> pd.DataFrame:
    """Đọc câu hỏi từ CSV file."""
    print_step(4, "Đọc câu hỏi từ CSV")

    if not QUESTION_FILE.exists():
        print(f"\n❌ File không tồn tại: {QUESTION_FILE}")
        print("Vui lòng tạo file question.csv với format:")
        print("  Question,A,B,C,D,source_folder")
        return None

    try:
        df = pd.read_csv(QUESTION_FILE)
        print(f"\n✅ Đọc thành công {len(df)} câu hỏi")

        # Kiểm tra columns
        required_cols = ["Question", "A", "B", "C", "D"]
        missing_cols = [col for col in required_cols if col not in df.columns]

        if missing_cols:
            print(f"❌ Thiếu columns: {missing_cols}")
            return None

        # Hiển thị sample
        print("\n📝 Ví dụ câu hỏi đầu tiên:")
        print(f"   Q: {df.iloc[0]['Question'][:100]}...")

        return df

    except Exception as e:
        print(f"❌ Lỗi khi đọc CSV: {e}")
        return None

In [18]:
def ask_single_question(
    question: str, options: Dict[str, str], source_folder: str = None
) -> Dict[str, Any]:
    """Trả lời một câu hỏi MCQ."""

    payload = {"question": question, "options": options, "top_k": TOP_K, "use_hybrid": USE_HYBRID}

    if source_folder and pd.notna(source_folder):
        payload["source_folder"] = source_folder

    try:
        response = requests.post(f"{API_BASE_URL}/api/v1/ask", json=payload, timeout=TIMEOUT)
        response.raise_for_status()
        return response.json()

    except requests.exceptions.Timeout:
        print(f"    ⏱️  Timeout (>{TIMEOUT}s)")
        return {"error": "timeout", "predicted_option": None}

    except Exception as e:
        print(f"    ❌ Lỗi: {str(e)[:100]}")
        return {"error": str(e), "predicted_option": None}

In [None]:
def answer_all_questions(df: pd.DataFrame) -> pd.DataFrame:
    """Trả lời tất cả câu hỏi trong dataframe."""
    print_step(5, "Trả lời câu hỏi")

    results = []
    total = len(df)

    print(f"\n🎯 Bắt đầu trả lời {total} câu hỏi...\n")

    start_time = time.time()

    for idx, row in df.iterrows():
        q_num = idx + 1

        # Progress
        progress = (q_num / total) * 100
        print(f"[{q_num}/{total}] ({progress:.1f}%) ", end="")
        print(f"Q: {row['Question'][:60]}...")

        # Chuẩn bị options
        options = {"A": str(row["A"]), "B": str(row["B"]), "C": str(row["C"]), "D": str(row["D"])}

        # Lấy source_folder nếu có
        source_folder = row.get("source_folder", None)

        # Gọi API
        q_start = time.time()
        answer_data = ask_single_question(
            question=row["Question"], options=options, source_folder=source_folder
        )
        q_time = time.time() - q_start

        # Parse kết quả
        predicted = answer_data.get("predicted_option", "N/A")
        confidence = answer_data.get("confidence", "unknown")
        reasoning = answer_data.get("reasoning", "")
        error = answer_data.get("error", None)

        # Hiển thị kết quả
        conf_icon = {"high": "🟢", "medium": "🟡", "low": "🔴"}.get(confidence, "⚪")

        print(f"    → Đáp án: {predicted} {conf_icon} ({q_time:.1f}s)")

        # Lưu kết quả
        result = {
            "question_number": q_num,
            "question": row["Question"],
            "option_A": row["A"],
            "option_B": row["B"],
            "option_C": row["C"],
            "option_D": row["D"],
            "source_folder": source_folder if pd.notna(source_folder) else "",
            "predicted_answer": predicted,
            "confidence": confidence,
            "reasoning": reasoning[:500] if reasoning else "",  # Giới hạn độ dài
            "processing_time_seconds": round(q_time, 2),
            "error": error if error else "",
            "timestamp": datetime.now().isoformat(),
        }

        # Thêm timing details nếu có
        if "timing" in answer_data:
            timing = answer_data["timing"]
            result["retrieval_time_ms"] = timing.get("retrieval_ms", 0)
            result["generation_time_ms"] = timing.get("generation_ms", 0)

        results.append(result)

        # Ngắt dòng sau mỗi câu
        if q_num % 5 == 0:
            print()

    total_time = time.time() - start_time

    print("\n" + "─" * 80)
    print(f"✅ Hoàn thành! Tổng thời gian: {total_time:.1f}s")
    print(f"⏱️  Trung bình: {total_time / total:.1f}s/câu")
    print("─" * 80)

    return pd.DataFrame(results)

In [None]:
def save_results(results_df: pd.DataFrame):
    """Lưu kết quả ra CSV file."""
    print_step(6, "Lưu kết quả")

    try:
        results_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")

        print(f"\n✅ Đã lưu kết quả vào: {OUTPUT_FILE}")
        print(f"📊 Tổng số câu: {len(results_df)}")

        # Thống kê
        conf_counts = results_df["confidence"].value_counts()
        print("\n📈 Thống kê độ tin cậy:")
        for conf, count in conf_counts.items():
            percentage = (count / len(results_df)) * 100
            print(f"  • {conf}: {count} câu ({percentage:.1f}%)")

        # Kiểm tra errors
        errors = results_df[results_df["error"] != ""]
        if len(errors) > 0:
            print(f"\n⚠️  Có {len(errors)} câu bị lỗi:")
            for idx, row in errors.iterrows():
                print(f"  • Câu {row['question_number']}: {row['error'][:50]}")

    except Exception as e:
        print(f"❌ Lỗi khi lưu file: {e}")

In [None]:
def display_summary(results_df: pd.DataFrame):
    """Hiển thị tóm tắt kết quả."""
    print_header("📊 TÓM TẮT KẾT QUẢ")

    total = len(results_df)
    successful = len(results_df[results_df["error"] == ""])

    print(f"\n✅ Tổng số câu: {total}")
    print(f"✅ Trả lời thành công: {successful}")
    print(f"❌ Lỗi: {total - successful}")

    # Top 5 câu có confidence cao
    high_conf = results_df[results_df["confidence"] == "high"].head(5)
    if len(high_conf) > 0:
        print(f"\n🟢 Top {len(high_conf)} câu có độ tin cậy cao:")
        for idx, row in high_conf.iterrows():
            print(
                f"  {row['question_number']}. Đáp án {row['predicted_answer']}: {row['question'][:60]}..."
            )

    # Câu có confidence thấp
    low_conf = results_df[results_df["confidence"] == "low"]
    if len(low_conf) > 0:
        print(f"\n🔴 {len(low_conf)} câu có độ tin cậy thấp:")
        for idx, row in low_conf.head(3).iterrows():
            print(f"  {row['question_number']}. {row['question'][:60]}...")

    print("\n" + "=" * 80)

In [None]:
def main():
    """Hàm chính để chạy toàn bộ pipeline."""

    print_header("🚀 RAG MCQ SYSTEM - AUTOMATIC ANSWERING")
    print(f"\n📅 Bắt đầu: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Step 3: Check indexed documents
    doc_count = check_indexed_documents()
    if doc_count == 0:
        print("\n❌ Không có documents nào trong hệ thống!")
        print("Không thể trả lời câu hỏi. Vui lòng xử lý PDF trước!")
        return

    # Step 4: Load questions
    questions_df = load_questions()
    if questions_df is None:
        return

    # Step 5: Answer all questions
    results_df = answer_all_questions(questions_df)

    # Step 6: Save results
    save_results(results_df)

    # Display summary
    display_summary(results_df)

    print_header("✅ HOÀN TẤT!")
    print(f"\n📁 Kết quả đã được lưu tại: {OUTPUT_FILE.absolute()}")
    print(f"📅 Kết thúc: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("\n" + "=" * 80 + "\n")

In [None]:
# if __name__ == "__main__":
#     try:
#         main()
#     except KeyboardInterrupt:
#         print("\n\n⚠️  Đã dừng bởi người dùng (Ctrl+C)")
#     except Exception as e:
#         print(f"\n\n❌ Lỗi không mong đợi: {e}")
#         import traceback

#         traceback.print_exc()

Format câu hỏi: 1 câu hỏi + 4 options: search context
1. Tạo 5 search param cho mỗi câu: 1 question + 4 answer => tổng hợp lại mọi context liên quan => llm
2. Gộp question + options => 1 câu duy nhất => search

In [None]:
"Mục đích của từ ""luôn luôn"" và ""nào đó"", theo danh sách thẩm định thuật ngữ trong tài liệu Public_061, là hàm ý tính không mơ hồ và tính kiểm chứng của đặc tả?","Tính nhập nhằng, tính thiếu định lượng","Tính tuyệt đối, tính thuyết phục","Tính tuyệt đối, tính nhập nhằng","Tính thuyết phục, tính nhập nhằng"

In [None]:
query="""
Đâu không phải nguyên tắc cốt lõi của khảo sát đặc tả mức cao theo tài liệu Public_061?
- Hãy là khách hàng của sản phẩm",
- Hãy nghiên cứu các chuẩn và hướng dẫn hiện hành",
- Hãy xem xét và kiểm thử các phần mềm tương tự",
- Hãy phối hợp với đội nhóm và khách hàng"
"""

In [1]:
query="""
Mục đích của từ ""luôn luôn"" và ""nào đó"", theo danh sách thẩm định thuật ngữ trong tài liệu Public_061, là hàm ý tính không mơ hồ và tính kiểm chứng của đặc tả?
- Tính nhập nhằng, tính thiếu định lượng
- Tính tuyệt đối, tính thuyết phục
- Tính tuyệt đối, tính nhập nhằng
- Tính thuyết phục, tính nhập nhằng
"""

In [1]:
query=""" 
Một trong những đặc điểm khiến Logic bomb nguy hiểm là gì?
- Luôn xuất hiện dưới dạng file thực thi .exe
- Dễ dàng phát hiện bằng mắt thường
- Có khả năng ẩn trong mã nguồn phần mềm hợp pháp
- Không thể kích hoạt bằng sự kiện người dùng
"""

In [2]:
import requests
url = "http://localhost:8000/api/v1/search/"
payload = {
    "query": query,
    "top_k": 30,
    "use_hybrid": True
}
r = requests.post(url, json=payload, timeout=30)
print(r.status_code)

200


In [4]:
from pprint import pprint
pprint(r.json())

{'document_counts': [{'chunk_count': 18, 'doc_id': 'Public_264'},
                     {'chunk_count': 7, 'doc_id': 'Public_061'},
                     {'chunk_count': 5, 'doc_id': 'Public_251'}],
 'hits': [{'chunk_id': 'Public_264_chunk_0002',
           'chunk_text': '# Public_264 - Public_264\n'
                         '\n'
                         '## 1. Logic bomb là gì?\n'
                         '\n'
                         'Logic bomb là một đoạn mã độc hại được nhúng vào '
                         'trong một ứng dụng hoặc chương trình phần mềm. Khi '
                         'những điều kiện đã được lập trình sẵn trong mã được '
                         'đáp ứng, logic bomb sẽ tự động thực thi và có thể '
                         'gây ra nhiều tác hại như xóa dữ liệu, làm gián đoạn '
                         'hệ thống hoặc đánh cắp thông tin.\n'
                         '\n'
                         'Logic bomb đã tồn tại từ rất lâu trong lịch sử phát '
                    

In [None]:
# /Users/dohainam/test_rag_ask_api.py
import requests
import json
import os
from dotenv import load_dotenv

# 1. Tải biến môi trường (nếu có)
load_dotenv()

# Cấu hình API của bạn
# Đảm bảo biến môi trường API_BASE_URL được đặt, hoặc sử dụng mặc định
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
ASK_API_ENDPOINT = f"{API_BASE_URL}/api/v1/ask"
TIMEOUT = 300 # Timeout tính bằng giây, tăng nếu LLM mất nhiều thời gian

print(f"Kiểm tra API RAG tại: {ASK_API_ENDPOINT}\n")

# 2. Chuẩn bị câu hỏi mẫu
# THAY ĐỔI CÂU HỎI, TÙY CHỌN VÀ source_folder (nếu có) CHO PHÙ HỢP VỚI DỮ LIỆU CỦA BẠN
question_to_ask = "Đâu không phải nguyên tắc cốt lõi của khảo sát đặc tả mức cao theo tài liệu Public_061?"
options_for_question = {
    "A": "Hãy là khách hàng của sản phẩm",
    "B": "Hãy nghiên cứu các chuẩn và hướng dẫn hiện hành",
    "C": "Hãy xem xét và kiểm thử các phần mềm tương tự",
    "D": "Hãy phối hợp với đội nhóm và khách hàng"
}
# Nếu bạn muốn lọc theo source_folder, hãy bỏ comment dòng dưới và thay đổi giá trị
# source_folder_filter = "my_research_papers" # THAY ĐỔI THÀNH source_folder CỦA BẠN

payload = {
    "question": question_to_ask,
    "options": options_for_question,
    "top_k": 5,           # Số lượng chunks để lấy về từ OpenSearch
    "use_hybrid": True    # Sử dụng tìm kiếm kết hợp (BM25 + Vector + RRF)
}

# Thêm source_folder nếu được định nghĩa
# if 'source_folder_filter' in locals():
#     payload['source_folder'] = source_folder_filter

print("Payload gửi đi:")
pprint(json.dumps(payload, indent=2, ensure_ascii=False)) # ensure_ascii=False để hiển thị tiếng Việt

# 3. Gửi yêu cầu đến API
try:
    pprint(f"\nĐang gửi yêu cầu tới {ASK_API_ENDPOINT}...")
    response = requests.post(ASK_API_ENDPOINT, json=payload, timeout=TIMEOUT)
    response.raise_for_status() # Ném ngoại lệ cho các mã trạng thái lỗi (4xx hoặc 5xx)

    result = response.json()

    # 4. Hiển thị kết quả
    pprint("\n--- KẾT QUẢ TỪ RAG API ---")
    pprint(f"Câu hỏi: {result.get('question', 'N/A')}")
    pprint(f"Tùy chọn đã hỏi: {result.get('options', 'N/A')}")
    pprint(f"Đáp án dự đoán: {result.get('predicted_option', 'N/A')}")
    pprint(f"Văn bản trả lời: {result.get('answer_text', 'N/A')}")
    pprint(f"Lý do: {result.get('reasoning', 'N/A')}")
    pprint(f"Độ tin cậy: {result.get('confidence', 'N/A')}")
    pprint(f"Chế độ tìm kiếm: {result.get('search_mode', 'N/A')}")
    pprint(f"Mô hình LLM: {result.get('model', 'N/A')}")
    pprint(f"Từ bộ nhớ đệm: {result.get('from_cache', False)}")
    pprint(f"Thời gian: {result.get('timing', {}).get('total_ms', 'N/A')}ms")

    sources = result.get('sources', [])
    if sources:
        print("\n--- NGUỒN TÀI LIỆU (Sources) ---")
        for i, source in enumerate(sources):
            pprint(f"  Source {i+1}:")
            pprint(f"    Document ID: {source.get('document_id', 'N/A')}")
            pprint(f"    Chunk ID: {source.get('chunk_id', 'N/A')}")
            pprint(f"    Section: {source.get('section_name', 'N/A')}")
            pprint(f"    Score: {source.get('score', 'N/A'):.4f}")
            pprint(f"    Preview: {source.get('preview', 'N/A')[:150]}...") # Giới hạn độ dài preview
            pprint("-" * 20)
    else:
        print("\n⚠️ Không có nguồn tài liệu nào được trả về.")

except requests.exceptions.RequestException as e:
    print(f"\n❌ Lỗi khi gọi API: {e}")
    if response is not None:
        print(f"Mã trạng thái HTTP: {response.status_code}")
        print(f"Phản hồi lỗi từ server: {response.text}")
except json.JSONDecodeError:
    print(f"\n❌ Lỗi khi phân tích phản hồi JSON. Phản hồi thô:\n{response.text}")
except Exception as e:
    print(f"\n❌ Đã xảy ra lỗi không mong muốn: {e}")

Kiểm tra API RAG tại: http://localhost:8000/api/v1/ask

Payload gửi đi:
('{\n'
 '  "question": "Đâu không phải nguyên tắc cốt lõi của khảo sát đặc tả mức cao '
 'theo tài liệu Public_061?",\n'
 '  "options": {\n'
 '    "A": "Hãy là khách hàng của sản phẩm",\n'
 '    "B": "Hãy nghiên cứu các chuẩn và hướng dẫn hiện hành",\n'
 '    "C": "Hãy xem xét và kiểm thử các phần mềm tương tự",\n'
 '    "D": "Hãy phối hợp với đội nhóm và khách hàng"\n'
 '  },\n'
 '  "top_k": 5,\n'
 '  "use_hybrid": true\n'
 '}')
'\nĐang gửi yêu cầu tới http://localhost:8000/api/v1/ask...'
'\n--- KẾT QUẢ TỪ RAG API ---'
('Câu hỏi: Đâu không phải nguyên tắc cốt lõi của khảo sát đặc tả mức cao theo '
 'tài liệu Public_061?')
("Tùy chọn đã hỏi: {'A': 'Hãy là khách hàng của sản phẩm', 'B': 'Hãy nghiên "
 "cứu các chuẩn và hướng dẫn hiện hành', 'C': 'Hãy xem xét và kiểm thử các "
 "phần mềm tương tự', 'D': 'Hãy phối hợp với đội nhóm và khách hàng'}")
'Đáp án dự đoán: A'
('Văn bản trả lời: ANSWER: A,C\n'
 'REASONING: The

0 0 0 1 1 1 1 1 1
min: 0
max: 1
meadian: = 1