In [1]:
# index_wikieval.py
import os, hashlib
from typing import List
from datasets import load_dataset
from elasticsearch import Elasticsearch, helpers
import google.generativeai as genai
from marshmallow import pprint

# ===== Config =====
# Connection settings and model constants shared by every cell below.
ES_URL   = os.getenv("ES_URL", "http://localhost:9200")
ES_INDEX = "hotpotqa_ctx"
# Read the API key from the environment (same pattern as ES_URL) so the secret
# never has to be pasted into the notebook. The default stays an empty string,
# which is what the notebook used before.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
EMBED_MODEL    = "models/gemini-embedding-001"  # 3072-d
EMBED_DIMS     = 3072  # must match the "dims" of the dense_vector mapping


# Module-level clients used by the indexing helpers.
genai.configure(api_key=GEMINI_API_KEY)
es = Elasticsearch(ES_URL)

def ensure_index():
    """Create the ``ES_INDEX`` index with its mapping unless it already exists.

    The mapping declares ``title`` and ``text`` as text fields, ``page_from``
    and ``page_to`` as integers whose nulls are indexed as -1, and ``vector``
    as an indexed ``dense_vector`` of ``EMBED_DIMS`` dimensions compared with
    cosine similarity. The index uses a single shard and no replicas.
    """
    if not es.indices.exists(index=ES_INDEX):
        index_settings = {"index": {"number_of_shards": 1, "number_of_replicas": 0}}
        # null_value lets a null page number be indexed as -1.
        page_field = {"type": "integer", "null_value": -1}
        field_mappings = {
            "title": {"type": "text"},
            "page_from": dict(page_field),
            "page_to": dict(page_field),
            "text": {"type": "text"},
            "vector": {
                "type": "dense_vector",
                "dims": EMBED_DIMS,
                "index": True,
                "similarity": "cosine",
            },
        }
        es.indices.create(
            index=ES_INDEX,
            body={
                "settings": index_settings,
                "mappings": {"properties": field_mappings},
            },
        )

In [18]:
# Create the Elasticsearch index; ensure_index() returns early if it already exists.
ensure_index()

In [2]:
# --- Run options ---
subset, split = "distractor", "validation"  # subset: "distractor" | "fullwiki"; split: "train" | "validation" | "test"
batch_size = 256
dedup = True
use_embed = True  # False = BM25-only

# --- Mutable notebook state shared with later cells ---
seen = set()
batch = []
count = 0

# --- Load the dataset and show what it contains ---
ds = load_dataset("hotpotqa/hotpot_qa", name=subset, split=split)
print(ds.column_names) 
print(f"Tổng {len(ds)} mục trong HotpotQA {subset}/{split}")

['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context']
Tổng 7405 mục trong HotpotQA distractor/validation


In [3]:
import json
from itertools import islice

# Export id / answer / supporting_facts of the first `want` examples to h.json,
# one compact JSON object per line.
want = 477
count = 0

# Open the file once, in write mode: the original reopened it in append mode on
# every iteration, so re-running the cell duplicated every record in h.json.
with open("h.json", "w", encoding="utf-8") as f:
    # islice bounds the loop even when want <= 0 or the dataset is shorter than want.
    for ex in islice(ds, want):
        out = {
            "id": ex.get("id"),
            "answer": ex.get("answer"),
            # NOTE(review): the original comment described this value as
            # [[title, sent_id], ...] - confirm against the dataset schema.
            "supporting_facts": ex.get("supporting_facts"),
        }
        line = json.dumps(out, ensure_ascii=False, separators=(",", ":"))
        f.write(line + "\n")
        count += 1
print("[OK] Saved to h.json")


[OK] Saved to h.json


In [None]:
# --- SPLIT UP FRONT; NOTHING IS INDEXED BY THIS CELL ---
from math import ceil
from elasticsearch import helpers
from tqdm.auto import tqdm

CNT_TIME = 0
# Examples per part.
# NOTE(review): the original comment said 2000 per part, but the value is 1.
CHUNK_SIZE = 1
TOTAL = len(ds)
NUM_CHUNKS = ceil(TOTAL / CHUNK_SIZE)

# Half-open (start, end) ranges, one per part, so parts can be run by hand one at a time.
PARTS = [
    (lo, min(lo + CHUNK_SIZE, TOTAL))
    for lo in range(0, NUM_CHUNKS * CHUNK_SIZE, CHUNK_SIZE)
]

print(f"[SPLIT] Tổng {TOTAL} mẫu → {NUM_CHUNKS} phần, mỗi phần {CHUNK_SIZE} (phần cuối có thể nhỏ hơn).")
i = 0
for s, e in PARTS:
    i += 1
    print(f"  - PART {i}: [{s}:{e}) size={e - s}")

# --- HÀM CHẠY 1 PHẦN (GỌI TAY KHI MUỐN) ---
def index_part(part_idx: int, show_bar: bool = True):
    """
    Index one part of ``ds`` into the ``ES_INDEX`` Elasticsearch index.

    Each entry of an example's ``context["sentences"]`` is joined into one text
    and sent as one document. When ``use_embed`` is true, the text is embedded
    with ``EMBED_MODEL`` and stored in ``vector``; otherwise the field is
    omitted (BM25-only). Documents are sent with ``helpers.bulk`` in groups of
    ``batch_size``.

    The previous version built each document and then discarded it, so nothing
    was ever indexed and the summary always reported 0.

    Args:
        part_idx: 0-based index into ``PARTS`` (0..NUM_CHUNKS-1).
        show_bar: show a tqdm progress bar while iterating.

    Raises:
        AssertionError: if ``part_idx`` is out of range.
        Any error raised by ``genai.embed_content`` or ``helpers.bulk``
        propagates to the caller.
    """
    assert 0 <= part_idx < NUM_CHUNKS, f"part_idx phải trong [0, {NUM_CHUNKS-1}]"
    start, end = PARTS[part_idx]
    print(f"\n[RUN] PART {part_idx+1}/{NUM_CHUNKS} → Range [{start}:{end}) size={end-start}")

    ds_part = ds.select(range(start, end))

    local_batch = []   # bulk actions waiting to be sent
    pending = set()    # texts queued in local_batch but not yet confirmed indexed
    local_count = 0

    def flush() -> int:
        """Send the queued actions in one bulk request and return how many were indexed."""
        if not local_batch:
            return 0
        indexed, _ = helpers.bulk(es, local_batch)
        # Mark texts as seen only after the bulk call succeeded, so a failed
        # request does not make a later re-run skip documents that never
        # reached the index.
        if dedup:
            seen.update(pending)
        pending.clear()
        local_batch.clear()
        return indexed

    iterator = tqdm(ds_part, total=len(ds_part), disable=not show_bar, desc=f"Part {part_idx+1}/{NUM_CHUNKS}")
    for ex in iterator:
        ctx = ex.get("context") or {}
        sents = ctx.get("sentences") or []
        for sent in sents:
            txt = " ".join(s.strip() for s in sent if isinstance(s, str) and s.strip())
            if not txt or (dedup and (txt in seen or txt in pending)):
                continue

            source = {
                "title": "HotpotQA",
                "page_from": None,
                "page_to": None,
                "text": txt,
            }
            if use_embed:
                # The dense_vector mapping needs a list of floats; the old
                # placeholder "" would be rejected by Elasticsearch.
                source["vector"] = genai.embed_content(model=EMBED_MODEL, content=txt)["embedding"]

            local_batch.append({
                "_index": ES_INDEX,
                # Deterministic id: re-indexing the same text overwrites the
                # existing document instead of creating a duplicate.
                "_id": hashlib.sha1(txt.encode("utf-8")).hexdigest(),
                "_source": source,
            })
            pending.add(txt)

            if len(local_batch) >= batch_size:
                local_count += flush()

    # Send whatever is left after the last full batch.
    local_count += flush()
    print(f"[OK] PART {part_idx+1} done. Indexed docs (this part): {local_count}")

# --- CÁCH DÙNG ---
# Không tự chạy gì cả. Khi muốn chạy 1 phần, gọi:
# index_part(0)   # chạy PART 1
# index_part(1)   # chạy PART 2
# ...


In [None]:
index_part(0)   # run PART 1 (part_idx is 0-based)

In [None]:
# Report how many distinct texts are currently recorded in `seen`.
print(len(seen), "văn bản đã seen")

In [None]:
index_part(4)   # run PART 5 (part_idx is 0-based, so index 4 is the fifth part)