In [1]:
# Install the notebook's third-party dependencies into the *kernel's* environment.
# `%pip` (unlike `!pip`) targets the interpreter this notebook is running on.
# NOTE(review): versions are unpinned and `-U` pulls the latest releases; pin them
# (e.g. `pkg==x.y.z`) once a known-good combination is established.
%pip install -qU docling sentence_transformers chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.1/195.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m483.4/483.4 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import tempfile
import uuid
from typing import List
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from sentence_transformers import SentenceTransformer
import chromadb
import os
import gc
import torch

# --- Storage locations -------------------------------------------------------
# Root folder that main() walks looking for .txt sources.
DATA_DIR = "/content/drive/MyDrive/preprocessed_text"
# Directory handed to chromadb.PersistentClient.
CHROMA_DIR = "/content/chroma_db"
# Name of the Chroma collection that main() deletes and recreates.
COLLECTION_NAME = "viet_history"

# --- Embedding / chunking ----------------------------------------------------
# Model identifier passed to SentenceTransformer.
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"
# Token limits forwarded to HybridChunker as max_tokens / min_tokens.
MAX_CHUNKING_TOKENS = 1024
MIN_CHUNKING_TOKENS = 128

# --- Throughput --------------------------------------------------------------
# Number of chunks buffered before they are embedded and written to Chroma.
BATCH_SIZE = 16

def load_txt(path: str) -> str:
    """Return the entire contents of the file at ``path``, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents

def convert_txt_via_md(path, converter):
    """Convert a plain-text file to a document by wrapping it as Markdown.

    The file's text is prefixed with a level-1 heading built from the file
    name (without extension), written to a temporary ``.md`` file, and that
    file is passed to ``converter.convert``.

    Args:
        path: Path of the UTF-8 text file to convert.
        converter: Object exposing ``convert(source=...)`` whose result has a
            ``.document`` attribute.

    Returns:
        The ``.document`` of the conversion result.
    """
    body = load_txt(path)
    stem, _extension = os.path.splitext(os.path.basename(path))
    markdown = "# " + stem + "\n\n" + body

    # delete=False: the file must outlive the `with` block so the converter
    # can open it by name; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".md", delete=False, encoding="utf-8"
    ) as md_file:
        md_file.write(markdown)
        md_path = md_file.name

    try:
        result = converter.convert(source=md_path)
        return result.document
    finally:
        # Always clean up the temporary Markdown file, even if conversion fails.
        os.remove(md_path)

def infer_category(root_dir: str, file_path: str) -> str:
    """Return the first path component of ``file_path`` relative to ``root_dir``.

    Files that sit directly in ``root_dir`` (no separator in the relative
    path) are reported as ``"root"``.
    """
    relative = os.path.relpath(file_path, root_dir)
    first_component, separator, _remainder = relative.partition(os.path.sep)
    # An empty separator means the relative path is a bare file name.
    return first_component if separator else "root"



class _SimpleChunk:
    """Minimal stand-in for a chunk: exposes ``.text`` and an empty ``.metadata``."""

    def __init__(self, text):
        self.text = text
        self.metadata = {}


def chunk_file_with_docling(path: str, converter: "DocumentConverter", chunker: "HybridChunker"):
    """Chunk one file and always return a list of chunk objects.

    ``.txt`` files (matched case-insensitively, like the filter in ``main``)
    are wrapped as Markdown via ``convert_txt_via_md``; any other path is
    handed to ``converter.convert`` directly. Previously a path that did not
    end in lowercase ``.txt`` fell through and returned ``None``, which
    crashed callers that iterate over the result.

    Args:
        path: Path of the file to chunk.
        converter: Converter exposing ``convert(source=...)``.
        chunker: Chunker exposing ``chunk(dl_doc=...)``.

    Returns:
        A list of chunks produced by ``chunker``. If conversion or chunking
        raises, a single-element list holding the whole file text as one
        ``_SimpleChunk`` (with ``.text`` and ``.metadata``).

    Raises:
        Whatever ``load_txt`` raises if the fallback read itself fails.
    """
    try:
        print("Chunking:", path)
        if path.lower().endswith(".txt"):
            doc = convert_txt_via_md(path=path, converter=converter)
        else:
            doc = converter.convert(source=path).document
        # `doc` is a local, so it is released as soon as this function returns.
        return list(chunker.chunk(dl_doc=doc))
    except Exception as e:
        # Best-effort fallback: index the raw text as one chunk rather than
        # losing the file entirely.
        print("  chunk_file_with_docling fallback due to:", e)
        return [_SimpleChunk(load_txt(path))]

def flush_to_chroma(embedder, collection, docs, metadatas, ids, batch_size):
    """Embed ``docs`` in batches and add them to a Chroma collection.

    Each batch is encoded with ``embedder.encode``; if encoding raises a
    ``RuntimeError`` the batch is re-encoded once with ``device="cpu"``.
    Only the encode step is retried: an error raised by ``collection.add``
    propagates unchanged instead of being mistaken for an embedding failure
    (which previously triggered a needless re-encode and a second add).

    Args:
        embedder: Object exposing ``encode(texts, ...)`` returning something
            with ``.tolist()``.
        collection: Object exposing ``add(documents=, metadatas=, ids=, embeddings=)``.
        docs: Texts to embed.
        metadatas: One metadata dict per text, aligned with ``docs``.
        ids: One id per text, aligned with ``docs``.
        batch_size: Positive number of texts per encode/add call.

    Returns:
        None. The input lists are not modified; the caller clears them.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative step
            would otherwise silently skip every document).
        Exception: Whatever the CPU retry or ``collection.add`` raises.
    """
    if not docs:
        return
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(docs), batch_size):
        stop = start + batch_size
        batch_docs = docs[start:stop]
        batch_mds = metadatas[start:stop]
        batch_ids = ids[start:stop]

        try:
            embs = embedder.encode(batch_docs, convert_to_numpy=True, show_progress_bar=False)
        except RuntimeError as e:
            print("  RuntimeError during embed: ", e)
            # Release cached GPU memory before falling back to the CPU.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            try:
                print("  Retrying encode on CPU...")
                embs = embedder.encode(batch_docs, device="cpu", convert_to_numpy=True, show_progress_bar=False)
            except Exception as e2:
                print("  Failed to encode batch even on CPU:", e2)
                raise

        # Kept outside the encode try-block so storage errors are not retried.
        collection.add(documents=batch_docs, metadatas=batch_mds, ids=batch_ids, embeddings=embs.tolist())

        # Free per-batch memory before encoding the next batch.
        del embs
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def _chunk_text_and_metadata(chunk):
    """Return ``(text, metadata)`` for a chunk given as an object, a dict, or a plain string."""
    if isinstance(chunk, dict):
        return chunk.get("text"), chunk.get("metadata", {})
    if isinstance(chunk, str):
        return chunk, {}
    return getattr(chunk, "text", None), getattr(chunk, "metadata", {})


def main(data_dir: str, chroma_dir: str, collection_name: str, max_batch: int):
    """Chunk, embed and index every ``.txt`` file under ``data_dir`` into Chroma.

    The collection ``collection_name`` is deleted (if present) and recreated,
    so each run starts from an empty collection. Chunks are buffered and
    flushed to Chroma whenever ``max_batch`` chunks are pending and again at
    the end of every file.

    Args:
        data_dir: Root directory walked recursively for ``.txt`` files.
        chroma_dir: Directory passed to ``chromadb.PersistentClient``.
        collection_name: Name of the collection to rebuild.
        max_batch: Buffer size that triggers a flush; also used as the
            encode batch size inside ``flush_to_chroma``.

    Returns:
        None. Progress and a final summary are printed.
    """
    print("Starting...")
    converter = DocumentConverter()
    # NOTE(review): no tokenizer is passed here, and the recorded run log warned
    # "Token indices sequence length is longer than the specified maximum sequence
    # length for this model (579 > 512)". That suggests the chunker counts tokens
    # with a default tokenizer rather than EMBEDDING_MODEL's — TODO confirm and
    # pass a matching tokenizer; also verify that `min_tokens` / `overlap_tokens`
    # are honoured by the installed docling version.
    chunker = HybridChunker(
        max_tokens=MAX_CHUNKING_TOKENS,
        min_tokens=MIN_CHUNKING_TOKENS,
        overlap_tokens=100,
        merge_peers=True
    )
    print("Init chunker finished...")

    # Embedding model.
    embedder = SentenceTransformer(EMBEDDING_MODEL)
    print("Init embedding model finished...")

    # Chroma client; the collection is rebuilt from scratch on every run.
    client = chromadb.PersistentClient(path=chroma_dir)
    try:
        client.delete_collection(name=collection_name)
    except Exception:
        # Best effort: on a first run there is no collection to delete.
        pass
    collection = client.get_or_create_collection(name=collection_name)
    print("Create chromaDB finished")

    all_docs: List[str] = []
    all_metadatas: List[dict] = []
    all_ids: List[str] = []

    files_count = 0
    chunks_count = 0

    print("Starting chunking and save...")
    for root, dirs, files in os.walk(data_dir):
        print("Walking:", root)
        print("  os.walk -> dirs:", len(dirs), "files:", len(files))

        # Fallback to listdir when os.walk reports no files for this directory.
        if not files:
            try:
                listed = os.listdir(root)
                files = [p for p in listed if os.path.isfile(os.path.join(root, p))]
            except Exception:
                files = []

        for fn in files:
            if not fn.lower().endswith('.txt'):
                continue

            files_count += 1
            fullpath = os.path.join(root, fn)
            relpath = os.path.relpath(fullpath, data_dir)
            category = infer_category(data_dir, fullpath)

            # Chunk using docling; `or []` guards against a chunker that
            # yields nothing (or None) so the loop below never crashes.
            chunks = chunk_file_with_docling(fullpath, converter, chunker) or []

            for i, chunk in enumerate(chunks):
                text, metadata = _chunk_text_and_metadata(chunk)
                # Skip chunks without usable text. Previously these were
                # indexed as str(chunk), i.e. the object's repr, which put
                # junk documents into the collection.
                if not isinstance(text, str) or not text.strip():
                    continue

                md = dict(metadata) if metadata else {}
                md.update({
                    "category": category,
                    "file": fn,
                    "relative_path": relpath,
                    "chunk_index": i,
                })

                # Path + chunk index, plus a random suffix to avoid id clashes.
                uid = f"{relpath.replace(os.path.sep, '__')}__{i}__{uuid.uuid4().hex[:8]}"

                all_docs.append(text)
                all_metadatas.append(md)
                all_ids.append(uid)
                chunks_count += 1

                # Flush as soon as a full batch is pending.
                if len(all_docs) >= max_batch:
                    print(f"Embedding & adding batch of {len(all_docs)} chunks to Chroma...")
                    flush_to_chroma(embedder, collection, all_docs, all_metadatas, all_ids, max_batch)
                    all_docs.clear(); all_metadatas.clear(); all_ids.clear()

            # End of file: flush whatever is still pending to release memory.
            # Because this always empties the buffers, no flush is needed
            # after the directory walk.
            if all_docs:
                print(f"File done -> embedding & adding remaining {len(all_docs)} chunks to Chroma (flush per-file)...")
                flush_to_chroma(embedder, collection, all_docs, all_metadatas, all_ids, max_batch)
                all_docs.clear(); all_metadatas.clear(); all_ids.clear()

            # Drop this file's chunks and collect garbage before the next file.
            del chunks
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # No explicit persist step: PersistentClient writes to `chroma_dir` as data
    # is added and has no persist() method, so the former call only raised an
    # AttributeError that was silently swallowed.

    print(f"Done. Files processed: {files_count}, total chunks indexed: {chunks_count}")


if __name__ == "__main__":
    # Best-effort memory cleanup before the run; any failure here is ignored
    # so that it can never prevent the indexing job from starting.
    try:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception:
        pass

    main(
        data_dir=DATA_DIR,
        chroma_dir=CHROMA_DIR,
        collection_name=COLLECTION_NAME,
        max_batch=BATCH_SIZE,
    )


Starting...
Init chunker finished...
Init embedding model finished...
Create chromaDB finished
Starting chunking and save...
Walking: /content/drive/MyDrive/preprocessed_text
  os.walk -> dirs: 4 files: 0
Walking: /content/drive/MyDrive/preprocessed_text/Van_Hoa
  os.walk -> dirs: 0 files: 2
Chunking: /content/drive/MyDrive/preprocessed_text/Van_Hoa/ThanNguoiVaDatViet.txt


Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
Embedding & adding batch of 16 chunks to Chroma...
File done -> embedding & adding

In [6]:
# Force a garbage-collection pass; the cell displays the value gc.collect() returns.
import gc

gc.collect()

6864

In [7]:
# Report the total on-disk size of the Chroma database directory.
!du -sh /content/chroma_db

267M	/content/chroma_db


In [8]:
# Recursively archive the Chroma database directory into chroma_db.zip
# (created in the current working directory).
!zip -r chroma_db.zip /content/chroma_db

  adding: content/chroma_db/ (stored 0%)
  adding: content/chroma_db/5bedf9f7-8dc8-46da-87ae-8416a2f49894/ (stored 0%)
  adding: content/chroma_db/5bedf9f7-8dc8-46da-87ae-8416a2f49894/header.bin (deflated 61%)
  adding: content/chroma_db/5bedf9f7-8dc8-46da-87ae-8416a2f49894/link_lists.bin (stored 0%)
  adding: content/chroma_db/5bedf9f7-8dc8-46da-87ae-8416a2f49894/data_level0.bin (deflated 100%)
  adding: content/chroma_db/5bedf9f7-8dc8-46da-87ae-8416a2f49894/length.bin (deflated 24%)
  adding: content/chroma_db/chroma.sqlite3 (deflated 57%)
  adding: content/chroma_db/42b046d0-fff6-4c07-9ef1-43fa47dbed65/ (stored 0%)
  adding: content/chroma_db/42b046d0-fff6-4c07-9ef1-43fa47dbed65/header.bin (deflated 61%)
  adding: content/chroma_db/42b046d0-fff6-4c07-9ef1-43fa47dbed65/link_lists.bin (stored 0%)
  adding: content/chroma_db/42b046d0-fff6-4c07-9ef1-43fa47dbed65/data_level0.bin (deflated 100%)
  adding: content/chroma_db/42b046d0-fff6-4c07-9ef1-43fa47dbed65/length.bin (deflated 100%)
  

In [9]:
# Colab-only: trigger a browser download of the archive created above.
from google.colab import files
files.download('chroma_db.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>