# Chunking Function

In [1]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenizer data used by word_tokenize. Recent NLTK releases (3.9+) look up
# the "punkt_tab" resource, while older releases look up "punkt"; fetch both
# so this cell works on a fresh kernel regardless of the installed version.
nltk.download("punkt")
nltk.download("punkt_tab")

def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of word tokens.

    The text is tokenized with ``word_tokenize``; consecutive windows of up
    to ``chunk_size`` tokens are joined with single spaces. Each window
    starts ``chunk_size - overlap`` tokens after the previous one.

    Args:
        text: Input string to tokenize and split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        List of chunk strings, empty when the text yields no tokens. The
        final chunk ends at the last token; no extra chunk consisting only
        of already-emitted trailing tokens is produced.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    # Validate before tokenizing: a non-positive step would loop forever.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    tokens = word_tokenize(text)
    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append(" ".join(tokens[start:end]))
        if end == len(tokens):
            # The last token has been emitted; another iteration would only
            # add a chunk that is a suffix of this one.
            break
        start += step
    return chunks


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\latik\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


# Embedding with BAAI/bge-small-en

In [2]:
from sentence_transformers import SentenceTransformer

# Load the model once at cell level; get_embeddings below reuses this single
# instance instead of constructing a new model on every call.
embed_model = SentenceTransformer("BAAI/bge-small-en")

def get_embeddings(chunks):
    """Encode ``chunks`` with the shared ``embed_model``.

    The model is asked for a NumPy result (``convert_to_numpy=True``), and
    that result is converted with ``.tolist()`` before being returned.
    """
    vectors = embed_model.encode(chunks, convert_to_numpy=True)
    return vectors.tolist()

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# Prepare Metadata Schema

In [3]:
def prepare_upsert_items(chunks, embeddings, doc_id, source, section):
    """Pair each chunk with its embedding and build one record per pair.

    Args:
        chunks: Iterable of chunk texts.
        embeddings: Iterable of vectors, one per chunk and in the same order.
        doc_id: Document identifier; also used as the prefix of each item id.
        source: Value stored under ``metadata["source"]``.
        section: Value stored under ``metadata["section"]``.

    Returns:
        List of dicts with keys ``"id"`` (``"{doc_id}_{position}"``),
        ``"vector"``, ``"metadata"`` (``doc_id``, ``source``, ``section``,
        ``position``) and ``"text"``.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    # Materialize so lengths can be compared; plain zip() would silently
    # drop the unmatched tail of the longer input.
    chunks = list(chunks)
    embeddings = list(embeddings)
    if len(chunks) != len(embeddings):
        raise ValueError(
            "chunks and embeddings must have the same length "
            f"(got {len(chunks)} chunks and {len(embeddings)} embeddings)"
        )

    return [
        {
            "id": f"{doc_id}_{i}",
            "vector": emb,
            "metadata": {
                "doc_id": doc_id,
                "source": source,
                "section": section,
                "position": i,
            },
            "text": chunk,  # keep raw text for retrieval
        }
        for i, (chunk, emb) in enumerate(zip(chunks, embeddings))
    ]
