##### 1. Tokenising --> Chunking + Embedding

In [1]:
import logging
import os
from pathlib import Path
from typing import List, Dict, Tuple
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader

# ---------------------------
# Setup logging
# ---------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------
# Text loading & chunking
# ---------------------------
def load_text_from_file(path: Path) -> str:
    ext = path.suffix.lower()
    if ext in [".txt", ".md"]:
        return path.read_text(encoding="utf-8", errors="ignore")
    elif ext == ".pdf":
        text_parts = []
        try:
            reader = PdfReader(str(path))
            for page in reader.pages:
                # pypdf can return None for empty pages—guard it
                page_text = page.extract_text() or ""
                text_parts.append(page_text)
        except Exception as e:
            logger.warning(f"Failed to read PDF {path}: {e}")
        # document_parts = ["Page 1 text...", "Page 2 text...", "Page 3 text..."] AND "/n".join(document_parts) = "Page 1 text...\nPage 2 text...\nPage 3 text..."
        return "\n".join(text_parts)
    else:
        return ""  # unsupported


def chunk_text(
    text: str,
    chunk_size: int,       # characters per chunk (roughly ~150-250 tokens)
    chunk_overlap: int,    # characters of overlap
) -> List[str]:
    text = text.strip()
    if not text:
        return []

    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + chunk_size, n)
        chunk = text[start:end]
        # ensure we don’t cut off mid-word too badly by extending to next whitespace if possible
        if end < n:
            next_space = text.find(" ", end)
            if 0 < next_space - end < 20:  # small nudge to nearest space
                chunk = text[start:next_space]
                end = next_space
        chunks.append(chunk.strip())
        start = max(end - chunk_overlap, end)  # avoid infinite loop; no negative steps
    # dedupe empties
    return [c for c in chunks if c]


def chunk_docs(
    input_dir: Path,
    chunk_size,       # characters per chunk (roughly ~150-250 tokens)
    chunk_overlap,
) -> List[Tuple[Path, List[str]]]:
    docs = []
    for path in input_dir.rglob("*"):
        if path.is_file() and path.suffix.lower() in {".txt", ".md", ".pdf"}:
            text = load_text_from_file(path)
            if not text:
                continue
            docs.append((path, chunk_text(text, chunk_size, chunk_overlap)))
    return docs

In [3]:
# ---------------------------
# Embedding + Metadata
# ---------------------------
def collect_chunks_and_metadata(
    input_dir,
    chunk_size: int = 800,
    chunk_overlap: int = 150,
) -> Tuple[List[str], List[Dict]]:
    """
    Scan documents in input_dir, chunk them, and return both chunks and metadata.
    """
    logger.info(f"Scanning input directory: {input_dir}")
    chunks, metadatas = [], []

    for file_path, file_chunks in chunk_docs(input_dir, chunk_size, chunk_overlap):
        for i, chunk in enumerate(file_chunks):
            chunks.append(chunk)
            metadatas.append({
                "source": str(file_path),
                "chunk_id": i,
                "preview": chunk[:200].replace("\n", " ") + ("..." if len(chunk) > 200 else ""),
                "full": chunk
            })

    if not chunks:
        raise ValueError(f"No supported documents found in {input_dir}")

    logger.info(f"Collected {len(chunks)} chunks from documents")
    return chunks, metadatas


def embed_chunks(
    chunks: List[str],
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    batch_size: int = 64,
) -> np.ndarray:
    """
    Embed text chunks using a SentenceTransformer model.
    """
    logger.info(f"Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)

    vectors = []
    for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding"):
        batch = chunks[i:i + batch_size]
        vec = model.encode(batch, convert_to_numpy=True, normalize_embeddings=True)
        vectors.append(vec)

    X = np.vstack(vectors).astype("float32")
    logger.info(f"Finished embedding. Shape: {X.shape}")
    return X


def get_embeddings_and_metadata(
    input_dir,
    chunk_size: int = 800,
    chunk_overlap: int = 150,
    batch_size: int = 64,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
):
    """
    Convenience wrapper: collects chunks+metadata, then embeds them.
    """
    chunks, metadatas = collect_chunks_and_metadata(input_dir, chunk_size, chunk_overlap)
    embeddings = embed_chunks(chunks, model_name=model_name, batch_size=batch_size)
    return embeddings, metadatas

In [5]:
# config
input_dir = Path().cwd().parent /'data' / 'kyndryl-docs-test'
model_name = 'sentence-transformers/all-mpnet-base-v2'
chunk_size = 200     # characters per chunk (roughly ~150-250 tokens)
chunk_overlap = 50    # characters of overlap
batch_size = 64 # batch building of index

embeddings, metadatas = get_embeddings_and_metadata(
    input_dir=input_dir,
    model_name=model_name,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    batch_size=64,
)

2025-09-19 11:52:06,916 [INFO] Scanning input directory: c:\Users\HillSeah\Documents\hill priv\vector-db-research\data\kyndryl-docs-test
2025-09-19 11:52:13,608 [INFO] Collected 350 chunks from documents
2025-09-19 11:52:13,610 [INFO] Loading embedding model: sentence-transformers/all-mpnet-base-v2
2025-09-19 11:52:13,626 [INFO] Use pytorch device_name: cpu
2025-09-19 11:52:13,629 [INFO] Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Batches: 100%|██████████| 2/2 [00:15<00:00,  7.90s/it]
Batches: 100%|██████████| 2/2 [00:09<00:00,  4.99s/it]t]
Batches: 100%|██████████| 2/2 [00:14<00:00,  7.30s/it]t]
Batches: 100%|██████████| 2/2 [00:12<00:00,  6.33s/it]t]
Batches: 100%|██████████| 2/2 [00:16<00:00,  8.28s/it]t]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.59s/it]t]
Embedding: 100%|██████████| 6/6 [01:14<00:00, 12.40s/it]
2025-09-19 11:53:34,830 [INFO] Finished embedding. Shape: (350, 768)


In [6]:
# (350, 384) --> 350 chunks/vector embeddings; 384 = embedding dimension (columns) of the embedding model, output = (1, 384) per chunk
print(embeddings.shape)

# 350 metadata dictionaries for 350 chunks/vector embeddings
print(len(metadatas))

(350, 768)
350


##### 2. Vector indexing and storage with Milvus

- Define schema (collections, fields, vector dimensions, etc.)
- Create indices on vector embeddings
    - Define index type, e.g., HNSW, IVF_FLAT, or IVFPQ
    - Define similarity metric (used both for 1. building the index and 2. for similarity search at retrieval time, e.g., L2, IP, cosine)

In [None]:
import numpy as np
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility


def build_collection_name(dataset, index_type, similarity_metric_type, hyperparameters):
    
    collection_name  = dataset + "_" + index_type + "_" + similarity_metric_type
    for k, v in hyperparameters.items():
        collection_name += f"_{k}_{v}"
        
    return collection_name


def get_milvus_connection(
    alias: str = "default",
    host: str = "localhost",
    port: str = "19530",
    reset: bool = False,
):
    """
    Get or create a Milvus connection.

    Parameters
    ----------
    alias : str
        Connection alias (default: "default").
    host : str
        Milvus host.
    port : str
        Milvus port.
    reset : bool
        If True, disconnect existing connection and reconnect.

    Returns
    -------
    alias : str
        The alias name of the connection.
    """
    if reset:
        connections.disconnect(alias)
        logger.info("Disconnected from Milvus Connection (RESET)")

    if not any(c[0] == alias and c[1] for c in connections.list_connections()):
        connections.connect(alias=alias, host=host, port=port)
        logger.info(f"Successfully Milvus connection on Host: {host}, Port: {port}")
        
    return alias


def close_milvus_connection(alias: str = "default"):
    """
    Disconnect from Milvus explicitly.
    """
    if any(c[0] == alias and c[1] for c in connections.list_connections()):
        connections.disconnect(alias)
        logger.info("Disconnected from Milvus Connection")
    
# ---------------------------
# Build Milvus index
# ---------------------------
def build_milvus_index(
    embeddings: np.ndarray,
    metadatas: List,
    similarity_metric_type: str,
    index_type: str,
    hyperparameters: Dict,
    collection_name: str = "document_embeddings",
    alias:str = "default",
    host: str = "localhost",
    port: str = "19530",
):
    # 1) Connect to Milvus
    logger.info(f"Connecting to Milvus at {host}:{port} ...")
    get_milvus_connection(alias=alias, host=host, port=port)
   
    # 2) Define Milvus collection schema if not exists
    dim = embeddings.shape[1]
    if utility.has_collection(collection_name):
        logger.info(f"Dropping existing collection: {collection_name}")
        utility.drop_collection(collection_name)

    logger.info(f"Creating new collection: {collection_name}")
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
        FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="chunk_id", dtype=DataType.INT64),
        FieldSchema(name="preview", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="full", dtype=DataType.VARCHAR, max_length=65535), 
    ]
    schema = CollectionSchema(fields, description="Document embeddings")
    collection = Collection(name=collection_name, schema=schema)

    # 6) Insert data (dictionary-style, order-independent)
    logger.info(f"Inserting {len(metadatas)} records into Milvus...")
    insert_data = [
        {
            "embedding": embeddings[i].tolist(),
            "source": metadatas[i]["source"],
            "chunk_id": metadatas[i]["chunk_id"],
            "preview": metadatas[i]["preview"],
            "full": metadatas[i]["full"],
        }
        for i in range(len(metadatas))
    ]
    collection.insert(insert_data)
    logger.info("Insertion complete.")

    # 7) Create index on vector field (for search efficiency)
    logger.info("Creating index on `embedding` field...")
    index_params = {
        "metric_type": similarity_metric_type,  # cosine similarity since embeddings are normalized
        "index_type": index_type,
        "params": hyperparameters # index-specific hyperparameters: e.g. nlist for IVF-based indices, M and efConstruction for HNSW
    }
    collection.create_index(field_name="embedding", index_params=index_params)

    collection.load()
    logger.info(
        f"[OK] Inserted {embeddings.shape[0]} vectors into Milvus collection `{collection_name}` "
        f"(dim={dim}, index type = {index_type}, similarity metric type = {similarity_metric_type})."
    )

In [28]:
# Config
similarity_metric_type = "IP"
index_type = "IVF_PQ"
hyperparameters = {
    "nlist": 1, # nlist=1 to simulate Flat exhaustive search
    "m": 16, # m=subvectors
    "nbits": 8 # nbits=bits per subvector
}
host = "localhost"
port = "19530"

dataset_name = "kyndryl_pdfs"
collection_name = build_collection_name(dataset=dataset_name, index_type=index_type, similarity_metric_type=similarity_metric_type, hyperparameters=hyperparameters)

# Run -- ensure milvus docker containers are running: >> docker compose up -d
build_milvus_index(
    embeddings=embeddings,
    metadatas=metadatas,
    similarity_metric_type=similarity_metric_type,
    index_type=index_type,
    hyperparameters=hyperparameters,
    collection_name=collection_name,
    host=host,
    port=port,
)

2025-09-19 13:50:20,429 [INFO] Connecting to Milvus at localhost:19530 ...
2025-09-19 13:50:20,470 [INFO] Creating new collection: kyndryl_pdfs_IVF_PQ_IP_nlist_1_m_16_nbits_8


2025-09-19 13:50:20,925 [INFO] Inserting 350 records into Milvus...
2025-09-19 13:50:21,264 [INFO] Insertion complete.
2025-09-19 13:50:21,266 [INFO] Creating index on `embedding` field...
2025-09-19 13:50:22,697 [INFO] [OK] Inserted 350 vectors into Milvus collection `kyndryl_pdfs_IVF_PQ_IP_nlist_1_m_16_nbits_8` (dim=768, index type = IVF_PQ, similarity metric type = IP).


#### 3. Vector Search, Top K (ANN)

In [None]:
import numpy as np
from typing import List, Dict, Optional
from pymilvus import Collection
from sentence_transformers import SentenceTransformer  # example embedding model
import logging

logger = logging.getLogger(__name__)


def search_milvus(
    queries: List[str],
    embedding_model: str = "all-MiniLM-L6-v2",
    similarity_metric_type: str = "COSINE",
    index_search_params: Optional[Dict] = None,
    top_k: int = 5,
    alias: str = "default",
    host: str = "localhost",
    port: str = "19530",
    collection_name: str = "document_embeddings"
) -> List[List[Dict]]:
    """
    Perform similarity search in Milvus and return top-k results with metadata.

    Args:
        queries: List of raw query strings
        embedding_model: Name of the embedding model (e.g., from sentence-transformers)
        similarity_metric_type: "COSINE", "L2", or "IP"
        index_search_params: Index-specific search parameters (e.g., {"ef": 200} for HNSW, {"nprobe": 10} for IVF)
        top_k: Number of top results to return
        alias: Alias of targeted Milvus database to search
        host: Host of targeted Milvus database to search
        port: Port of targeted Milvus database to search
        collection_name: Name of the Milvus collection

    Returns:
        List of results per query, each as a list of dicts containing 'score' and metadata fields
    """
    # Connect to Milvus (if not already connected)
    get_milvus_connection(alias=alias, host=host, port=port)

    # Generate embeddings for queries
    logger.info(f"Encoding {len(queries)} queries using {embedding_model} ...")
    model = SentenceTransformer(embedding_model)
    query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)

    # Load the collection
    collection = Collection(collection_name)
    collection.load()
    logger.info(f"Loaded collection {collection_name} | Host: {host} | Port: {port}...")

    if index_search_params is None:
        index_search_params = {}

    results = collection.search(
        data=query_embeddings.tolist(),
        anns_field="embedding", 
        param=index_search_params,
        limit=top_k,
        metric_type=similarity_metric_type,
        output_fields=["source", "chunk_id", "preview", "full"]
    )

    all_results = []
    for i, hits in enumerate(results):
        query_results = []
        for hit in hits:
            query_results.append({
                "id": hit.id,
                "score": hit.score,
                "source": hit.entity.get("source"),
                "chunk_id": hit.entity.get("chunk_id"),
                "preview": hit.entity.get("preview"),
                "full": hit.entity.get("full"),
                "query": queries[i]
            })
        all_results.append(query_results)

    return all_results

In [31]:
# Config
index_search_params = None
top_k = 5

# Same as during storing in target collection
embedding_model = 'sentence-transformers/all-mpnet-base-v2'
similarity_metric_type = "IP"
alias = "default"
host = "localhost"
port = "19530"
dataset_name = "kyndryl_pdfs"
collection_name = build_collection_name(dataset=dataset_name, index_type=index_type, similarity_metric_type=similarity_metric_type, hyperparameters=hyperparameters)

# Query
queries = [
    "How much does Kyndryl cover for surgeries"
]

# Main
all_results = search_milvus(
    queries=queries,
    embedding_model=embedding_model,
    similarity_metric_type=similarity_metric_type,
    index_search_params=index_search_params,
    top_k=top_k,
    alias=alias,
    host=host,
    port=port,
    collection_name=collection_name
)

2025-09-19 13:52:42,136 [INFO] Encoding 1 queries using sentence-transformers/all-mpnet-base-v2 ...


2025-09-19 13:52:42,158 [INFO] Use pytorch device_name: cpu
2025-09-19 13:52:42,160 [INFO] Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]


In [20]:
for i, results in enumerate(all_results):
    print(f"Results for Query {i+1}: {queries[i]}")
    for r in results:
        print(f"    [{r['score']:.4f}] {r['preview']}")
        
# super interesting (or not really)
# Results and scores were exactly the same as FAISS (Same similarity metric and index, IP + Flat)

# (1) sentence-transformers/all-MiniLM-L6-v2: 
# Results for Query 1: How much does Kyndryl cover for surgeries
    # [0.6130] Yes, you could but the hospital might impose an administration fee. The administration fee is not payable under Kyndryl medical program.  3. Q: If I am covered under Kyndryl’s medical insurance polici...
    # [0.5646] 101.42 Dependant 99.83 Deluxe (Opt-up) Employee 209.17 Dependant 204.83 Note:  ▪ Kyndryl’s 2025 price tags have been adjusted to reflect the rising cost of medical treatment.  ▪ Kyndryl’s flex benefit...
    # [0.5614] top of the bill.  There must not be any  outstanding amount due. 38 KYNDRYL FUNDED  PROGRAMS 38  39 Kyndryl Funded Benefits ▪ Applicable to dependants enrolled under Insured Medical Plan ▪ This progra...
    # [0.5316] Kyndryl  funds these for  employees) Policy No. 79343 53015577 79343 Eligibility All Employees All Employees All Employees & their Eligible Dependants Basis of Cover • All Employees: 2 times  annual g...
    # [0.4904] surgery is covered *S$35,000 Surgical benefits are Up to 50%  of max. limit Surgical schedule applies to limit  above S$1,500 for surgery in  private hospitals Day surgery is covered 3 Surgical Benefi..

# (2) 'sentence-transformers/all-mpnet-base-v2'
# Results for Query 1: How much does Kyndryl cover for surgeries
#     [0.6114] Yes, you could but the hospital might impose an administration fee. The administration fee is not payable under Kyndryl medical program.  3. Q: If I am covered under Kyndryl’s medical insurance polici...
#     [0.5777] 101.42 Dependant 99.83 Deluxe (Opt-up) Employee 209.17 Dependant 204.83 Note:  ▪ Kyndryl’s 2025 price tags have been adjusted to reflect the rising cost of medical treatment.  ▪ Kyndryl’s flex benefit...
#     [0.5753] Kyndryl  funds these for  employees) Policy No. 79343 53015577 79343 Eligibility All Employees All Employees All Employees & their Eligible Dependants Basis of Cover • All Employees: 2 times  annual g...
#     [0.5571] surgery is covered *S$35,000 Surgical benefits are Up to 50%  of max. limit Surgical schedule applies to limit  above S$1,500 for surgery in  private hospitals Day surgery is covered 3 Surgical Benefi...
#     [0.5461] for surgery in private  hospitals Day surgery is covered *$25,000 Surgical benefits are Up to 50% of  max. limit Surgical schedule applies to limit  above S$1,500 for surgery in private  hospitals Day

Results for Query 1: How much does Kyndryl cover for surgeries
    [0.6114] Yes, you could but the hospital might impose an administration fee. The administration fee is not payable under Kyndryl medical program.  3. Q: If I am covered under Kyndryl’s medical insurance polici...
    [0.5777] 101.42 Dependant 99.83 Deluxe (Opt-up) Employee 209.17 Dependant 204.83 Note:  ▪ Kyndryl’s 2025 price tags have been adjusted to reflect the rising cost of medical treatment.  ▪ Kyndryl’s flex benefit...
    [0.5753] Kyndryl  funds these for  employees) Policy No. 79343 53015577 79343 Eligibility All Employees All Employees All Employees & their Eligible Dependants Basis of Cover • All Employees: 2 times  annual g...
    [0.5571] surgery is covered *S$35,000 Surgical benefits are Up to 50%  of max. limit Surgical schedule applies to limit  above S$1,500 for surgery in  private hospitals Day surgery is covered 3 Surgical Benefi...
    [0.5461] for surgery in private  hospitals Day surgery is covered