In [3]:
import sys
from pathlib import Path

# Make the project root (two levels above the current working directory)
# importable so the local `utils` and `core` packages resolve.
# The membership guard keeps re-runs of this cell from piling up duplicate
# sys.path entries.
_project_root = str(Path.cwd().parent.parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)
from utils.vectordb import build_collection_name
from core.embedder import get_embeddings_and_metadata
from core.milvus_index import build_milvus_index


# Configuration for the ingestion -> embedding -> Milvus indexing run.

# -- Ingestion + chunking + embedding --
# Documents are read from <current working directory>/data/kyndryl-docs-test.
input_dir = Path.cwd().joinpath("data", "kyndryl-docs-test")
model_name = "sentence-transformers/all-mpnet-base-v2"
# Chunk length and overlap, both noted as character counts by the author.
# NOTE(review): an earlier comment equated 200 characters with ~150-250
# tokens, which looks too high -- confirm the unit the chunker really uses.
chunk_size = 200
chunk_overlap = 50
# Batch size passed to get_embeddings_and_metadata.
# NOTE(review): previously described as "batch building of index" -- confirm
# which stage it actually batches.
batch_size = 64

# -- Index construction --
similarity_metric_type = "IP"
index_type = "IVF_PQ"
hyperparameters = {
    # A single IVF cluster, so every search scans all vectors; intended by the
    # author to imitate an exhaustive (flat-style) search.
    "nlist": 1,
    # Number of sub-vectors used by product quantization.
    "m": 16,
    # Bits used to encode each sub-vector.
    "nbits": 8,
}
host = "localhost"
port = "19530"
dataset_name = "kyndryl_pdfs"
# Collection name is built from the dataset name plus the index settings above.
collection_name = build_collection_name(
    dataset=dataset_name,
    index_type=index_type,
    similarity_metric_type=similarity_metric_type,
    hyperparameters=hyperparameters,
)

# Step 1: produce embeddings and their metadata from the documents in
# input_dir, using the model and chunking settings configured above.
# (presumably one metadata record per embedding -- confirm against
# core.embedder)
embeddings, metadatas = get_embeddings_and_metadata(
    model_name=model_name,
    input_dir=input_dir,
    batch_size=batch_size,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Step 2: build the Milvus index from those embeddings.
# Ensure milvus docker containers are running first:
#   >> docker compose up -d
build_milvus_index(
    host=host,
    port=port,
    collection_name=collection_name,
    index_type=index_type,
    similarity_metric_type=similarity_metric_type,
    hyperparameters=hyperparameters,
    embeddings=embeddings,
    metadatas=metadatas,
)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 