In [6]:
import os
import getpass
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the API keys
# looked up through os.environ in later cells are available.
load_dotenv()


True

In [7]:
import os
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# On-disk location of the Chroma store; a relative path, so it resolves against
# the notebook's current working directory.
PERSISTENT_DIR = "./chroma_db"
# Name of the collection that get_collection() opens and ingest_data_dir() rebuilds.
COLLECTION_NAME = "rag_mcp"

def get_collection():
    """Open the persistent Chroma store and return ``(client, collection)``.

    The collection named ``COLLECTION_NAME`` is fetched, or created if it does
    not exist yet, and is bound to an OpenAI embedding function. The API key is
    read from the ``OPENAI_API_KEY`` environment variable, so a missing key
    raises ``KeyError``.
    """
    chroma_client = chromadb.PersistentClient(path=PERSISTENT_DIR)

    # Fallback model if ada-002 is ever retired: text-embedding-3-small.
    openai_embedder = OpenAIEmbeddingFunction(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-ada-002",
    )

    return chroma_client, chroma_client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=openai_embedder,
    )


In [8]:
import os
import nest_asyncio
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse

# A Jupyter kernel already runs an asyncio event loop; nest_asyncio allows a
# second loop to be started inside it (presumably what LlamaParse needs here).
nest_asyncio.apply()  # REQUIRED for Jupyter on Windows

# Folder of source files to ingest. Set the RAG_DATA_DIR environment variable to
# point somewhere else; the original machine-specific path stays as the default.
DATA_DIR = os.environ.get("RAG_DATA_DIR", r"D:\Narwal\mcp_rag\data")
# Raises KeyError right here if the key is not set.
LLAMA_CLOUD_API_KEY = os.environ["LLAMA_CLOUD_API_KEY"]

def ingest_data_dir(batch_size: int = 32):
    """Rebuild the Chroma collection from the files under ``DATA_DIR``.

    The existing collection is deleted and re-created, so every call starts
    from an empty collection. Files are loaded with ``SimpleDirectoryReader``;
    ``.pdf`` files are routed through ``LlamaParse`` with ``result_type="text"``.
    Each loaded document is stored with its text, metadata and ``doc_id``.

    Args:
        batch_size: Number of documents sent to ``collection.add`` per call.
            Batching avoids one embedding round-trip per document.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    client, _ = get_collection()

    # wipe and re-create collection (dev-safe)
    try:
        client.delete_collection(name=COLLECTION_NAME)
    except Exception as exc:
        # get_collection() has just created the collection if it was missing,
        # so a failure here is not the harmless "does not exist" case. Stay
        # best-effort, but report it instead of hiding it.
        print(f"Warning: could not delete collection {COLLECTION_NAME!r}: {exc}")

    client, collection = get_collection()

    parser = LlamaParse(
        api_key=LLAMA_CLOUD_API_KEY,
        result_type="text",
    )

    documents = SimpleDirectoryReader(
        DATA_DIR,
        file_extractor={".pdf": parser},
    ).load_data()

    # An empty `documents` list yields no iterations, so add() is never called
    # with empty lists.
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        collection.add(
            documents=[doc.text for doc in batch],
            metadatas=[doc.metadata for doc in batch],
            ids=[doc.doc_id for doc in batch],
        )

    print(f"Ingested {collection.count()} documents")


In [2]:
import os, requests
from dotenv import load_dotenv
# Make variables from a .env file available through os.environ.
load_dotenv()
# Indexing os.environ directly raises KeyError here if the key is not set.
VOYAGE_API_KEY = os.environ["VOYAGE_API_KEY"]
# Endpoint that voyage_rerank() posts to.
VOYAGE_RERANK_URL = "https://api.voyageai.com/v1/rerank"
def voyage_rerank(query: str, chunks: list[str], top_k: int = 5, model: str = "rerank-2.5",
                  timeout: float = 30.0):
    """Rerank ``chunks`` against ``query`` with the Voyage rerank endpoint.

    Args:
        query: The search query.
        chunks: Candidate texts, sent as the request's ``documents`` field.
        top_k: Sent as the request's ``top_k`` field.
        model: Sent as the request's ``model`` field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on a stalled connection.

    Returns:
        The decoded JSON response. In the output recorded in this notebook it
        is a dict whose ``data`` list holds ``relevance_score`` / ``index``
        entries, alongside ``model`` and ``usage``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    payload = {
        "model": model,
        "query": query,
        "documents": chunks,
        "top_k": top_k,
    }
    headers = {"Authorization": f"Bearer {VOYAGE_API_KEY}"}
    r = requests.post(VOYAGE_RERANK_URL, json=payload, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.json()

In [3]:
# Smoke test for voyage_rerank(): one query and ten candidate passages.
query = "What are the health benefits of meditation?"

# Candidate passages, all mentioning meditation. Their list positions matter:
# the 'index' values in the response presumably refer back to positions here.
documents = [
    "Several clinical studies have shown that regular meditation can help reduce stress and anxiety levels.",
    "Meditation involves focusing the mind and eliminating distractions, often through breathing techniques or guided imagery.",
    "A daily meditation practice has been associated with lower blood pressure and improved sleep quality in adults.",
    "The city of Kyoto is famous for its Zen temples, where meditation has been practiced for centuries.",
    "People who meditate frequently often report feeling calmer and more focused throughout the day.",
    "Research suggests meditation may lower the risk of heart disease by reducing inflammation and improving heart rate variability.",
    "Meditation apps have become increasingly popular, offering guided sessions on mindfulness and relaxation.",
    "A 2021 meta-analysis found that meditation can reduce symptoms of depression when used alongside other treatments.",
    "Some forms of meditation emphasize compassion and kindness, aiming to improve emotional well-being.",
    "Athletes sometimes use meditation techniques to enhance concentration and mental resilience during competition.",
]
# Request the top 5 results; the bare `ranks` on the last line displays the raw response.
ranks = voyage_rerank(query, documents, top_k=5)
ranks

{'object': 'list',
 'data': [{'relevance_score': 0.84765625, 'index': 2},
  {'relevance_score': 0.84375, 'index': 0},
  {'relevance_score': 0.8203125, 'index': 5},
  {'relevance_score': 0.8125, 'index': 4},
  {'relevance_score': 0.796875, 'index': 7}],
 'model': 'rerank-2.5',
 'usage': {'total_tokens': 214}}

In [None]:
ranks

{'object': 'list',
 'data': [{'relevance_score': 0.84765625, 'index': 2},
  {'relevance_score': 0.84375, 'index': 0},
  {'relevance_score': 0.8203125, 'index': 5},
  {'relevance_score': 0.8125, 'index': 4},
  {'relevance_score': 0.796875, 'index': 7}],
 'model': 'rerank-2.5',
 'usage': {'total_tokens': 214}}

: 