In [6]:
import os
import getpass
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment;
# later cells read their API keys from os.environ.
load_dotenv()


True

In [7]:
import os
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Path handed to chromadb.PersistentClient. It is relative, so it resolves
# against the kernel's current working directory.
PERSISTENT_DIR = "./chroma_db"
# Name of the collection that get_collection() fetches or creates.
COLLECTION_NAME = "rag_mcp"

def get_collection():
    """Open the persistent Chroma store and return ``(client, collection)``.

    The collection named ``COLLECTION_NAME`` is fetched, or created when it
    does not exist yet, with an OpenAI embedding function attached.

    Returns:
        tuple: the ``chromadb.PersistentClient`` and the collection object.

    Raises:
        KeyError: if ``OPENAI_API_KEY`` is missing from the environment.
    """
    chroma_client = chromadb.PersistentClient(path=PERSISTENT_DIR)

    openai_embedder = OpenAIEmbeddingFunction(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-ada-002",  # works; if deprecated, use text-embedding-3-small
    )

    return chroma_client, chroma_client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=openai_embedder,
    )


In [8]:
import os
import nest_asyncio
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse

nest_asyncio.apply()  # REQUIRED for Jupyter on Windows

# Directory whose files are ingested. Set the RAG_DATA_DIR environment
# variable (e.g. in .env) to run the notebook on another machine; the
# original local path stays as the default so existing setups keep working.
DATA_DIR = os.environ.get("RAG_DATA_DIR", r"D:\Narwal\mcp_rag\data")
# Fails fast with KeyError when the LlamaParse key is not configured.
LLAMA_CLOUD_API_KEY = os.environ["LLAMA_CLOUD_API_KEY"]

def ingest_data_dir(data_dir=None):
    """Rebuild the Chroma collection from the files in a data directory.

    The existing collection is deleted and re-created, every file under the
    directory is loaded with ``SimpleDirectoryReader`` (PDFs are routed
    through LlamaParse with ``result_type="text"``), and each loaded document
    is added to the collection under its ``doc_id``.

    Args:
        data_dir: Directory to ingest. Defaults to the module-level
            ``DATA_DIR`` when omitted.

    Returns:
        None. Progress and warnings are printed.
    """
    source_dir = DATA_DIR if data_dir is None else data_dir

    # get_collection() creates the collection when it is missing, so the
    # delete below always has something to remove.
    client, _ = get_collection()

    # wipe and re-create collection (dev-safe)
    try:
        client.delete_collection(name=COLLECTION_NAME)
    except Exception as exc:
        # Still best-effort, but no longer silent: if the wipe fails, the
        # old entries stay in place and this run's documents are added on
        # top of them, so the reader needs to know.
        print(
            f"Warning: could not delete collection {COLLECTION_NAME!r}; "
            f"existing entries were kept ({exc!r})"
        )

    client, collection = get_collection()

    parser = LlamaParse(
        api_key=LLAMA_CLOUD_API_KEY,
        result_type="text",
    )

    file_extractor = {".pdf": parser}

    documents = SimpleDirectoryReader(
        source_dir,
        file_extractor=file_extractor,
    ).load_data()

    # NOTE(review): LlamaParse reports parse failures on stdout (e.g.
    # "Event loop is closed") instead of raising, and the affected file then
    # appears to contribute no documents -- confirm against the parser logs.
    if not documents:
        print(f"Warning: no documents were loaded from {source_dir!r}")

    skipped = 0
    for doc in documents:
        # A document without text has nothing to retrieve, and embedding an
        # empty string may be rejected by the embedding backend.
        if not doc.text or not doc.text.strip():
            skipped += 1
            continue

        entry = {"documents": [doc.text], "ids": [doc.doc_id]}
        # Only pass metadata when there is some; Chroma's validation is
        # expected to reject an empty dict -- TODO confirm for the installed
        # version.
        if doc.metadata:
            entry["metadatas"] = [doc.metadata]
        collection.add(**entry)

    if skipped:
        print(f"Skipped {skipped} documents with no text")

    print(f"Ingested {collection.count()} documents")


In [9]:
# Wipes and rebuilds the collection from the files in DATA_DIR; PDFs are
# parsed through LlamaParse, so this cell needs network access.
ingest_data_dir()


2025-12-22 02:15:07,110 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 617263f5-21c4-4ecf-9a41-8c6842f4964b


2025-12-22 02:15:08,370 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/617263f5-21c4-4ecf-9a41-8c6842f4964b "HTTP/1.1 200 OK"
2025-12-22 02:15:10,654 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/617263f5-21c4-4ecf-9a41-8c6842f4964b "HTTP/1.1 200 OK"
2025-12-22 02:15:13,873 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/617263f5-21c4-4ecf-9a41-8c6842f4964b "HTTP/1.1 200 OK"
2025-12-22 02:15:18,191 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/617263f5-21c4-4ecf-9a41-8c6842f4964b "HTTP/1.1 200 OK"
2025-12-22 02:15:45,022 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/617263f5-21c4-4ecf-9a41-8c6842f4964b "HTTP/1.1 200 OK"
2025-12-22 02:15:45,431 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/617263f5-21c4-4ecf-9a41-8c6842f4964b/result/text "HTTP/1.1 200 OK"


Error while parsing the file '<bytes/buffer>': Event loop is closed


2025-12-22 02:15:59,264 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 2b57ffe1-e8a1-4b78-aa60-5a6bc960cc3f


2025-12-22 02:16:00,586 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/2b57ffe1-e8a1-4b78-aa60-5a6bc960cc3f "HTTP/1.1 200 OK"
2025-12-22 02:16:03,146 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/2b57ffe1-e8a1-4b78-aa60-5a6bc960cc3f "HTTP/1.1 200 OK"
2025-12-22 02:16:06,423 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/2b57ffe1-e8a1-4b78-aa60-5a6bc960cc3f "HTTP/1.1 200 OK"
2025-12-22 02:16:10,702 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/2b57ffe1-e8a1-4b78-aa60-5a6bc960cc3f "HTTP/1.1 200 OK"
2025-12-22 02:16:16,663 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/2b57ffe1-e8a1-4b78-aa60-5a6bc960cc3f "HTTP/1.1 200 OK"
2025-12-22 02:16:22,352 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/2b57ffe1-e8a1-4b78-aa60-5a6bc960cc3f "HTTP/1.1 200 OK"
2025-12-22 02:16:29,155 - INFO - HTTP Request: GET https://api.cloud.llamain

: 