In [1]:
!pip install pandas openai langchain PyMuPDF docx2txt faiss-cpu Pillow chromadb langchain-community langchain-text-splitters
import langchain

Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)

In [8]:
from pathlib import Path
import pandas as pd
import datetime
import torch

# PDF parsing
import fitz  # PyMuPDF for PDF text + images
import docx2txt
import base64

# Vector store & embeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_community.docstore.document import Document
from langchain_community.embeddings import OpenAIEmbeddings
from PIL import Image
import io

# Text splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Optional: multimodal embeddings (OpenAI)
# text embedding: "text-embedding-3-small"
# image embedding: "image-embedding-3-small"
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from io import BytesIO
import os

In [33]:
client = OpenAI(api_key="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")

In [9]:
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Image embeddings (CLIP)
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model.to(device)

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [10]:
import numpy as np
def load_documents(data_dir: str):
    docs = []

    for file_path in Path(data_dir).glob("*"):
        base_meta = {
            "source": file_path.name,
            "ingested_at": datetime.datetime.utcnow().isoformat()
        }


        # -------- PDF --------
        if file_path.suffix == ".pdf":
            pdf = fitz.open(str(file_path))
            for page_num, page in enumerate(pdf):

                # PDF text
                text = page.get_text("text")
                if text.strip():
                    docs.append(Document(
                        page_content=text,
                        metadata={**base_meta, "type": "text", "page": page_num}
                    ))

                # PDF images
                for img in page.get_images(full=True):
                    pix = fitz.Pixmap(pdf, img[0])
                    if pix.n - pix.alpha < 4:
                        img_b64 = base64.b64encode(
                            pix.tobytes()
                        ).decode("utf-8")

                        docs.append(Document(
                            page_content="[IMAGE]",
                            metadata={
                                **base_meta,
                                "type": "image",
                                "page": page_num,
                                "image_base64": img_b64
                            }
                        ))
                    pix = None

    return docs

# --------------------------------------------
# 3️⃣ Chunk Text
# --------------------------------------------

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

def chunk_documents(docs):
    out = []
    for d in docs:
        if d.metadata["type"] in ["image", "table"]:
            out.append(d)
        else:
            for c in splitter.split_text(d.page_content):
                out.append(Document(
                    page_content=c,
                    metadata=d.metadata
                ))
    return out

# --------------------------------------------
# 4️⃣ Embedding Function
# --------------------------------------------

def embed_document(doc: Document) -> np.ndarray:
    if doc.metadata["type"] == "image":
        img_bytes = base64.b64decode(doc.metadata["image_base64"])
        img = Image.open(BytesIO(img_bytes)).convert("RGB")

        inputs = clip_processor(
            images=img,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            vec = clip_model.get_image_features(**inputs)

        return vec.squeeze().cpu().numpy()

    else:
        return text_model.encode(doc.page_content)

# --------------------------------------------
# 5️⃣ Build FAISS (FIXED ✅)
# --------------------------------------------

def build_faiss_index(docs):
    embeddings = []
    texts = []
    metadatas = []

    for doc in docs:
        embeddings.append(np.array(embed_document(doc)))
        texts.append(doc.page_content)
        metadatas.append(doc.metadata)

    return FAISS.from_embeddings(
        embeddings=embeddings,
        texts=texts,
        metadatas=metadatas
    )

# --------------------------------------------
# 6️⃣ Query
# --------------------------------------------

def query_index(vectorstore, query, k=5):
    results = vectorstore.similarity_search(query, k=k)
    for r in results:
        print("\n---")
        print("Source:", r.metadata["source"])
        print("Page:", r.metadata.get("page"))
        print("Type:", r.metadata["type"])
        print(r.page_content[:300])

# --------------------------------------------
# 7️⃣ Run
# --------------------------------------------

if __name__ == "__main__":
    docs = load_documents("data/")
    docs = chunk_documents(docs)
    out_dir = 'medicalmanual_db'
    report = 'Medical_QnA'
    embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

    if not os.path.exists(out_dir):
         os.makedirs(out_dir)

    vectorstore = Chroma.from_documents(
       docs,
       embedding_model,
       persist_directory=out_dir
    )

    query_index(vectorstore, "How does FAISS similarity search work?")


  "ingested_at": datetime.datetime.utcnow().isoformat()



---
Source: Agent Quality.pdf
Page: 21
Type: text
Agent Quality
November 2025
22
Automated Metrics
Automated metrics provide speed and reproducibility. They are useful for regression testing 
and benchmarking outputs. Examples include:
•	 String-based similarity (ROUGE, BLEU), comparing generated text to references.
•	 Embedding-based similarity (B

---
Source: Agent Quality.pdf
Page: 19
Type: text
error state returned by the tool (e.g., an API's 404 error) and proceeding as if the call 
was successful.
4.	RAG Performance: If the agent uses Retrieval-Augmented Generation (RAG), the 
trajectory depends on the quality of its retrieved information. Failures include irrelevant 
document retrieval,

---
Source: Agent Quality.pdf
Page: 38
Type: text
"success" span.
•	 Tool Usage Frequency: A count of how often each tool (e.g., get_weather) appears as 
a span name, revealing which tools are most valuable.
 
These metrics are essential for operations, setting alerts, and managing the cost and