In [None]:
import os
import time
from dotenv import load_dotenv
import pdfplumber
from pymilvus import connections, Collection
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Zilliz
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from docx import Document as DocxDocument

In [None]:
# Load settings from a local .env file into the process environment.
load_dotenv()

# Zilliz connection settings; stop here if either one is absent.
ZILLIZ_HOST = os.getenv("ZILLIZ_HOST")
ZILLIZ_API_KEY = os.getenv("ZILLIZ_API_KEY")
assert ZILLIZ_HOST and ZILLIZ_API_KEY, "❌ Zilliz credentials missing"

# The OpenAI clients further down are constructed without an explicit key,
# so check for it now instead of failing in the middle of the run.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
assert OPENAI_API_KEY, "❌ OpenAI API key missing"

In [None]:
# Folder holding the source PDFs. Set the PDF_FOLDER environment variable to
# point somewhere else; when it is unset the original local path is used.
pdf_folder = os.getenv(
    "PDF_FOLDER",
    r"C:\Users\Nahid\OneDrive - Calmi2\Desktop\Agentic AI\3-Data Transformer\2.4-VectorDatabase\Pinecone\uploaded_files",
)

# Match the extension case-insensitively so files such as "REPORT.PDF" are not
# skipped, and sort because os.listdir() returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

# Initialize containers
all_text = []      # one string per page that yielded text
all_tables = []    # every non-empty table found on any page
total_pages = 0

# Process each PDF
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)

    with pdfplumber.open(pdf_path) as pdf:
        total_pages += len(pdf.pages)

        for page in pdf.pages:
            # Extract text; pages without any text are left out
            text = page.extract_text()
            if text:
                all_text.append(text)

            # Extract tables
            tables = page.extract_tables()
            for table in tables:
                if table:  # skip empty tables
                    all_tables.append(table)

# Summary
print(f"✅ PDFs processed: {len(pdf_files)}")
print(f"📄 Total pages: {total_pages}")
print(f"📝 Text chunks collected: {len(all_text)}")
print(f"📊 Tables extracted: {len(all_tables)}")


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

✅ PDFs processed: 2
📄 Total pages: 91
📝 Text chunks collected: 91
📊 Tables extracted: 38


In [None]:
# Split each page's text on its own, so no chunk spans a page boundary.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,           # maximum chunk length, in characters with the splitter's default length function
    chunk_overlap=50,         # for context preservation
    separators=["\n\n", "\n", ".", " ", ""]
)

# Apply chunking
all_chunks = []
for page_text in all_text:
    if page_text:  # skip empty pages
        chunks = text_splitter.split_text(page_text)
        all_chunks.extend(chunks)

print(f"✅ Total chunks created: {len(all_chunks)}")
# Only show a sample when there is one; indexing an empty list would raise IndexError.
if all_chunks:
    print(f"🧩 Sample chunk:\n{all_chunks[0][:300]}")
else:
    print("⚠️ No chunks created: no text was extracted from the PDFs.")


✅ Total chunks created: 712
🧩 Sample chunk:
Llama 2: Open Foundation and Fine-Tuned Chat Models
HugoTouvron∗ LouisMartin† KevinStone†
PeterAlbert AmjadAlmahairi YasmineBabaei NikolayBashlykov SoumyaBatra
PrajjwalBhargava ShrutiBhosale DanBikel LukasBlecher CristianCantonFerrer MoyaChen
GuillemCucurull DavidEsiobu JudeFernandes JeremyFu Wenyin


In [None]:
# Collection that is opened below, and the single question used for both the
# index benchmarks and the final answer.
collection_name = "rag_docs_collection"
query = "What were the response rates, remission results, or survival outcomes in the study?"

# OpenAI clients (requires OPENAI_API_KEY in .env). The embedding model is
# created once and reused by every vector store built later.
embedding_model = OpenAIEmbeddings()
llm = ChatOpenAI(model="gpt-4", temperature=0)

# Register the default pymilvus connection used by Collection() below.
connections.connect(
    alias="default",
    host=ZILLIZ_HOST,
    port=443,
    secure=True,
    token=ZILLIZ_API_KEY
)

# NOTE(review): no visible cell inserts the chunks built earlier, so this
# collection is presumably already populated; confirm.
collection = Collection(collection_name)


In [44]:
# Index build settings to compare, keyed by a short lowercase label.
index_configs = dict(
    flat=dict(index_type="FLAT", metric_type="L2", params={}),
    hnsw=dict(index_type="HNSW", metric_type="L2", params=dict(M=8, efConstruction=64)),
    ivf_flat=dict(index_type="IVF_FLAT", metric_type="L2", params=dict(nlist=128)),
)

# Query-time settings, stored under the same labels as index_configs.
search_param_map = dict(
    flat=dict(metric_type="L2"),
    hnsw=dict(metric_type="L2", params=dict(ef=32)),
    ivf_flat=dict(metric_type="L2", params=dict(nprobe=10)),
)


In [45]:
# Measure average retrieval speed
def benchmark_retriever(retriever, query, runs=5):
    """Return the mean wall-clock seconds taken by one retrieval of ``query``.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``.
        query: Query passed unchanged to the retriever on every run.
        runs: Number of timed calls to average over; must be at least 1.

    Returns:
        Average duration, in seconds, of a single ``get_relevant_documents`` call.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    total_seconds = 0.0
    for _ in range(runs):
        # perf_counter() is monotonic, unlike time.time(), which can jump when
        # the system clock is adjusted and skew a short measurement.
        start = time.perf_counter()
        retriever.get_relevant_documents(query)
        total_seconds += time.perf_counter() - start
    return total_seconds / runs

# Measure how relevant the retrieved results are
def evaluate_precision_at_k(retriever, query, expected_keywords, k=5):
    """Return the fraction of the top ``k`` results that contain an expected keyword.

    A document counts as a hit when its ``page_content`` contains at least one
    of ``expected_keywords``, compared case-insensitively.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``.
        query: Query passed to the retriever.
        expected_keywords: Iterable of keyword strings that mark a document as relevant.
        k: Number of top results to score; must be at least 1.

    Returns:
        ``hits / k`` as a float between 0.0 and 1.0. If the retriever returns
        fewer than ``k`` documents the missing ones count as misses.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # The content is lower-cased below, so the keywords must be as well,
    # otherwise a keyword containing a capital letter could never match.
    keywords = [kw.lower() for kw in expected_keywords]

    # Score only the first k results so the ratio cannot exceed 1.0 when the
    # retriever returns more than k documents.
    top_docs = retriever.get_relevant_documents(query)[:k]
    hits = sum(
        1
        for doc in top_docs
        if any(kw in doc.page_content.lower() for kw in keywords)
    )
    return hits / k


In [46]:
# Keywords whose presence marks a retrieved chunk as relevant to `query`.
expected_keywords = ["outcome", "survival", "response", "remission", "improvement"]
# Collected metrics, keyed by the upper-cased index label.
benchmark_results = {}

for index_name, index_params in index_configs.items():
    print(f"\n🔄 Testing index: {index_name.upper()}")

    # Drop old index if exists. Best effort: a failure here is reported and
    # the run continues with index creation.
    try:
        collection.release()
        collection.drop_index()
        print("🗑️ Dropped old index.")
    except Exception as e:
        print(f"⚠️ Skipped drop: {e}")

    # Create new index
    collection.create_index(field_name="embedding", index_params=index_params)
    collection.load()
    print("✅ Index created.")

    # Setup vector store
    vectorstore = Zilliz(
        embedding_function=embedding_model,
        collection_name=collection_name,
        connection_args={
            "host": ZILLIZ_HOST,
            "port": 443,
            "secure": True,
            "token": ZILLIZ_API_KEY,
        },
        vector_field="embedding",
        text_field="content",
        auto_id=True,
        index_params={},     # Placeholder
        search_params={}     # Placeholder
    )

    # Setup retriever with the current index's search parameters. The vector
    # store's similarity_search() receives them through its `param` keyword;
    # under the former `search_params` key the ef / nprobe settings from
    # search_param_map were not applied to the search.
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5, "param": search_param_map[index_name]}
    )

    # Evaluate speed and relevance
    avg_time = benchmark_retriever(retriever, query)
    precision = evaluate_precision_at_k(retriever, query, expected_keywords)

    benchmark_results[index_name.upper()] = {
        "retrieval_time": avg_time,
        "precision@5": precision
    }

    print(f"⏱️ Avg Time: {avg_time:.4f} sec")
    print(f"🎯 Precision@5: {precision:.2f}")



🔄 Testing index: FLAT
🗑️ Dropped old index.
✅ Index created.
⏱️ Avg Time: 0.4693 sec
🎯 Precision@5: 0.20

🔄 Testing index: HNSW
🗑️ Dropped old index.
✅ Index created.
⏱️ Avg Time: 0.3249 sec
🎯 Precision@5: 0.20

🔄 Testing index: IVF_FLAT
🗑️ Dropped old index.
✅ Index created.
⏱️ Avg Time: 0.2727 sec
🎯 Precision@5: 0.20


In [47]:
# Recap: one entry per benchmarked index, printed in the order the results were stored.
print("\n📊 Final Benchmark Summary")
for index_label, index_metrics in benchmark_results.items():
    print(f"🔹 {index_label}:")
    elapsed = index_metrics['retrieval_time']
    print(f"   ⏱️ Retrieval Time: {elapsed:.4f} sec")
    hit_ratio = index_metrics['precision@5']
    print(f"   🎯 Precision@5:    {hit_ratio:.2f}")



📊 Final Benchmark Summary
🔹 FLAT:
   ⏱️ Retrieval Time: 0.4693 sec
   🎯 Precision@5:    0.20
🔹 HNSW:
   ⏱️ Retrieval Time: 0.3249 sec
   🎯 Precision@5:    0.20
🔹 IVF_FLAT:
   ⏱️ Retrieval Time: 0.2727 sec
   🎯 Precision@5:    0.20


In [49]:
# Pick your best retriever or rerun using MMR
vectorstore = Zilliz(
    embedding_function=embedding_model,
    collection_name=collection_name,
    connection_args={
        "host": ZILLIZ_HOST,
        "port": 443,
        "secure": True,
        "token": ZILLIZ_API_KEY,
    },
    vector_field="embedding",
    text_field="content",
    primary_field="id",   # 👈 Make sure this matches the actual primary key field
    auto_id=True,
)

retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 15, "lambda_mult": 0.5}
)

top_docs = retriever.get_relevant_documents(query)
context = "\n\n".join(doc.page_content for doc in top_docs)


In [50]:
# Chat prompt: a fixed system role plus a user turn with {context} and {question} slots.
chat_messages = [
    ("system", "You are a helpful assistant that summarizes scientific medical findings."),
    ("user", "Using the following context:\n\n{context}\n\nAnswer the question:\n{question}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Pipe the filled-in prompt into the chat model and run it once.
response_chain = prompt | llm
final_answer = response_chain.invoke(dict(context=context, question=query))

answer_text = final_answer.content
print("✅ LLM Response:\n", answer_text)

# Write the answer to a Word document: one heading followed by the answer text.
doc = DocxDocument()
doc.add_heading("LLM Response to Study Summary", level=1)
doc.add_paragraph(answer_text)
doc.save("llm_output_summary.docx")
print("📄 Saved: llm_output_summary.docx")


✅ LLM Response:
 The provided text does not provide specific information on response rates, remission results, or survival outcomes in the study. The text primarily discusses the performance decrease in a model when excluding certain information, such as surgery type and N stage. However, without additional context, it's not possible to provide a summary of the response rates, remission results, or survival outcomes.
📄 Saved: llm_output_summary.docx
