In [28]:
%pip install llama-index 
%pip install llama-index-llms-gemini
%pip install pymupdf
%pip install nest_asyncio
%pip install matplotlib
%pip install python-dotenv
%pip install llama-index-embeddings-huggingface

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
ERROR: unknown command "installllama-index-embeddings-huggingface"
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl.metadata (458 bytes)
Collecting huggingface-hub>=0.19.0 (from huggingface-hub[inference]>=0.19.0->llama-index-embeddings-huggingface)
  Downloading huggingface_hub-1.1.7-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers>=2.6.1 (from llama-index-embeddings-huggingface)
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata

In [10]:
import os
import fitz
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
import nest_asyncio
from dotenv import load_dotenv

# Patch asyncio so event loops can be nested; the notebook kernel already runs one.
nest_asyncio.apply()

In [15]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Use .get() rather than indexing: os.environ[...] raises KeyError for a missing
# variable, which made the friendlier error below unreachable.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# An empty value is as unusable as a missing one, so reject both.
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY not found. Did you create a .env file?")

## Load the PDF and extract its text

In [20]:
import shutil
from pathlib import Path

def upload_pdf(pdf_path: str = None) -> str:
    """
    Copy a local PDF file into the ``sample_docs`` directory.

    Args:
        pdf_path: Path to the PDF. When ``None``, the user is prompted for it.

    Returns:
        The path of the copy inside ``sample_docs``, as a string.

    Raises:
        FileNotFoundError: If the path does not point to an existing file.
        ValueError: If the file does not have a ``.pdf`` extension.
    """
    if pdf_path is None:
        pdf_path = input("Enter the full or relative path to your PDF file: ").strip()

    # expanduser() lets a path such as "~/Documents/file.pdf" work at the prompt.
    src = Path(pdf_path).expanduser()

    # is_file() rather than exists(): a directory is rejected here with a clear
    # message instead of failing later inside shutil.copy.
    if not src.is_file():
        raise FileNotFoundError(f"File not found: {src}")
    if src.suffix.lower() != ".pdf":
        raise ValueError(f"File is not a PDF: {src}")

    dest_dir = Path("sample_docs")
    dest_dir.mkdir(parents=True, exist_ok=True)

    dest = dest_dir / src.name

    # Re-running with a file that already lives in sample_docs would make
    # shutil.copy raise SameFileError, so skip the copy in that case.
    if dest.exists() and src.samefile(dest):
        print(f"PDF already in place at {dest}")
    else:
        shutil.copy(src, dest)
        print(f"PDF copied to {dest}")

    return str(dest)


In [19]:
# No argument is passed, so upload_pdf prompts for the path interactively.
pdf_path = upload_pdf()

PDF copied to sample_docs/LenderFeesWorksheetNew.pdf


In [23]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read every page of a PDF with PyMuPDF and return the combined text.

    Page texts are joined with a newline. A short summary (path, page count,
    word count) is printed as a side effect.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
        page_count = len(pdf)

    text = "\n".join(page_texts)
    word_count = len(text.split())

    print(f"PDF: {pdf_path}")
    print(f"Number of pages: {page_count}")
    print(f"Extracted {word_count} words from the PDF.")

    return text


In [24]:
# Only run the extraction when a PDF path is available.
if pdf_path:
    text = extract_text_from_pdf(pdf_path)
    # Preview the first 500 characters of the extracted text.
    print(text[:500])

PDF: sample_docs/LenderFeesWorksheetNew.pdf
Number of pages: 1
Extracted 404 words from the PDF.
Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.
Fee Details and Summary
Applicants:
Application No:
Date Prepared:
Loan Program:
Prepared By:
THIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees Worksheet" is provided for informational purposes ONLY, to assist
you in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage 
payment. Actual charges may be more or less, and your transac


## Set up a custom loader to integrate PyMuPDF with LlamaIndex

In [25]:
from typing import List
from llama_index.core import Document

def load_pdf_with_pymupdf(pdf_path: str) -> List[Document]:
    """
    Turn each non-empty page of a PDF into a LlamaIndex ``Document``.

    Pages whose text is empty or whitespace-only are skipped. Every document
    carries the file's base name, its 1-based page number and the total page
    count of the PDF in its metadata. A short summary is printed.
    """
    file_name = os.path.basename(pdf_path)

    with fitz.open(pdf_path) as pdf:
        page_count = len(pdf)
        page_texts = [page.get_text() for page in pdf]

    # Numbering starts at 1 and is taken before filtering, so skipped pages
    # leave gaps instead of shifting later page numbers.
    documents: List[Document] = [
        Document(
            text=page_text,
            metadata={
                "file_name": file_name,
                "page_number": page_number,
                "total_pages": page_count,
            },
        )
        for page_number, page_text in enumerate(page_texts, start=1)
        if page_text.strip()
    ]

    print(f"Processed {pdf_path}:")
    print(f"Extracted {len(documents)} pages with content")

    return documents


In [26]:
# One Document per non-empty page of the uploaded PDF.
pdf_docs = load_pdf_with_pymupdf(pdf_path)

Processed sample_docs/LenderFeesWorksheetNew.pdf:
Extracted 1 pages with content


## Create the indexing infrastructure

In [None]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Make Gemini the LLM that LlamaIndex components use by default.
llm = Gemini(model="models/gemini-1.5-flash")
Settings.llm = llm

# Make a Hugging Face sentence-transformers model the default embedding model.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed_model

def process_and_index_pdf(pdf_path: str) -> VectorStoreIndex:
    """
    Load a PDF page by page and build a vector index over the result.

    Prints how many documents were handed to the index and returns the
    ``VectorStoreIndex`` built from them.
    """
    page_documents = load_pdf_with_pymupdf(pdf_path)
    built_index = VectorStoreIndex.from_documents(page_documents)

    document_count = len(page_documents)
    print(f"Indexed {document_count} document chunks")

    return built_index

In [None]:
# Build the vector index that the retrieval cells below query.
index = process_and_index_pdf(pdf_path)

In [None]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings

# NOTE(review): this repeats the LLM setup already done in the indexing cell
# (same model, same Settings.llm assignment).
llm = Gemini(model="models/gemini-1.5-flash")
Settings.llm = llm

def expand_query(query: str, num_expansions: int = 3) -> list:
    """
    Ask the LLM for alternative phrasings of a search query.

    Args:
        query: The original search query.
        num_expansions: Maximum number of alternative queries to return.

    Returns:
        A list of query strings: the original query first, followed by up to
        ``num_expansions`` distinct alternatives with list markers removed.
    """
    import re  # local import: `re` is not imported at the top of the notebook

    prompt = f"""
    I need to search a legal contract with this query: "{query}"

    Please help me expand this query by generating {num_expansions} alternative versions that:
    1. Use different but related terminology
    2. Include relevant legal terms that might appear in a contract
    3. Cover similar concepts but phrased differently

    Format your response as a list of alternative queries only, with no additional text.
    """
    response = llm.complete(prompt)

    # The model is asked for "a list", so lines may come back as "1. ...",
    # "2) ..." or "- ...". Strip that leading marker so it does not end up in
    # the search text. The marker must be followed by whitespace, which leaves
    # text such as "1.5% fee" or "**bold**" untouched.
    list_marker = re.compile(r"^(?:[-*•]|\d+[.)])\s+")

    # Always lead with the original query and never repeat an entry.
    expanded_queries = [query]
    seen = {query}
    for line in response.text.split("\n"):
        candidate = list_marker.sub("", line.strip()).strip()
        if not candidate or candidate in seen:
            continue
        seen.add(candidate)
        expanded_queries.append(candidate)

    # The model may return more lines than requested; honour num_expansions.
    return expanded_queries[: 1 + max(num_expansions, 0)]


In [None]:
# Show the list returned by expand_query, numbered from 1.
expanded = expand_query("What are the penalties for late payments?")
for i, q in enumerate(expanded):
    print(f"{i+1}. {q}")

## Structured query expansion using LlamaIndex's built-in functionality

In [31]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever

def create_query_expansion_engine(index):
    """
    Build a query engine whose retrieval step goes through QueryFusionRetriever.

    The index's own retriever (top 2 results) is wrapped in a fusion retriever
    configured with ``num_queries=3`` and the ``"reciprocal_rerank"`` mode. The
    module-level ``llm`` is used both by the fusion retriever and by the
    returned query engine.
    """
    fusion_retriever = QueryFusionRetriever(
        retrievers=[index.as_retriever(similarity_top_k=2)],
        llm=llm,
        similarity_top_k=2,
        num_queries=3,
        mode="reciprocal_rerank",
    )

    return RetrieverQueryEngine.from_args(
        retriever=fusion_retriever,
        llm=llm,
        verbose=True,
    )


In [None]:
# Build the fusion-based engine on the current index and run one sample question.
expanded_query_engine = create_query_expansion_engine(index)
response = expanded_query_engine.query("What are the penalties for late payments?")
print(response)

Hybrid retrieval combines embedding-based semantic search with keyword-based (BM25) retrieval for better results.

In [34]:
%pip install llama-index-retrievers-bm25

Collecting llama-index-retrievers-bm25
  Downloading llama_index_retrievers_bm25-0.6.5-py3-none-any.whl.metadata (446 bytes)
Collecting bm25s>=0.2.7.post1 (from llama-index-retrievers-bm25)
  Downloading bm25s-0.2.14-py3-none-any.whl.metadata (21 kB)
Collecting pystemmer<3,>=2.2.0.1 (from llama-index-retrievers-bm25)
  Downloading PyStemmer-2.2.0.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.2 kB)
Downloading llama_index_retrievers_bm25-0.6.5-py3-none-any.whl (5.0 kB)
Downloading PyStemmer-2.2.0.3-cp312-cp312-macosx_11_0_arm64.whl (220 kB)
Downloading bm25s-0.2.14-py3-none-any.whl (55 kB)
Installing collected packages: pystemmer, bm25s, llama-index-retrievers-bm25
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [llama-index-retrievers-bm25]
[1A[2KSuccessfully installed bm25s-0.2.14 llama-index-retrievers-bm25-0.6.5 pystemmer-2.2.0.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.retrievers.bm25 import BM25Retriever

def create_hybrid_retriever(index, query, top_k: int = 2):
    """
    Retrieve nodes for ``query`` with vector search and BM25, then merge them.

    Despite its name, this function returns the merged result nodes, not a
    retriever object.

    Args:
        index: Index providing ``as_retriever`` and ``docstore.docs``.
        query: The query to retrieve for.
        top_k: Results requested from each retriever and the maximum returned.

    Returns:
        Up to ``top_k`` nodes, de-duplicated by ``node_id`` (the first
        occurrence wins, so the vector result is kept on a tie) and sorted by
        score, highest first. A missing or ``None`` score counts as 0.0.

    NOTE(review): vector similarities and BM25 scores are presumably on
    different scales, so sorting them together favours whichever retriever
    produces larger numbers. Consider rank-based fusion if that matters.
    """
    vector_nodes = index.as_retriever(similarity_top_k=top_k).retrieve(query)

    nodes = list(index.docstore.docs.values())
    keyword_nodes = []
    if nodes:
        # Mirror the safe_top_k guard in build_rag_pipeline: never ask BM25
        # for more results than there are nodes, which the BM25 backend may
        # reject.
        bm25_retriever = BM25Retriever.from_defaults(
            nodes=nodes,
            similarity_top_k=min(top_k, len(nodes)),
        )
        keyword_nodes = bm25_retriever.retrieve(query)

    # Keep the first occurrence of each node id.
    unique_nodes = {}
    for node in [*vector_nodes, *keyword_nodes]:
        unique_nodes.setdefault(node.node_id, node)

    def _score_or_zero(node):
        # A score of None would make sorted() raise TypeError against floats.
        score = getattr(node, "score", None)
        return score if score is not None else 0.0

    sorted_nodes = sorted(unique_nodes.values(), key=_score_or_zero, reverse=True)

    return sorted_nodes[:top_k]

In [None]:
# Print each merged result with its score and full text, separated by a rule.
hybrid_nodes = create_hybrid_retriever(index, "What is the refund policy?")
for i, node in enumerate(hybrid_nodes):
    print(f"Result {i+1} (Score: {node.score:.4f}):")
    print(node.get_text())
    print("-" * 40)

## Create a function to compare different retrieval methods

In [37]:
def compare_retrieval_methods(index, query: str, top_k: int = 2):
    """
    Run vector, BM25 and hybrid retrieval for one query and compare the results.

    Displays a table with one row per (method, rank) and draws a bar chart of
    the scores.

    Args:
        index: Index providing ``as_retriever`` and ``docstore.docs``.
        query: The query to retrieve for.
        top_k: Number of results requested from each method.

    Returns:
        A DataFrame with the columns Method, Rank, Score, Content and Page.
        The columns are present even when no method returned anything.
    """
    vector_nodes = index.as_retriever(similarity_top_k=top_k).retrieve(query)

    nodes = list(index.docstore.docs.values())
    # Mirror the safe_top_k guard in build_rag_pipeline: never ask BM25 for
    # more results than there are nodes, which the BM25 backend may reject.
    effective_k = min(top_k, len(nodes)) if nodes else top_k

    keyword_nodes = []
    if nodes:
        keyword_retriever = BM25Retriever.from_defaults(
            nodes=nodes,
            similarity_top_k=effective_k,
        )
        keyword_nodes = keyword_retriever.retrieve(query)

    # Passing the clamped value is harmless: no retriever can return more
    # results than there are nodes anyway.
    hybrid_nodes = create_hybrid_retriever(index, query, effective_k)

    methods = {
        "Vector (Semantic)": vector_nodes,
        "Keyword (BM25)": keyword_nodes,
        "Hybrid": hybrid_nodes,
    }

    results = []
    for method, nodes_list in methods.items():
        for rank, node in enumerate(nodes_list, start=1):
            # A score of None would break the bar chart, so report it as 0.0.
            score = getattr(node, "score", None)
            metadata = getattr(node, "metadata", None) or {}
            results.append(
                {
                    "Method": method,
                    "Rank": rank,
                    "Score": score if score is not None else 0.0,
                    "Content": node.get_text()[:200] + "...",
                    "Page": metadata.get("page_number", "Unknown"),
                }
            )

    # Explicit columns keep the "Method" lookup below working with zero rows.
    results_df = pd.DataFrame(
        results, columns=["Method", "Rank", "Score", "Content", "Page"]
    )
    display(results_df)

    plt.figure(figsize=(10, 6))
    for method in methods:
        method_df = results_df[results_df["Method"] == method]
        plt.bar(
            [f"{method} - Rank {rank}" for rank in method_df["Rank"]],
            method_df["Score"],
            alpha=0.7,
            label=method,
        )

    plt.xlabel("Result")
    plt.ylabel("Retrieval Score")
    plt.title(f"Comparison of Retrieval Methods for Query: '{query}'")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return results_df


In [None]:
# Compare the three retrieval methods on one sample question.
comparison = compare_retrieval_methods(index, "What is the refund policy?")

## Reranking

In [39]:
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import NodeWithScore

def rerank_results(nodes, query: str, top_n: int = 2):
    """
    Rerank retrieved nodes against ``query`` with a cross-encoder.

    Builds a ``SentenceTransformerRerank`` using the
    ``cross-encoder/ms-marco-MiniLM-L-6-v2`` model and returns whatever its
    ``postprocess_nodes`` call yields for ``nodes``, limited by ``top_n``.
    """
    cross_encoder = SentenceTransformerRerank(
        top_n=top_n,
        model="cross-encoder/ms-marco-MiniLM-L-6-v2",
    )
    return cross_encoder.postprocess_nodes(nodes, query_str=query)

def demonstrate_reranking(index, query: str, top_k: int = 4, top_n: int = 2):
    """
    Show retrieval results for a query before and after cross-encoder reranking.

    Prints both orderings, displays a table and returns it.

    Args:
        index: Index providing ``as_retriever``.
        query: The query to retrieve and rerank for.
        top_k: Number of nodes fetched by the initial vector retrieval.
        top_n: Number of nodes kept after reranking (default 2, as before).

    Returns:
        A DataFrame with the columns Stage, Rank, Score, Content and Page,
        holding the "Original Retrieval" rows followed by the
        "After Reranking" rows.
    """

    def _rows(stage, scored_nodes):
        # One table row per node, ranked from 1 in the given order.
        return [
            {
                "Stage": stage,
                "Rank": rank,
                "Score": node.score,
                "Content": node.get_text()[:150] + "...",
                "Page": node.metadata.get("page_number", "Unknown"),
            }
            for rank, node in enumerate(scored_nodes, start=1)
        ]

    def _print_ranked(scored_nodes):
        for rank, node in enumerate(scored_nodes, start=1):
            print(f"{rank}. (Score: {node.score:.4f}) - {node.get_text()[:100]}...")

    retriever = index.as_retriever(similarity_top_k=top_k)
    nodes = retriever.retrieve(query)

    print(f"Query: {query}")
    print("\nOriginal Retrieval Order:")
    _print_ranked(nodes)

    # Snapshot the original rows BEFORE reranking: the reranker may overwrite
    # `node.score` on these same objects, which would otherwise make the
    # "Original Retrieval" rows show reranked scores.
    results = _rows("Original Retrieval", nodes)

    reranked_nodes = rerank_results(nodes, query, top_n=top_n)

    print("\nAfter Reranking:")
    _print_ranked(reranked_nodes)

    results.extend(_rows("After Reranking", reranked_nodes))

    # Explicit columns keep the table shape stable when nothing was retrieved.
    results_df = pd.DataFrame(
        results, columns=["Stage", "Rank", "Score", "Content", "Page"]
    )
    display(results_df)

    return results_df


In [None]:
# Retrieve four candidates for a sample question and show the effect of reranking.
reranking_demo = demonstrate_reranking(index, "What happens if I cancel the service?", top_k=4)

## Advanced RAG pipeline

In [41]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever

def build_rag_pipeline(index, top_k: int = 2):
    """
    Build a query engine that combines hybrid retrieval with optional reranking.

    Retrieval merges the index's vector retriever with a BM25 retriever over
    the same nodes. When the index holds more than one node, a cross-encoder
    reranker is added as a post-processor. The module-level ``llm`` answers
    the query.

    Args:
        index: Index providing ``as_retriever`` and ``docstore.docs``.
        top_k: Upper bound on results per retriever and on merged results
            (default 2, the previously hard-coded value).

    Returns:
        The query engine created by ``RetrieverQueryEngine.from_args``.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.

    NOTE(review): vector and BM25 scores are presumably on different scales,
    so the merged ordering favours whichever retriever yields larger numbers.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    nodes = list(index.docstore.docs.values())
    num_nodes = len(nodes)
    # Never request more results than there are nodes, but always at least one.
    safe_top_k = min(top_k, max(1, num_nodes))

    print(f"Index contains {num_nodes} nodes, using top_k={safe_top_k}")

    vector_retriever = index.as_retriever(
        similarity_top_k=safe_top_k
    )

    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=safe_top_k
    )

    def _score_or_zero(node):
        # A score of None would make sorted() raise TypeError against floats.
        score = getattr(node, "score", None)
        return score if score is not None else 0.0

    class HybridRetriever(BaseRetriever):
        """Merge vector and keyword results, de-duplicated by node id."""

        def __init__(self, vector_retriever, keyword_retriever, top_k: int = 2):
            self.vector_retriever = vector_retriever
            self.keyword_retriever = keyword_retriever
            self.top_k = top_k
            super().__init__()

        def _retrieve(self, query_bundle, **kwargs):
            candidates = [
                *self.vector_retriever.retrieve(query_bundle),
                *self.keyword_retriever.retrieve(query_bundle),
            ]

            # First occurrence wins, so the vector result is kept on a tie.
            unique_nodes = {}
            for node in candidates:
                unique_nodes.setdefault(node.node_id, node)

            ranked = sorted(unique_nodes.values(), key=_score_or_zero, reverse=True)

            return ranked[: self.top_k]

    hybrid_retriever = HybridRetriever(
        vector_retriever=vector_retriever,
        keyword_retriever=bm25_retriever,
        top_k=safe_top_k,
    )

    # Reranking a single node cannot change anything, so skip loading the model.
    node_postprocessors = []
    if num_nodes > 1:
        reranker = SentenceTransformerRerank(
            model="cross-encoder/ms-marco-MiniLM-L-6-v2",
            top_n=safe_top_k,
        )
        node_postprocessors = [reranker]

    query_engine = RetrieverQueryEngine.from_args(
        retriever=hybrid_retriever,
        llm=llm,
        node_postprocessors=node_postprocessors,
    )

    return query_engine


In [None]:
# Rebuild the index from the PDF, assemble the full pipeline and ask one question.
index = process_and_index_pdf(pdf_path)
rag_engine = build_rag_pipeline(index)
response = rag_engine.query("What are the penalties for late payments?")
print('\nFinal Response:\n ---------------------- \n')
print(response)