<a href="https://colab.research.google.com/github/Mosmoove/Mortage-Document-Analysis-Using-RAG/blob/main/Building_an_End_to_End_RAG_Pipeline_Using_LlamaIndex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing all required libraries

In [None]:
!pip install -q llama-index llama-index-llms-gemini pymupdf
!pip install -q llama-index-embeddings-huggingface
!pip install nest_asyncio
!pip install llama-index-retrievers-bm25

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.6/102.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Loading PDFs with PyMuPDF & LlamaIndex



> Using PyMuPDF allows for better text extraction with preserved structure






In [None]:
import os
import fitz #PyMuPDF
import pandas as pd
from IPython.display import display, Markdown
import nest_asyncio

# Setting up the Google/Gemini API key.
# Never hardcode the key in the notebook: reuse one already present in the
# environment, otherwise prompt for it without echoing it to the output.
from getpass import getpass

GEMINI_API_KEY = (
    os.environ.get("GEMINI_API_KEY")
    or os.environ.get("GOOGLE_API_KEY")
    or getpass("Enter your Gemini API key: ")
)
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY
# NOTE(review): the llama-index Gemini wrapper appears to look up GOOGLE_API_KEY;
# mirror the key there without overwriting an existing value - confirm.
os.environ.setdefault("GOOGLE_API_KEY", GEMINI_API_KEY)

# nest_asyncio is imported above but was never applied. Patching the loop lets
# libraries start asyncio tasks inside the notebook's already-running event loop.
nest_asyncio.apply()

# Creating a sample_docs folder to store the uploaded PDF
os.makedirs("sample_docs", exist_ok=True)

# Loading and Viewing the PDF Documents

### Loading and extracting text from a PDF using PyMuPDF:

In [None]:
from google.colab import files
import os

def loading_pdf():
  """Prompt for a file upload and save the first uploaded PDF into ``sample_docs``.

  Returns:
    The saved path (``sample_docs`` joined with the uploaded file name) of the
    first uploaded file whose name ends in ``.pdf`` (case-insensitive), or
    ``None`` when no uploaded file is a PDF.
  """
  print("Please select a PDF to upload: ")
  uploaded = files.upload()

  for filename, content in uploaded.items():
    # Compare case-insensitively so names such as "Scan.PDF" are accepted too
    if not filename.lower().endswith('.pdf'):
      print("Invalid file type. Please upload a PDF file.")
      continue

    # Make sure the target directory exists before writing into it
    os.makedirs('sample_docs', exist_ok=True)
    pdf_file = os.path.join('sample_docs', filename)

    # Save the uploaded bytes under sample_docs/
    with open(pdf_file, 'wb') as file:
      file.write(content)
    print("PDF saved to {}".format(pdf_file))
    return pdf_file  # only the first PDF is saved and returned
  return None

In [None]:
# Upload a PDF interactively; pdf_file is the saved path, or None if no PDF was uploaded
pdf_file = loading_pdf()

Please select a PDF to upload: 


Saving LenderFeesWorksheetNew.pdf to LenderFeesWorksheetNew.pdf
PDF saved to sample_docs/LenderFeesWorksheetNew.pdf


In [None]:
def extract_text_from_pdf(pdf_file):
  """Extract the text of every page of a PDF and print a short summary.

  Args:
    pdf_file: Path of the PDF to open with PyMuPDF.

  Returns:
    The text of all pages joined with newlines.
  """
  # The context manager closes the document even if text extraction raises
  with fitz.open(pdf_file) as document:
    page_count = len(document)
    text = "\n".join(page.get_text() for page in document)

  print(f"PDF: {pdf_file}")
  print(f"Number of pages: {page_count}")
  print(f"Extracted {len(text.split())} words from the PDF.")

  return text

In [None]:
# Preview the extraction only when a PDF was actually uploaded (pdf_file is not None)
if pdf_file:
  text = extract_text_from_pdf(pdf_file)
  print(text[:300]) #printing the first 300 characters to make sure it's working

PDF: sample_docs/LenderFeesWorksheetNew.pdf
Number of pages: 1
Extracted 404 words from the PDF.
Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.
Fee Details and Summary
Applicants:
Application No:
Date Prepared:
Loan Program:
Prepared By:
THIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees Worksheet" is provided for informational purpos


# Integrating PyMuPDF with LlamaIndex

In [None]:
from llama_index.core import Document
from typing import List

def loading_pdf_with_pymupdf(pdf_file: str) -> List[Document]:
  """Load a PDF with PyMuPDF and convert each non-empty page to a LlamaIndex Document.

  Args:
    pdf_file: Path of the PDF to open.

  Returns:
    One ``Document`` per page that contains text. Each carries ``page_number``
    (1-based), ``file_name`` (basename of ``pdf_file``) and ``total_pages``
    in its metadata.
  """
  file_name = os.path.basename(pdf_file)
  extracted_text = []

  # The context manager closes the document even if a page fails to extract
  with fitz.open(pdf_file) as document:
    total_pages = len(document)  # page count is fixed, so read it once

    for page_number, page in enumerate(document, start=1):
      text = page.get_text()

      # Skip pages without any text
      if not text.strip():
        continue

      extracted_text.append(
          Document(
              text=text,
              metadata={
                  "page_number": page_number,
                  "file_name": file_name,
                  "total_pages": total_pages
              }
          )
      )

  print(f"Processed document: {pdf_file}")
  print(f"Extracted {len(extracted_text)} pages with content.")
  return extracted_text



In [None]:
# Convert the uploaded PDF into per-page Document objects.
# NOTE: unlike the preview cell, this call is not guarded against pdf_file being None.
documents = loading_pdf_with_pymupdf(pdf_file)

Processed document: sample_docs/LenderFeesWorksheetNew.pdf
Extracted 1 pages with content.


# Indexing and Processing PDFs

In [None]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Initialize the Gemini LLM and store it on llama_index's global Settings object
# (presumably the default LLM picked up by indexes/query engines - confirm).
# `llm` is also referenced directly as a module-level name by later cells.
llm = Gemini(model='gemini-2.5-flash')
Settings.llm = llm

# Initialize the Hugging Face embedding model and store it on Settings as well
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

def process_and_indexpdf(pdf_file):
  """Build a vector index from the pages of ``pdf_file``.

  The PDF is turned into page-level documents by ``loading_pdf_with_pymupdf``
  and those documents are handed to ``VectorStoreIndex.from_documents``.

  Returns:
    The index created from the PDF's documents.
  """
  page_documents = loading_pdf_with_pymupdf(pdf_file)
  return VectorStoreIndex.from_documents(page_documents)


  llm = Gemini(model='gemini-2.5-flash')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Implementing Query Expansion



> Query Expansion helps improve retrieval by adding relevant terms to the original query



In [None]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings

# NOTE(review): repeats the Gemini setup done in the indexing cell with the same
# model name; presumably kept so this section can be run on its own.
llm = Gemini(model='gemini-2.5-flash')
Settings.llm = llm

def expand_query(query: str, num_expansions: int = 4) -> list:
  """Ask the LLM for alternative phrasings of ``query``.

  Args:
    query: The original search query.
    num_expansions: Number of alternatives requested from the LLM; at most
      this many alternatives are kept.

  Returns:
    A list that starts with the original ``query`` followed by the cleaned
    alternatives: list markers such as ``*``, ``-`` or ``1.`` are removed,
    blank lines and duplicates are dropped.
  """
  import re  # local import: only this helper needs it

  prompt = f"""
   I need to search a Lender Fee's worksheet with this query: "{query}"

    Please help me expand this query by generating {num_expansions} alternative versions that:
    1. Use different but related terminology
    2. Include relevant legal terms that might appear in a fee worksheet
    3. Cover similar concepts but phrased differently

    Format your response as a list of alternative queries only, with no additional text.
    """
  response = llm.complete(prompt)

  # The model tends to answer with a bulleted or numbered list; strip the
  # leading marker so it does not become part of the search query.
  list_marker = re.compile(r'^\s*(?:[-*+•]+|\d+[.)])\s+')

  alternatives = []
  for line in response.text.split('\n'):
    candidate = list_marker.sub('', line).strip()
    if candidate and candidate != query and candidate not in alternatives:
      alternatives.append(candidate)

  # Original query always comes first, followed by at most num_expansions alternatives
  return [query] + alternatives[:max(num_expansions, 0)]


  llm = Gemini(model='gemini-2.5-flash')


In [None]:
# Show the original query followed by each alternative returned by expand_query
expanded = expand_query("What is the total estimated monthly payment?")
for query in expanded:
  print(query)

What is the total estimated monthly payment?
*   What is the projected aggregate PITI (Principal, Interest, Taxes, Insurance) installment?
*   What is the borrower's total periodic financial obligation, including all recurring charges?
*   What is the anticipated total amortized monthly remittance, as per the loan terms?
*   What is the estimated total monthly liability disclosed in the fee schedule?


## Creating a Query Expansion Engine



> Implementing a more structured query engine using LlamaIndex



In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever

def create_query_expansion_engine(index, similarity_top_k=2, num_queries=4):
  """Build a query engine whose retriever fuses results over several query variants.

  Args:
    index: Index exposing ``as_retriever``; used as the single base retriever.
    similarity_top_k: Number of nodes requested from the base retriever and
      kept by the fusion retriever (default 2).
    num_queries: Passed to ``QueryFusionRetriever`` (default 4).
      NOTE(review): in LlamaIndex this count is believed to include the
      original query, i.e. ``num_queries - 1`` new queries are generated - confirm.

  Returns:
    A ``RetrieverQueryEngine`` built on the fusion retriever and the
    module-level ``llm``.
  """
  # Base retriever: plain vector search over the index
  base_retriever = index.as_retriever(similarity_top_k=similarity_top_k)

  # Fusion retriever: runs the query variants and merges the ranked lists
  fusion_retriever = QueryFusionRetriever(
      retrievers=[base_retriever],
      llm=llm,
      similarity_top_k=similarity_top_k,
      num_queries=num_queries,
      mode='reciprocal_rerank'
  )

  # Query engine that answers with the LLM on top of the fused retrieval
  query_engine = RetrieverQueryEngine.from_args(
      retriever=fusion_retriever,
      llm=llm,
      verbose=True
  )

  return query_engine




In [None]:
# Build the index from the uploaded PDF, wrap it in the query-expansion engine,
# and ask a sample question. `index` is reused by the following sections.
index = process_and_indexpdf(pdf_file)
query_expansion = create_query_expansion_engine(index)
response = query_expansion.query("What is the total estimated monthly payment?")
print(response)

Processed document: sample_docs/LenderFeesWorksheetNew.pdf
Extracted 1 pages with content.
The total estimated monthly payment is $2,308.95.


# Implementing Hybrid Retrieval (Semantic + Keyword Search)

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.retrievers.bm25 import BM25Retriever

def hybrid_retrieval(index, query, top_k=2):
  """Retrieve nodes with both vector search and BM25 and return the best ``top_k``.

  Args:
    index: Index exposing ``as_retriever`` and ``docstore.docs``.
    query: Query passed to both retrievers.
    top_k: Number of nodes requested from each retriever and returned overall.

  Returns:
    Up to ``top_k`` unique nodes (by ``node_id``), sorted by score, highest
    first. When both retrievers return the same node, the vector result is kept.
  """
  # Vector retrieval (semantic search)
  vector_results = index.as_retriever(similarity_top_k=top_k).retrieve(query)

  # BM25 retrieval (keyword search) over every node stored in the index.
  # Clamp the request to the corpus size, as build_rag_pipeline does, so a
  # tiny corpus does not get asked for more hits than it has nodes.
  nodes = list(index.docstore.docs.values())
  bm25_top_k = max(1, min(top_k, len(nodes)))
  bm25_retriever = BM25Retriever.from_defaults(
      nodes=nodes,
      similarity_top_k=bm25_top_k
  )
  bm25_results = bm25_retriever.retrieve(query)

  # Merge both result lists, keeping the first occurrence of each node
  unique_nodes = []
  seen = set()
  for node in [*vector_results, *bm25_results]:
    if node.node_id not in seen:
      seen.add(node.node_id)
      unique_nodes.append(node)

  def _score(node):
    # A missing or None score sorts as 0.0 instead of raising on comparison
    score = getattr(node, 'score', None)
    return 0.0 if score is None else score

  # NOTE(review): vector and BM25 scores are produced by different scoring
  # functions, so comparing the raw values may favour one retriever - confirm
  # whether normalisation or rank fusion is wanted here.
  sorted_nodes = sorted(unique_nodes, key=_score, reverse=True)

  return sorted_nodes[:top_k]  # limit to the top k results



In [None]:
# Run the hybrid retrieval for a sample question and print each returned node
# with its score, followed by a separator line.
hybrid_nodes = hybrid_retrieval(index, "What is the total estimated monthly payment?")
for i, node in enumerate(hybrid_nodes):
  print(f"Result {i+1} (Score: {node.score:.4f}):")
  print(node.get_text())
  print("-" * 40)

DEBUG:bm25s:Building index from IDs objects


Result 1 (Score: 0.7220):
Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.
Fee Details and Summary
Applicants:
Application No:
Date Prepared:
Loan Program:
Prepared By:
THIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees Worksheet" is provided for informational purposes ONLY, to assist
you in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage 
payment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed.
Total Loan Amount:  
Interest Rate:
Term/Due In:
Fee
Paid To
Paid By (Fee Split**)
Amount
PFC / F / POC
TOTAL ESTIMATED FUNDS NEEDED TO CLOSE:
TOTAL ESTIMATED MONTHLY PAYMENT:
Total Estimated Funds
Total Monthly Payment
Purchase Price (+)
Alterations (+)
Land (+)
Refi (incl. debts to be paid off) (+)
Est. Prepaid Items/Reserves (+)
Est. Closing Costs (+)
Loan Amount (-)
Principal & Interest
Other Financing (P & I)
Hazard

# Implement Reranking For More Accurate Results

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import NodeWithScore

def rerank_results(nodes, query, top_k=2):
  """Rerank ``nodes`` against ``query`` with a cross-encoder reranker.

  A new ``SentenceTransformerRerank`` using the
  ``cross-encoder/ms-marco-MiniLM-L-6-v2`` model is created on every call and
  configured to keep ``top_k`` nodes.

  Returns:
    Whatever the reranker's ``postprocess_nodes`` returns for ``nodes``.
  """
  cross_encoder = SentenceTransformerRerank(
      top_n=top_k,
      model='cross-encoder/ms-marco-MiniLM-L-6-v2',
  )
  return cross_encoder.postprocess_nodes(nodes, query_str=query)

def demonstrate_reranking(index, query, top_k=2):
  """Retrieve nodes for ``query``, rerank them, and tabulate both rankings.

  Args:
    index: Index exposing ``as_retriever``.
    query: Query used for retrieval and reranking.
    top_k: Number of nodes to retrieve and to keep after reranking.

  Returns:
    A DataFrame with one row per node and stage ("Original Retrieval",
    "After Reranking") and the columns Stage, Rank, Score, Content, Page.
    The DataFrame is also displayed.
  """

  def _stage_rows(stage, stage_nodes):
    # One record per node, capturing its score at the time of the call
    return [
        {
            "Stage": stage,
            "Rank": rank,
            "Score": node.score,
            "Content": node.get_text()[:150] + "...",
            "Page": node.metadata.get("page_number", "Unknown")
        }
        for rank, node in enumerate(stage_nodes, start=1)
    ]

  # Retrieve the top_k candidates with plain vector search
  retriever = index.as_retriever(similarity_top_k=top_k)
  nodes = retriever.retrieve(query)

  print(f"Query: {query}")
  print("\nOriginal Retrieval Order:")
  for i, node in enumerate(nodes):
      print(f"{i+1}. (Score: {node.score:.4f}) - {node.get_text()[:100]}...")

  # Record the original ranking BEFORE reranking: the notebook's own output
  # shows the retrieval score being replaced by the reranker's score on the
  # same node objects, so reading node.score afterwards reports the wrong value.
  results = _stage_rows("Original Retrieval", nodes)

  # Now rerank them, honouring the caller's top_k
  reranked_nodes = rerank_results(nodes, query, top_k=top_k)

  print("\nAfter Reranking:")
  for i, node in enumerate(reranked_nodes):
      print(f"{i+1}. (Score: {node.score:.4f}) - {node.get_text()[:100]}...")

  results.extend(_stage_rows("After Reranking", reranked_nodes))

  # Comparison table of both stages
  results_df = pd.DataFrame(results)
  display(results_df)

  return results_df

# Run the reranking demo on the current index; the comparison DataFrame is kept in reranking_demo
reranking_demo= demonstrate_reranking(index, "What is the total estimated monthly payment?")

Query: What is the total estimated monthly payment?

Original Retrieval Order:
1. (Score: 0.8382) - Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]


After Reranking:
1. (Score: -0.0984) - Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a...


Unnamed: 0,Stage,Rank,Score,Content,Page
0,Original Retrieval,1,-0.098442,"Your actual rate, payment, and cost could be h...",1
1,After Reranking,1,-0.098442,"Your actual rate, payment, and cost could be h...",1


# Constructing a RAG Pipeline


In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore, QueryBundle

def build_rag_pipeline(index):
  """Assemble a query engine using hybrid (vector + BM25) retrieval and optional reranking.

  Args:
    index: Index exposing ``as_retriever`` and ``docstore.docs``.

  Returns:
    A ``RetrieverQueryEngine`` that retrieves with the hybrid retriever,
    reranks with a cross-encoder when the index holds more than one node, and
    answers with the module-level ``llm``.
  """
  # Get all the nodes from the index docstore
  nodes = list(index.docstore.docs.values())

  # Never ask for more results than there are nodes (but always at least 1)
  num_nodes = len(nodes)
  safe_top_k = min(2, max(1, num_nodes))

  print(f"Index contains {num_nodes} nodes, using top_k={safe_top_k}")

  # Vector (semantic) retriever
  vector_retriever = index.as_retriever(similarity_top_k=safe_top_k)

  # Keyword (BM25) retriever
  bm25_retriever = BM25Retriever.from_defaults(
      nodes=nodes,
      similarity_top_k=safe_top_k
  )

  class HybridRetriever(BaseRetriever):
    """Retriever that merges the results of a vector and a BM25 retriever."""

    def __init__(self, vector_retriever, bm25_retriever, top_k=2):
      self.vector_retriever = vector_retriever
      self.bm25_retriever = bm25_retriever
      self.top_k = top_k
      super().__init__()

    def _retrieve(self, query_bundle, **kwargs):
      # Get the results from both retrievers
      vector_results = self.vector_retriever.retrieve(query_bundle)
      bm25_results = self.bm25_retriever.retrieve(query_bundle)

      # De-duplicate by node_id, keeping the first occurrence (vector first)
      unique_nodes = {}
      for node in [*vector_results, *bm25_results]:
        unique_nodes.setdefault(node.node_id, node)

      def _score(node):
        # A missing or None score sorts as 0.0 instead of raising on comparison
        score = getattr(node, 'score', None)
        return 0.0 if score is None else score

      # Sort them (the higher the better) and return the top k results
      sorted_nodes = sorted(unique_nodes.values(), key=_score, reverse=True)
      return sorted_nodes[:self.top_k]

  # Named hybrid_retriever so the module-level hybrid_retrieval() function is
  # not shadowed inside this function
  hybrid_retriever = HybridRetriever(
      vector_retriever=vector_retriever,
      bm25_retriever=bm25_retriever,
      top_k=safe_top_k
  )

  # Reranking is only added when there is more than one node to order
  if num_nodes > 1:
    reranker = SentenceTransformerRerank(
        model="cross-encoder/ms-marco-MiniLM-L-6-v2",
        top_n=safe_top_k  # equals min(2, num_nodes) on this branch
    )
    node_postprocessor = [reranker]
  else:
    node_postprocessor = []

  # Building the query engine
  query_engine = RetrieverQueryEngine.from_args(
      retriever=hybrid_retriever,
      llm=llm,
      node_postprocessors=node_postprocessor
  )
  return query_engine

In [None]:
# Rebuild the index from the uploaded PDF, assemble the full RAG engine,
# and ask two sample questions about the worksheet.
index = process_and_indexpdf(pdf_file)
rag_engine = build_rag_pipeline(index)
response_number_1 = rag_engine.query("What is the total estimated monthly payment?")
response_number_2 = rag_engine.query("How much does the borrower pay for lender's title insurance?")

# Print both answers under a single heading
print("Final Response: \n")
print(response_number_1)
print(response_number_2)



Processed document: sample_docs/LenderFeesWorksheetNew.pdf
Extracted 1 pages with content.


DEBUG:bm25s:Building index from IDs objects


Index contains 1 nodes, using top_k=1
Final Response: 

The total estimated monthly payment is $2,308.95.
The borrower pays $650.00 for lender's title insurance.
