<a href="https://colab.research.google.com/github/MariyahW/Outamation_Externship/blob/main/est%2C_Compare%2C_and_Choose_the_Best_Embedding_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ============================
# EMBEDDING MODEL COMPARISON (Option 1)
# Compare retrieval quality across MiniLM vs E5 vs BGE
# ✅ NO Gemini calls during comparison (avoids 429 rate limits)
# After you pick the best model, you can optionally run Gemini ONCE at the end.
# ============================

!pip -q install -U llama-index llama-index-embeddings-huggingface google-generativeai pymupdf

# ---------- 1) Upload PDF ----------
from google.colab import files
uploaded = files.upload()
# The mapping is indexed by uploaded filename; it is empty when the dialog is
# dismissed without choosing a file, so fail with a clear message instead of
# an opaque IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a PDF.")
# Only the first uploaded file is used for the comparison.
pdf_name = next(iter(uploaded))
print("Uploaded:", pdf_name)

# ---------- 2) Parse PDF into LlamaIndex Documents ----------
import fitz
from llama_index.core import Document

# Build one LlamaIndex Document per PDF page, tagging each with its source
# file and 1-based page number so retrieved chunks can be traced back.
# The context manager closes the PyMuPDF handle once the text is extracted.
with fitz.open(pdf_name) as pdf:
    documents = [
        Document(
            text=page.get_text("text"),
            metadata={"source": pdf_name, "page": page_number},
        )
        for page_number, page in enumerate(pdf, start=1)
    ]

print("Parsed pages:", len(documents))
if documents:
    print("Sample text:\n", documents[0].text[:600])
else:
    print("Sample text: <no pages found>")

# A PDF without a text layer (e.g. a scan) extracts as empty strings; flag it
# here because every later retrieval step would otherwise run on nothing.
if documents and not any(doc.text.strip() for doc in documents):
    print("WARNING: no extractable text found; the PDF may be scanned images (OCR needed).")

# ---------- 3) Chunking ----------
from llama_index.core.node_parser import SentenceSplitter

# Break the per-page Documents into overlapping chunks (size 450, overlap 75).
# The resulting `nodes` are what each embedding model indexes further down.
splitter = SentenceSplitter(chunk_overlap=75, chunk_size=450)
nodes = splitter.get_nodes_from_documents(documents)
print(f"Total chunks: {len(nodes)}")

# ---------- 4) Define Queries (same across all models) ----------
# compare_embeddings() iterates over this list for every embedding model, so
# each model is judged on exactly the same questions.
# NOTE(review): the questions presumably target the uploaded fees worksheet;
# adjust them if a different PDF is used.
queries = [
    "What is the total estimated monthly payment?",
    "How much does the borrower pay for lender's title insurance?",
    "What fees are included in the total closing costs?"
]

# ---------- 5) Choose 3 Small Embedding Models ----------
# MiniLM: fast baseline
# E5: strong for question-style retrieval
# BGE: strong general retrieval
#
# Maps a display label -> HuggingFace model name. The label is printed in the
# report header and becomes the key of `indexes`; the model name is what gets
# passed to HuggingFaceEmbedding.
embedding_models = {
    "MiniLM (all-MiniLM-L6-v2)": "sentence-transformers/all-MiniLM-L6-v2",
    "E5-small-v2": "intfloat/e5-small-v2",
    "BGE-small-en-v1.5": "BAAI/bge-small-en-v1.5",
}

# ---------- 6) Build Index + Print Retrieved Chunks (NO GEMINI) ----------
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex

def _default_instructions(hf_model_name: str):
    """Return the (query, passage) prefixes a model family expects, else (None, None)."""
    short_name = hf_model_name.lower().rsplit("/", 1)[-1]
    # The plain E5 checkpoints are trained with "query: " / "passage: " prefixes
    # and retrieve noticeably worse without them. The "-instruct" E5 variants
    # use a different prompt format, so they are left untouched here.
    if "e5" in short_name and "instruct" not in short_name:
        return "query: ", "passage: "
    return None, None


def compare_embeddings(
    model_label: str,
    hf_model_name: str,
    top_k: int = 3,
    query_instruction=None,
    text_instruction=None,
) -> "VectorStoreIndex":
    """Index the chunked PDF with one embedding model and print its retrievals.

    Reads the module-level ``nodes`` (chunks to index) and ``queries``
    (questions to run) so that every model is evaluated on identical input.
    No LLM is called; only retrieval results are printed.

    Args:
        model_label: Human-readable name shown in the report header.
        hf_model_name: HuggingFace model name passed to HuggingFaceEmbedding.
        top_k: Number of chunks to retrieve and print per query.
        query_instruction: Prefix applied to queries before embedding. When
            None, a model-family default is used if one is known.
        text_instruction: Prefix applied to passages before embedding. When
            None, a model-family default is used if one is known.

    Returns:
        The VectorStoreIndex built for this model, so it can be reused later.
    """
    banner = "#" * 110
    print("\n" + banner)
    print(f"EMBEDDING MODEL: {model_label}")
    print(f"HuggingFace name: {hf_model_name}")
    print(banner)

    # Fill in family-specific prefixes only where the caller gave none, so an
    # explicit argument always wins.
    default_query, default_text = _default_instructions(hf_model_name)
    if query_instruction is None:
        query_instruction = default_query
    if text_instruction is None:
        text_instruction = default_text

    embed_model = HuggingFaceEmbedding(
        model_name=hf_model_name,
        query_instruction=query_instruction,
        text_instruction=text_instruction,
    )

    # Index built from the shared, pre-chunked nodes.
    index = VectorStoreIndex(nodes, embed_model=embed_model)
    retriever = index.as_retriever(similarity_top_k=top_k)

    # Run the same queries for each model.
    for q in queries:
        print("\n" + "=" * 100)
        print("QUERY:", q)

        retrieved = retriever.retrieve(q)

        print(f"\nRETRIEVED CHUNKS (top {top_k}):")
        for j, r in enumerate(retrieved, start=1):
            meta = r.node.metadata
            # Flatten newlines so each chunk prints as one readable paragraph.
            content = r.node.get_content().replace("\n", " ").strip()
            # Guard the format spec: a missing score would make ":.3f" raise.
            score = f"{r.score:.3f}" if r.score is not None else "n/a"
            print(f"\n[{j}] page {meta.get('page')} | score {score}")
            print(content[:650] + ("..." if len(content) > 650 else ""))

    return index  # returned so the caller can reuse it without re-embedding

# Evaluate every configured embedding model in turn, keeping each resulting
# index under its display label so the chosen one can be reused afterwards.
indexes = {}
for display_label, hf_name in embedding_models.items():
    indexes[display_label] = compare_embeddings(display_label, hf_name)

print("\n✅ Comparison complete. Use the retrieved chunks above to score each model in your Google Doc.")

# =====================================================================================
# OPTIONAL STEP (Run Gemini ONCE after you choose the best embedding model)
# =====================================================================================
# 1) Look at the retrieved chunks for each model above.
# 2) Choose the best model label (copy/paste its name into BEST_MODEL_LABEL below).
# 3) Uncomment this section to generate final answers one time.

"""
!pip -q install -U llama-index-llms-gemini

import os
import google.generativeai as genai
from llama_index.llms.gemini import Gemini
from llama_index.core.query_engine import RetrieverQueryEngine

# Paste your API key in Colab (do not share publicly)
os.environ["GOOGLE_API_KEY"] = "PASTE_YOUR_GOOGLE_API_KEY_HERE"
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

valid_models = [m.name for m in genai.list_models() if "generateContent" in m.supported_generation_methods]
MODEL_NAME = valid_models[0]
llm = Gemini(model=MODEL_NAME)

BEST_MODEL_LABEL = "E5-small-v2"  # <-- change to the one you picked based on retrieval quality
best_index = indexes[BEST_MODEL_LABEL]
best_retriever = best_index.as_retriever(similarity_top_k=5)

query_engine = RetrieverQueryEngine.from_args(retriever=best_retriever, llm=llm)

final_prompts = [
    "What is the total estimated monthly payment?",
    "How much does the borrower pay for lender's title insurance?"
]

for q in final_prompts:
    print("\n" + "=" * 100)
    print("FINAL QUESTION:", q)
    resp = query_engine.query(q)
    print("FINAL ANSWER:\n", resp)
"""


Saving LenderFeesWorksheetNew.pdf to LenderFeesWorksheetNew (1).pdf
Uploaded: LenderFeesWorksheetNew (1).pdf
Parsed pages: 1
Sample text:
 Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.
Fee Details and Summary
Applicants:
Application No:
Date Prepared:
Loan Program:
Prepared By:
THIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees Worksheet" is provided for informational purposes ONLY, to assist
you in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage 
payment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed.
Total Loan Amount:  
Interest Rate:
Term/Due In:
F
Total chunks: 3

##############################################################################################################
EMBEDDING MODEL: MiniLM (all-MiniLM-L6-v2)
HuggingFace name: sentence-transformers/all-MiniLM-L6-v2
################################

'\n!pip -q install -U llama-index-llms-gemini\n\nimport os\nimport google.generativeai as genai\nfrom llama_index.llms.gemini import Gemini\nfrom llama_index.core.query_engine import RetrieverQueryEngine\n\n# Paste your API key in Colab (do not share publicly)\nos.environ["GOOGLE_API_KEY"] = "PASTE_YOUR_GOOGLE_API_KEY_HERE"\ngenai.configure(api_key=os.environ["GOOGLE_API_KEY"])\n\nvalid_models = [m.name for m in genai.list_models() if "generateContent" in m.supported_generation_methods]\nMODEL_NAME = valid_models[0]\nllm = Gemini(model=MODEL_NAME)\n\nBEST_MODEL_LABEL = "E5-small-v2"  # <-- change to the one you picked based on retrieval quality\nbest_index = indexes[BEST_MODEL_LABEL]\nbest_retriever = best_index.as_retriever(similarity_top_k=5)\n\nquery_engine = RetrieverQueryEngine.from_args(retriever=best_retriever, llm=llm)\n\nfinal_prompts = [\n    "What is the total estimated monthly payment?",\n    "How much does the borrower pay for lender\'s title insurance?"\n]\n\nfor q in fin