<a href="https://colab.research.google.com/github/MariyahW/Outamation_Externship/blob/main/Run_3_RAG_Configurations_and_Log_Output_Differences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================
# RETRIEVER CONFIG EXPERIMENT (A/B/C)
# LlamaIndex RAG + Gemini
# Varies: top_k, similarity threshold, reranker
# Prints: retrieved chunks + final answer per config
# ============================

!pip -q install -U llama-index llama-index-llms-gemini llama-index-embeddings-huggingface google-generativeai pymupdf

# ---------- 1) Upload PDF ----------
# Opens the Colab upload widget; the keys of `uploaded` are the uploaded filenames.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the picker was dismissed). Fail with a clear
    # message instead of an opaque IndexError on the filename lookup below.
    raise RuntimeError("No file was uploaded. Re-run this cell and select a PDF.")
# Only the first uploaded file is used by the rest of the script.
pdf_name = next(iter(uploaded))
print("Uploaded:", pdf_name)

# ---------- 2) Parse PDF -> LlamaIndex Documents ----------
import fitz
from llama_index.core import Document

# Build one Document per PDF page, tagged with the source filename and a
# 1-based page number so retrieved chunks can be traced back to their page.
documents = []
# The context manager closes the PDF file handle as soon as text extraction is
# done, rather than keeping it open for the rest of the session.
with fitz.open(pdf_name) as pdf:
    for i, page in enumerate(pdf):
        text = page.get_text("text")
        documents.append(Document(text=text, metadata={"source": pdf_name, "page": i + 1}))

print("Parsed pages:", len(documents))

# ---------- 3) Chunking ----------
from llama_index.core.node_parser import SentenceSplitter

# Split the per-page Documents into overlapping chunks ("nodes"); the nodes are
# what gets embedded and retrieved further down.
chunking_kwargs = {"chunk_size": 450, "chunk_overlap": 75}
splitter = SentenceSplitter(**chunking_kwargs)
nodes = splitter.get_nodes_from_documents(documents)
print("Total chunks:", len(nodes))

# ---------- 4) Embeddings (pick ONE model; keep constant for this experiment) ----------
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex

# The embedding model is held fixed so that differences between configs come
# only from the retriever settings.
# NOTE(review): E5 models are documented as expecting "query: " / "passage: "
# prefixes on their inputs — confirm whether HuggingFaceEmbedding adds them for
# this model or whether query_instruction / text_instruction should be set.
embedding_model_id = "intfloat/e5-small-v2"
embed_model = HuggingFaceEmbedding(model_name=embedding_model_id)

# Embed every chunk and build the vector index that all configs retrieve from.
index = VectorStoreIndex(nodes=nodes, embed_model=embed_model)

# ---------- 5) Gemini Setup (auto-pick valid model) ----------
import os
from getpass import getpass

import google.generativeai as genai
from google.colab import userdata

# SECURITY: an API key must never be written into the notebook source. Any key
# that was ever committed in this cell is exposed in version control and should
# be revoked and replaced.
# Resolution order: existing GOOGLE_API_KEY environment variable ->
# Colab secret named GOOGLE_API_KEY -> hidden interactive prompt.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    try:
        api_key = userdata.get("GOOGLE_API_KEY")
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # Secret not defined, or this notebook was not granted access to it.
        api_key = None
if not api_key:
    api_key = getpass("Enter your Google (Gemini) API key: ").strip()
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it as a Colab secret or environment variable."
    )

# Keep the environment variable populated: Gemini(...) below is constructed
# without an explicit key, so it presumably reads it from the environment.
os.environ["GOOGLE_API_KEY"] = api_key
genai.configure(api_key=api_key)

# Keep only models that support text generation via generateContent.
valid_models = [m.name for m in genai.list_models() if "generateContent" in m.supported_generation_methods]
if not valid_models:
    raise RuntimeError("No Gemini models available for this API key. Check access.")
# NOTE(review): this takes whichever model the API lists first; the order is
# decided by the service, so pin MODEL_NAME explicitly for reproducible runs.
MODEL_NAME = valid_models[0]
print("Using Gemini model:", MODEL_NAME)

# NOTE(review): the run log shows `google.generativeai` is end-of-life and
# recommends `google.genai`; migrating needs a different LlamaIndex integration
# package, so it is not done here.
from llama_index.llms.gemini import Gemini
llm = Gemini(model=MODEL_NAME)

# ---------- 6) (Optional) Reranker ----------
# LlamaIndex reranking uses an LLM to reorder retrieved chunks
from llama_index.core.postprocessor import LLMRerank

# ---------- 7) Choose ONE query to test (use the same for all configs) ----------
# The same question is sent to every configuration so the outputs are comparable.
query = "What is the total estimated monthly payment?"   # <-- replace if you want
print("\nTEST QUERY:", query)

# ---------- 8) Experiment configs ----------
# Baseline settings. Each config below is written as a delta from this baseline:
#   top_k            - number of chunks requested from the retriever
#   apply_threshold  - whether to drop chunks scoring below `threshold`
#   threshold        - minimum similarity score (None when unused)
#   use_reranker     - whether to add the LLM reranker to the query engine
_default_cfg = {
    "top_k": 5,
    "apply_threshold": False,
    "threshold": None,
    "use_reranker": False,
}

experiments = {
    # A: plain top-k retrieval.
    "A (Default)": {**_default_cfg},
    # B: wider retrieval plus a score cutoff.
    "B": {**_default_cfg, "top_k": 8, "apply_threshold": True, "threshold": 0.75},
    # C: default top-k plus a score cutoff and LLM reranking.
    "C": {**_default_cfg, "apply_threshold": True, "threshold": 0.75, "use_reranker": True},
}

# ---------- 9) Runner ----------
from llama_index.core.query_engine import RetrieverQueryEngine

def _truncate(text, limit):
    """Return `text` cut to `limit` characters, with "..." appended if it was cut."""
    return text[:limit] + ("..." if len(text) > limit else "")


def run_experiment(name, cfg):
    """Run one retriever configuration against the shared `query` and print the results.

    Uses the module-level `index`, `query` and `llm`. Prints the chunks that
    survive the optional score threshold, then the answer produced by a query
    engine built with the same threshold (and, optionally, an LLM reranker).

    Args:
        name: Label of the configuration (shown in the banner and summary row).
        cfg: Dict with keys "top_k", "apply_threshold", "threshold" and
            "use_reranker".

    Returns:
        Dict with keys "config", "chunks_retrieved", "best_excerpt",
        "answer_short", "confidence_placeholder" and "notes_placeholder".
    """
    # Function-scope import keeps this block self-contained; it comes from the
    # same module the file already imports LLMRerank from.
    from llama_index.core.postprocessor import SimilarityPostprocessor

    banner = "=" * 110
    print("\n" + banner)
    print(f"CONFIG: {name}")
    print(cfg)
    print(banner)

    # Retriever with top_k
    retriever = index.as_retriever(similarity_top_k=cfg["top_k"])
    retrieved = retriever.retrieve(query)

    # Optional threshold filtering (this list drives the chunk listing below)
    if cfg["apply_threshold"]:
        retrieved = [n for n in retrieved if n.score is not None and n.score >= cfg["threshold"]]

    print(f"\nChunks retrieved after filtering: {len(retrieved)}")

    # Print retrieved chunks
    print("\nRETRIEVED CHUNKS:")
    for i, n in enumerate(retrieved, start=1):
        meta = n.node.metadata
        text = n.node.get_content().replace("\n", " ").strip()
        # Without the threshold filter a node may carry no score; formatting
        # None with ".3f" would raise TypeError.
        score_label = "n/a" if n.score is None else f"{n.score:.3f}"
        print(f"\nChunk {i} | page {meta.get('page')} | score {score_label}")
        print(_truncate(text, 800))

    # Pick best chunk excerpt (highest-ranked surviving chunk, before reranking)
    best_excerpt = ""
    if retrieved:
        best_excerpt = retrieved[0].node.get_content().replace("\n", " ").strip()[:180]

    # The query engine retrieves again through `retriever`, so the threshold has
    # to be applied inside the engine as well; otherwise the answer is generated
    # from the unfiltered top_k chunks and the threshold only affects the printout.
    node_postprocessors = []
    if cfg["apply_threshold"]:
        node_postprocessors.append(SimilarityPostprocessor(similarity_cutoff=cfg["threshold"]))
    if cfg["use_reranker"]:
        # The cutoff is added first so it is applied before the reranker runs.
        # Rerank top_n results (keep small to reduce calls/cost); never ask for 0.
        rerank_top_n = max(1, min(5, len(retrieved)))
        node_postprocessors.append(LLMRerank(llm=llm, top_n=rerank_top_n))

    query_engine = RetrieverQueryEngine.from_args(
        retriever=retriever,
        llm=llm,
        node_postprocessors=node_postprocessors or None,
    )

    # Generate answer (stringified once and reused below)
    answer_text = str(query_engine.query(query))

    print("\nFINAL ANSWER:")
    print(answer_text)

    return {
        "config": name,
        "chunks_retrieved": len(retrieved),
        "best_excerpt": best_excerpt,
        "answer_short": _truncate(answer_text, 220),
        # You fill in confidence based on correctness/clarity
        "confidence_placeholder": "___",
        "notes_placeholder": "___",
    }

# Run every configuration in declaration order, collecting one summary row each.
results = []
for label, settings in experiments.items():
    summary = run_experiment(label, settings)
    results.append(summary)

# ---------- 10) Print a Google Doc-friendly table row summary ----------
banner = "📝" * 20
print("\n\n" + banner)
print("PASTE INTO GOOGLE DOC TABLE (rows)")
print(banner)

# One pipe-delimited line per configuration; the excerpt and the shortened
# answer are wrapped in double quotes.
for row in results:
    columns = [
        str(row["config"]),
        str(row["chunks_retrieved"]),
        f'"{row["best_excerpt"]}"',
        f'"{row["answer_short"]}"',
        str(row["confidence_placeholder"]),
        str(row["notes_placeholder"]),
    ]
    print(" | ".join(columns))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.3/107.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.1/329.1 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saving LenderFeesWorksheetNew.pdf to LenderFeesWorksheetNew.pdf
Uploaded: LenderFeesWorksheetNew.pdf
Parsed pages: 1
Total chunks: 2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


Using Gemini model: models/gemini-2.5-flash

TEST QUERY: What is the total estimated monthly payment?

CONFIG: A (Default)
{'top_k': 5, 'apply_threshold': False, 'threshold': None, 'use_reranker': False}

Chunks retrieved after filtering: 2

RETRIEVED CHUNKS:

Chunk 1 | page 1 | score 0.846
Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan. Fee Details and Summary Applicants: Application No: Date Prepared: Loan Program: Prepared By: THIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees Worksheet" is provided for informational purposes ONLY, to assist you in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage  payment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed. Total Loan Amount:   Interest Rate: Term/Due In: Fee Paid To Paid By (Fee Split**) Amount PFC / F / POC TOTAL ESTIMATED FUNDS NEEDED TO CLOSE: TOTAL ESTIMATED

  llm = Gemini(model=MODEL_NAME)



FINAL ANSWER:
The total estimated monthly payment is $1,869.37.

CONFIG: B
{'top_k': 8, 'apply_threshold': True, 'threshold': 0.75, 'use_reranker': False}

Chunks retrieved after filtering: 2

RETRIEVED CHUNKS:

Chunk 1 | page 1 | score 0.846
Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan. Fee Details and Summary Applicants: Application No: Date Prepared: Loan Program: Prepared By: THIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees Worksheet" is provided for informational purposes ONLY, to assist you in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage  payment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed. Total Loan Amount:   Interest Rate: Term/Due In: Fee Paid To Paid By (Fee Split**) Amount PFC / F / POC TOTAL ESTIMATED FUNDS NEEDED TO CLOSE: TOTAL ESTIMATED MONTHLY PAYMENT: Total Estimated Funds Total Mo