In [5]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from sentence_transformers import CrossEncoder
import re
import os
import textwrap

## LOAD DATA

In [6]:
# Fetch the Wikipedia article; the loader yields Document objects whose text
# lives in `page_content`.
loader = WebBaseLoader("https://en.wikipedia.org/wiki/False_or_misleading_statements_by_Donald_Trump")
docs = loader.load()

# Collapse every run of three or more newlines into a single blank line to
# remove the excess blank space left over from the scraped page.
excess_newlines = re.compile(r'\n{3,}')
for doc in docs:
    doc.page_content = excess_newlines.sub('\n\n', doc.page_content)

# Preview the first 100 characters of the cleaned text.
print(docs[0].page_content[:100])



False or misleading statements by Donald Trump - Wikipedia

Jump to content

Main menu

Main menu



In [7]:
# Split the loaded page into chunks (chunk_size=1000, chunk_overlap=200).
# The separator list runs from paragraph breaks down to the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

print(len(splits))  # how many chunks were produced
splits[:10]  # last expression of the cell: display the first ten chunks

611


[Document(metadata={'source': 'https://en.wikipedia.org/wiki/False_or_misleading_statements_by_Donald_Trump', 'title': 'False or misleading statements by Donald Trump - Wikipedia', 'language': 'en'}, page_content='False or misleading statements by Donald Trump - Wikipedia\n\nJump to content\n\nMain menu\n\nMain menu\nmove to sidebar\nhide\n\n\t\tNavigation\n\t\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\t\tContribute\n\t\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\nSearch\n\nSearch\n\nAppearance\n\nDonate\n\nCreate account\n\nLog in\n\nPersonal tools\n\nDonate Create account Log in\n\n\t\tPages for logged out editors learn more\n\nContributionsTalk\n\nContents\nmove to sidebar\nhide\n\n(Top)\n\n1\nVeracity and politics\n\nToggle Veracity and politics subsection\n\n1.1\nRepetition\n\n1.2\nBullshit\n\n2\nBusiness career\n\nToggle Business career subsection\n\n2.1\nReal estate\n\n2.2\nOther investments and debt\n\n2.3\nPhi

## CREATE STORE & RERANKER

In [8]:
class Storage:
    """Owns the FAISS vector store built from the document chunks.

    ``store`` stays ``None`` until ``create_store`` has been called;
    ``get_retriever`` refuses to run before that.
    """

    def __init__(self):
        # No index exists yet; create_store() fills this in.
        self.store = None

    def create_store(self, documents, model_name="all-MiniLM-L6-v2"):
        """Embed ``documents`` and (re)build the FAISS index.

        Args:
            documents: Non-empty iterable of documents to index.
            model_name: Model name handed to ``HuggingFaceEmbeddings``. The
                default is the value this class previously hard-coded.

        Raises:
            ValueError: If ``documents`` is ``None`` or empty.
        """
        documents = list(documents) if documents is not None else []
        if not documents:
            # Fail early with a clear message instead of letting the index
            # builder choke on an empty batch.
            raise ValueError("Cannot create a store from an empty document list.")
        self.store = FAISS.from_documents(
            documents=documents,
            embedding=HuggingFaceEmbeddings(model_name=model_name),
        )

    def get_retriever(self, k=None):
        """Return a retriever over the store.

        Args:
            k: Optional number of documents the retriever should fetch, passed
                through as ``search_kwargs={"k": k}``. When ``None`` (the
                default) ``as_retriever()`` is called with no arguments,
                exactly as before.

        Raises:
            ValueError: If ``create_store`` has not been called yet.
        """
        if self.store is None:
            raise ValueError("Store not initialized. Call create_store first.")
        if k is None:
            return self.store.as_retriever()
        return self.store.as_retriever(search_kwargs={"k": k})
    
# Build the notebook-wide vector store from the chunks produced by the splitter.
storage = Storage()
storage.create_store(documents=splits)

  embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [19]:
def rerank_docs(reranker, query, docs, top_k=3, verbose=True):
    """Re-order retrieved documents by reranker score and keep the best ones.

    Args:
        reranker: Object exposing ``predict(pairs)`` that returns one score per
            ``(query, text)`` pair; higher scores are ranked first.
        query: The user question each document is scored against.
        docs: Retrieved documents, each exposing a ``page_content`` string.
        top_k: Maximum number of documents to return.
        verbose: When True (the default, matching the previous behaviour),
            print the retrieved candidates and the reranked selection.

    Returns:
        A list with at most ``top_k`` documents, best score first. An empty
        list is returned, without calling the reranker, when ``docs`` is empty.
    """
    docs = list(docs)
    if not docs:
        # Nothing was retrieved, so there is nothing to score or print.
        return []

    pairs = [(query, doc.page_content) for doc in docs]
    scores = reranker.predict(pairs)

    # sorted() is stable, so equally scored documents keep their retrieval order.
    ranked = sorted(zip(docs, scores), key=lambda item: item[1], reverse=True)
    reranked_docs = [doc for doc, _ in ranked[:top_k]]

    if verbose:
        def wrap_text(text, width=80):
            # Wrap each line separately so existing line breaks survive.
            return "\n".join(textwrap.fill(line, width=width) for line in text.splitlines())

        def preview(text, limit=500):
            # Truncate long documents and mark the cut with an ellipsis.
            if len(text) > limit:
                return text[:limit] + "..."
            return text

        print("\n=== Original Retrieved Documents (query + content) ===")
        for i, (q, doc_text) in enumerate(pairs):
            print(f"\n[Doc {i+1}] Query:")
            print(wrap_text(q))
            print("Content:")
            print(wrap_text(preview(doc_text)))

        print("\n=== Reranked Documents (Top K) ===")
        for i, doc in enumerate(reranked_docs):
            print(f"\n[Reranked Doc {i+1}]")
            print(wrap_text(preview(doc.page_content)))

    return reranked_docs

In [23]:
def make_reranking_retriever(reranker_model, top_k=3):
    """Build a runnable that retrieves documents and then reranks them.

    Args:
        reranker_model: Reranker handed to ``rerank_docs`` on every call.
        top_k: How many reranked documents to keep. Defaults to 3, the value
            that was previously hard-coded.

    Returns:
        A ``RunnableLambda`` that maps a query string to the reranked documents.
    """
    def retrieve_and_rerank(query: str):
        # The retriever is looked up from the module-level `storage` on every
        # call, so a store that is rebuilt later is picked up automatically.
        retriever = storage.get_retriever()
        initial_docs = retriever.invoke(query)
        # NOTE(review): reranking only helps if the retriever returns more than
        # `top_k` candidates; confirm the retriever's fetch size.
        return rerank_docs(reranker_model, query, initial_docs, top_k=top_k)
    return RunnableLambda(retrieve_and_rerank)

# Instantiate the cross-encoder and wrap retrieval + reranking in one runnable.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
reranking_retriever = make_reranking_retriever(reranker_model=reranker)

## INVOKE RAG

In [24]:
# Prompt that restricts the model to the retrieved context.
template = """
You are an expert assistant that answers questions strictly based on the provided context.  
Follow these rules:

1. Use only the information in the context below to answer the question.
2. If the context does not contain enough information, respond with: "I don't know based on the provided context."
3. Keep your answer concise, clear, and factual.
4. Do not speculate, infer beyond the text, or use outside knowledge.

Context:
{context}

Question: {question}

Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# The API key is read from the `gemini_key` environment variable; os.getenv
# returns None when that variable is not set.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash", 
    google_api_key=os.getenv("gemini_key")
)


def format_docs(docs):
    """Join the text of the reranked documents into a single context string.

    Without this step the prompt's {context} slot would be filled with the
    repr of a list of Document objects (metadata included) rather than with
    the passage text itself.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# question -> (reranked context text, question) -> prompt -> LLM -> plain string
rag_chain = (
    {
        "context": reranking_retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)


In [26]:
# Ask the chain a question; it returns the answer as a plain string.
result = rag_chain.invoke("How many false statements did trump make?")

# Wrap each line separately: textwrap.fill on the whole answer would collapse
# its newlines and merge paragraphs or list items into one block of text.
wrapped = "\n".join(textwrap.fill(line, width=80) for line in result.splitlines())
print("\n=== RAG RESULT ===")
print(wrapped)


=== Original Retrieved Documents (query + content) ===

[Doc 1] Query:
How many false statements did trump make?
Content:
Commentary and analysis
As president, Trump frequently made false statements in public speeches and
remarks.[167][137][168][169] Trump uttered "at least one false or misleading
claim per day on 91 of his first 99 days" in office according to The New York
Times,[167] and 1,318 total in his first 263 days in office according to the
"Fact Checker" political analysis column of The Washington Post.[170] By the
Post's tally, it took Trump 601 days to reach 5,000 false or misleading
statements and anot...

[Doc 2] Query:
How many false statements did trump make?
Content:
During his 2024 presidential campaign, Trump has made numerous false and
misleading statements.[477][478][479] The large amount of lies and false
statements have been attributed to Trump's rhetorical style described as using
the big lie and firehose of falsehood propaganda technique.[21] During a 64
minut