In [3]:
!pip install langchain chromadb faiss-cpu langchain_google_genai langchain_community tiktoken wikipedia

Collecting chromadb
  Using cached chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.8-py3-none-any.whl.metadata (7.0 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinu

# Wikipedia Retriever

In [4]:
from langchain_community.retrievers import WikipediaRetriever

In [7]:
# initialize the retriver (optionally can select the num of results and languge of result)

retriver = WikipediaRetriever(
    top_k_results=2,
    lang='en'
)

In [8]:
# define your querry
querry = "The geopolitical history of Pakistan and India from Chinese perspective"

# get the relevent Wikipedia documents
docs = retriver.invoke(querry)

In [9]:
docs

[Document(metadata={'title': 'United States aid to Pakistan', 'summary': 'The United States has been providing military aid and economic assistance to Pakistan for various purposes since 1948. In 2017, the U.S. stopped military aid to Pakistan, which was about US$2 billion per year. With U.S. military assistance suspended in 2018 and civilian aid reduced to about $300 million for 2022, Pakistani authorities have turned to other countries for help.', 'source': 'https://en.wikipedia.org/wiki/United_States_aid_to_Pakistan'}, page_content="The United States has been providing military aid and economic assistance to Pakistan for various purposes since 1948. In 2017, the U.S. stopped military aid to Pakistan, which was about US$2 billion per year. With U.S. military assistance suspended in 2018 and civilian aid reduced to about $300 million for 2022, Pakistani authorities have turned to other countries for help.\n\n\n== History ==\nFrom 1947 to 1958, under civilian leadership, the United Sta

In [11]:
# print the retrived content
for i, doc in enumerate(docs):
  print(f"\n___________Result {i+1}___________ ")
  print(doc.page_content) # turncate for display


___________Result 1___________ 
The United States has been providing military aid and economic assistance to Pakistan for various purposes since 1948. In 2017, the U.S. stopped military aid to Pakistan, which was about US$2 billion per year. With U.S. military assistance suspended in 2018 and civilian aid reduced to about $300 million for 2022, Pakistani authorities have turned to other countries for help.


== History ==
From 1947 to 1958, under civilian leadership, the United States provided Pakistan with modest economic aid and limited military assistance. During this period, Pakistan became a member of the South East Asian Treaty Organization (SEATO) and the Central Treaty Organization (CENTO), after a Mutual Defence Assistance Agreement signed in May 1954, which facilitated increased levels of both economic and military aid from the U.S.
In 1958, Ayub Khan led Pakistan's first military coup, becoming Chief Martial Law Administrator (CMLA) and later President until 1969. During hi

# Vector Store Retriever

In [18]:
from langchain.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.documents import Document

In [14]:
from google.colab import userdata
api_key = userdata.get('GOOGLE_API_KEY')

In [13]:
# Step1: Source documents
documents = [
    Document(page_content="Lahore is the cultural capital of Pakistan, known for its rich history, vibrant arts, and historical architecture. Home to landmarks like the Badshahi Mosque, Lahore Fort, and Shalimar Gardens, the city reflects the grandeur of the Mughal era. It’s also famous for its food streets, educational institutions like LUMS and Punjab University, and lively festivals that attract people from all over the country."),
    Document(page_content="Karachi, the largest city of Pakistan, is the country's economic engine and main seaport. It’s a melting pot of cultures, home to a vibrant business environment, beautiful beaches like Clifton and Hawksbay, and key institutions like the Karachi Stock Exchange and major media houses."),
    Document(page_content="Islamabad, the capital city of Pakistan, is known for its modern architecture, organized layout, and lush greenery. It houses key government buildings, the iconic Faisal Mosque, and several parks and museums. It offers a peaceful contrast to the hustle of other big cities."),
    Document(page_content="Faisalabad is a major industrial center of Pakistan, often referred to as the Manchester of Pakistan due to its thriving textile industry. It plays a significant role in the country’s economy and is also known for its clock tower and historical markets."),
]

In [19]:
# Step2: Initialize embedding model

embedding_model = GoogleGenerativeAIEmbeddings(
    model = 'models/embedding-001',
    google_api_key = api_key
)


In [20]:
# Step3: Creating chroma vector store in memory

vectorstore = Chroma.from_documents(
    documents= documents,
    embedding= embedding_model,
    collection_name= 'retriver_collection'
)

In [21]:
# step4: convert vector store into retriver
retriver = vectorstore.as_retriever(search_kwargs={'k': 2})

In [22]:
querry = 'capital of Pakistan?'
results = retriver.invoke(querry)

In [23]:
results

[Document(metadata={}, page_content='Islamabad, the capital city of Pakistan, is known for its modern architecture, organized layout, and lush greenery. It houses key government buildings, the iconic Faisal Mosque, and several parks and museums. It offers a peaceful contrast to the hustle of other big cities.'),
 Document(metadata={}, page_content="Karachi, the largest city of Pakistan, is the country's economic engine and main seaport. It’s a melting pot of cultures, home to a vibrant business environment, beautiful beaches like Clifton and Hawksbay, and key institutions like the Karachi Stock Exchange and major media houses.")]

In [25]:
for i, doc in enumerate(results):
  print(f'\n______ Result {i+1} ______')
  print(doc.page_content)


______ Result 1 ______
Islamabad, the capital city of Pakistan, is known for its modern architecture, organized layout, and lush greenery. It houses key government buildings, the iconic Faisal Mosque, and several parks and museums. It offers a peaceful contrast to the hustle of other big cities.

______ Result 2 ______
Karachi, the largest city of Pakistan, is the country's economic engine and main seaport. It’s a melting pot of cultures, home to a vibrant business environment, beautiful beaches like Clifton and Hawksbay, and key institutions like the Karachi Stock Exchange and major media houses.


# MMR (Maximal Marginal Relevance)

In [26]:
# Each document is a short paragraph focused on a unique angle of the topic "AI in Healthcare."
docs = [
    Document(
        page_content="Artificial Intelligence is revolutionizing medical imaging by enabling faster and more accurate diagnoses. Algorithms can detect anomalies in X-rays, CT scans, and MRIs with precision that rivals human radiologists. This not only reduces diagnostic errors but also speeds up patient treatment.",
        metadata={"source": "Medical Imaging", "topic": "AI in Healthcare"}
    ),
    Document(
        page_content="Virtual health assistants powered by AI are becoming essential tools for patient engagement. These assistants can provide reminders for medication, answer common health questions, and schedule appointments, thereby reducing the burden on healthcare staff and improving patient care accessibility.",
        metadata={"source": "Virtual Assistants", "topic": "AI in Healthcare"}
    ),
    Document(
        page_content="Hospitals are using AI for predictive analytics to forecast patient admissions, optimize bed occupancy, and manage resources efficiently. This predictive approach helps avoid overcrowding and ensures better planning for surgeries and emergency care.",
        metadata={"source": "Hospital Management", "topic": "AI in Healthcare"}
    ),
    Document(
        page_content="While AI promises major improvements in healthcare, it also raises ethical concerns around data privacy, algorithmic bias, and informed consent. Ensuring fairness and transparency in AI-driven decisions is a key challenge for developers and healthcare institutions alike.",
        metadata={"source": "Ethics", "topic": "AI in Healthcare"}
    ),
    Document(
        page_content="AI accelerates the drug discovery process by identifying potential compounds and predicting their effectiveness before clinical trials. Companies like DeepMind and Insilico Medicine use AI models to significantly reduce time and cost in pharmaceutical development.",
        metadata={"source": "Drug Discovery", "topic": "AI in Healthcare"}
    )
]


In [29]:
from langchain_community.vectorstores.faiss import FAISS

# initalize embedding model
embedding_model = GoogleGenerativeAIEmbeddings(
    model = 'models/embedding-001',
    google_api_key = api_key
)

# create faiss vector store from documents
vectorstore = FAISS.from_documents(
    documents= docs,
    embedding= embedding_model
)


In [33]:
# Enable MMR in the retriver
retriver = vectorstore.as_retriever(
    search_type='mmr',
    search_kwargs={
        'k': 3,                   # k is the top results
        'lambda_mult': 1        # lambda_mult = relevence-diversity balance
    }
)

In [34]:
querry = 'How is AI used to manage hospitals more efficiently?'
results = retriver.invoke(querry)

In [35]:
for i, doc in enumerate(results):
  print(f'\n______ Result {i+1} ______')
  print(doc.page_content)


______ Result 1 ______
Hospitals are using AI for predictive analytics to forecast patient admissions, optimize bed occupancy, and manage resources efficiently. This predictive approach helps avoid overcrowding and ensures better planning for surgeries and emergency care.

______ Result 2 ______
Artificial Intelligence is revolutionizing medical imaging by enabling faster and more accurate diagnoses. Algorithms can detect anomalies in X-rays, CT scans, and MRIs with precision that rivals human radiologists. This not only reduces diagnostic errors but also speeds up patient treatment.

______ Result 3 ______
AI accelerates the drug discovery process by identifying potential compounds and predicting their effectiveness before clinical trials. Companies like DeepMind and Insilico Medicine use AI models to significantly reduce time and cost in pharmaceutical development.


# Multi-Querry Retriver

In [38]:
from langchain.vectorstores.faiss import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.retrievers.multi_query import MultiQueryRetriever

In [39]:
#  10 documents that are about different topics (AI, blockchain, Java, etc.) but written in a similar tone, structure, and length so they seem similar — making it ideal for testing MMR, as the retriever will need to disentangle similar-looking content across different topics.
docs = [
    Document(
        page_content="This technology enables secure, decentralized transactions without needing a central authority. By maintaining a shared ledger across multiple nodes, it enhances transparency and reduces fraud in digital systems.",
        metadata={"topic": "Blockchain"}
    ),
    Document(
        page_content="This concept creates an immersive digital environment where users interact using avatars. It blends virtual and augmented reality to provide new experiences in social interaction, gaming, and collaboration.",
        metadata={"topic": "Metaverse"}
    ),
    Document(
        page_content="This field of computer science focuses on building systems that can learn from data. These systems make decisions, detect patterns, and automate complex tasks in a range of real-world applications.",
        metadata={"topic": "Artificial Intelligence"}
    ),
    Document(
        page_content="This programming language is known for its portability, stability, and object-oriented features. It is widely used in enterprise software, Android apps, and backend systems across industries.",
        metadata={"topic": "Java"}
    ),
    Document(
        page_content="This global sport involves two teams competing to score by sending a ball into the opposing goal. It is celebrated worldwide, especially during major tournaments watched by millions.",
        metadata={"topic": "Football"}
    ),
    Document(
        page_content="This discipline focuses on protecting digital systems from unauthorized access and attacks. It employs encryption, firewalls, and authentication to safeguard data and maintain privacy.",
        metadata={"topic": "Cybersecurity"}
    ),
    Document(
        page_content="This environmental issue is caused by rising global temperatures due to human activity. It leads to extreme weather events, rising sea levels, and long-term changes in ecosystems.",
        metadata={"topic": "Climate Change"}
    ),
    Document(
        page_content="This scientific pursuit involves sending machines beyond Earth to collect data and explore unknown environments. It seeks to expand our knowledge of the universe and inspire innovation.",
        metadata={"topic": "Space Exploration"}
    ),
    Document(
        page_content="This computing paradigm leverages quantum mechanics to process information in novel ways. It has the potential to solve problems far beyond the capabilities of classical computers.",
        metadata={"topic": "Quantum Computing"}
    ),
    Document(
        page_content="This area of health focuses on emotional and psychological well-being. It includes managing stress, treating mental illnesses, and promoting open discussions about cognitive health.",
        metadata={"topic": "Mental Health"}
    )
]


In [40]:
# initalize embedding model
embedding_model = GoogleGenerativeAIEmbeddings(
    model = 'models/embedding-001',
    google_api_key = api_key
)

# create FAISS vector store
vectorstore = FAISS.from_documents(
    documents= docs,
    embedding= embedding_model
)


In [47]:
# creating similarity and multi-querry retriver

similarity_retriver = vectorstore.as_retriever(
    search_type='similarity',
    search_kwargs={
        'k': 5
    }
)

multi_querry_retriver = MultiQueryRetriever.from_llm(
    retriever= vectorstore.as_retriever(search_kwargs={'k':5}),
    llm= ChatGoogleGenerativeAI(
        model= 'gemini-2.0-flash',
        google_api_key= api_key
    ),
)

In [51]:
# vague Querry
query = "What are the major challenges and innovations happening in the digital world today?"

In [52]:
# Retriver Results
similarity_results = similarity_retriver.invoke(query)
multi_querry_results = multi_querry_retriver.invoke(query)

In [53]:
# Print comparison
print("Similarity Retriever Results:")
for i, doc in enumerate(similarity_results):
    print(f"\nDoc {i} ({doc.metadata['topic']}):\n{doc.page_content}")

print("\n" + "="*80 + "\n")

print("Multi-Query Retriever Results:")
for i, doc in enumerate(multi_querry_results):
    print(f"\nDoc {i} ({doc.metadata['topic']}):\n{doc.page_content}")

Similarity Retriever Results:

Doc 0 (Blockchain):
This technology enables secure, decentralized transactions without needing a central authority. By maintaining a shared ledger across multiple nodes, it enhances transparency and reduces fraud in digital systems.

Doc 1 (Cybersecurity):
This discipline focuses on protecting digital systems from unauthorized access and attacks. It employs encryption, firewalls, and authentication to safeguard data and maintain privacy.

Doc 2 (Artificial Intelligence):
This field of computer science focuses on building systems that can learn from data. These systems make decisions, detect patterns, and automate complex tasks in a range of real-world applications.

Doc 3 (Quantum Computing):
This computing paradigm leverages quantum mechanics to process information in novel ways. It has the potential to solve problems far beyond the capabilities of classical computers.

Doc 4 (Metaverse):
This concept creates an immersive digital environment where users 

# Contextual Compression Retriever

In [54]:
from langchain.vectorstores.faiss import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever

In [56]:

docs = [
    Document(
        page_content=(
            "The Eiffel Tower is located in Paris and is a symbol of France. "
            "It attracts millions of tourists every year. "
            "Python is a high-level programming language used in web development, data science, and automation."
        ),
        metadata={"source": "doc1"}
    ),
    Document(
        page_content=(
            "Neil Armstrong was the first human to walk on the moon in 1969. "
            "Basketball is a sport played by two teams of five players on a rectangular court. "
            "Space missions have contributed to many technological advancements."
        ),
        metadata={"source": "doc2"}
    ),
    Document(
        page_content=(
            "The Great Wall of China is a historic structure that spans over 13,000 miles. "
            "It was built for defense purposes. "
            "Machine learning is a subset of artificial intelligence that allows systems to learn from data."
        ),
        metadata={"source": "doc3"}
    ),
    Document(
        page_content=(
            "Photosynthesis occurs in plant cells using chlorophyll to convert sunlight into energy. "
            "The Amazon rainforest is known for its biodiversity and plays a role in climate regulation. "
            "Java is a widely used programming language in enterprise applications."
        ),
        metadata={"source": "doc4"}
    ),
    Document(
        page_content=(
            "Climate change is causing rising sea levels and more extreme weather patterns. "
            "In 1969, NASA’s Apollo 11 mission successfully landed humans on the moon. "
            "HTML and CSS are used for structuring and styling web pages."
        ),
        metadata={"source": "doc5"}
    ),
]


In [57]:
# creating a FAISS vector store
embedding_model = GoogleGenerativeAIEmbeddings(
    model = 'models/embedding-001',
    google_api_key = api_key
)

vectorstore = FAISS.from_documents(
    documents= docs,
    embedding= embedding_model
)

In [58]:
base_retriever = vectorstore.as_retriever(search_kwargs={'k': 3})

In [59]:
# setting up compressor using llm
compressor = LLMChainExtractor.from_llm(
    llm= ChatGoogleGenerativeAI(
        model= 'gemini-2.0-flash',
        google_api_key= api_key
    )
)

In [60]:
# create contextula compressor retriver
compression_retriever = ContextualCompressionRetriever(
    base_compressor= compressor,
    base_retriever= base_retriever
)


In [71]:
querry = "Tell me about the moon landing and space missions."
compressed_results = compression_retriever.invoke(querry)

In [72]:
for i, doc in enumerate(compressed_results):
  print(f'\n______ Result {i+1} ______')
  print(doc.page_content)


______ Result 1 ______
Neil Armstrong was the first human to walk on the moon in 1969. Space missions have contributed to many technological advancements.

______ Result 2 ______
In 1969, NASA’s Apollo 11 mission successfully landed humans on the moon.
