In [None]:
"""
This script demonstrates a fully open-source, CPU-compatible document retrieval system.
It uses LangChain with FAISS as the vector store and Hugging Face's MiniLM model for embeddings.
A collection of health and general knowledge documents is defined and embedded locally.
The embeddings are stored in FAISS to enable efficient vector similarity search.
The script supports two retrieval modes: single-query and simulated multi-query retrieval.
Multiple rephrased versions of a question are used to mimic a MultiQueryRetriever without an LLM.
This enhances the diversity and coverage of results for complex queries.
Everything runs on CPU and no API keys are required; the only network access is the one-time download of the public embedding model from the Hugging Face Hub.
Search results for both similarity and multi-query modes are printed to the console.
The setup is ideal for building offline semantic search or question-answering systems.
"""

In [1]:
!pip install langchain langchain-community faiss-cpu sentence-transformers

Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.0-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 k

In [2]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [3]:
# Step 1: Build the demo corpus as LangChain Document objects.
# Sources H1-H5 are health & wellness statements. Sources I1-I5 cover unrelated
# topics but still mention words such as "energy" or "balance", so they act as
# distractors for the retriever.
all_docs = [
    Document(page_content=text, metadata={"source": source_id})
    for source_id, text in [
        ("H1", "Regular walking boosts heart health and can reduce symptoms of depression."),
        ("H2", "Consuming leafy greens and fruits helps detox the body and improve longevity."),
        ("H3", "Deep sleep is crucial for cellular repair and emotional regulation."),
        ("H4", "Mindfulness and controlled breathing lower cortisol and improve mental clarity."),
        ("H5", "Drinking sufficient water throughout the day helps maintain metabolism and energy."),
        ("I1", "The solar energy system in modern homes helps balance electricity demand."),
        ("I2", "Python balances readability with power, making it a popular system design language."),
        ("I3", "Photosynthesis enables plants to produce energy by converting sunlight."),
        ("I4", "The 2022 FIFA World Cup was held in Qatar and drew global energy and excitement."),
        ("I5", "Black holes bend spacetime and store immense gravitational energy."),
    ]
]

In [4]:
# Step 2: Load the all-MiniLM-L6-v2 sentence-embedding model on the CPU.
# The device is set explicitly so the notebook stays CPU-only, as its
# description promises, even on a machine where a GPU is available.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Step 3: Embed every document in the corpus and index the vectors in an
# in-memory FAISS store.
vectorstore = FAISS.from_documents(all_docs, embedding_model)

In [6]:
# Step 4: Wrap the store in a plain similarity retriever that returns the
# five nearest documents for each query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

In [7]:
# Step 5: Hand-written paraphrases of the main question.
# They stand in for the query variants an LLM-backed multi-query retriever
# would generate, so no LLM is needed here.
multi_queries = [
    "How to boost your energy naturally?",
    "Ways to maintain balance in life?",
    "Tips for increasing energy levels daily?",
]

In [8]:
# Step 6: Baseline run: send the main question through the retriever once.
query = "How to improve energy levels and maintain balance?"
similarity_results = retriever.invoke(input=query)

In [9]:
# Step 7: Prepare the accumulators for the multi-query run.
#   multiquery_results - merged hits, kept in the order they are first seen
#   seen_docs          - page_content strings already collected (dedup key)
multiquery_results, seen_docs = [], set()

In [10]:
# Run every paraphrased query through the retriever and merge the hits.
# The accumulators are reset here so this cell is self-contained: re-running it
# after editing `multi_queries` rebuilds the merged list from scratch instead of
# keeping hits from an earlier run.
multiquery_results = []
seen_docs = set()

for q in multi_queries:
    results = retriever.invoke(q)
    for doc in results:
        # Deduplicate on the text itself; a document returned by several
        # queries is kept only at the position where it first appeared.
        if doc.page_content in seen_docs:
            continue
        seen_docs.add(doc.page_content)
        multiquery_results.append(doc)

In [11]:
# Step 8: Show the single-query hits, numbered from 1 in the order the
# retriever returned them.
print("\n### Similarity Retriever Results ###")
for i, doc in enumerate(similarity_results):
    # One call emits the header line followed by the document text.
    print(f"\n--- Result {i+1} ---", doc.page_content, sep="\n")


### Similarity Retriever Results ###

--- Result 1 ---
Drinking sufficient water throughout the day helps maintain metabolism and energy.

--- Result 2 ---
The solar energy system in modern homes helps balance electricity demand.

--- Result 3 ---
Consuming leafy greens and fruits helps detox the body and improve longevity.

--- Result 4 ---
Mindfulness and controlled breathing lower cortisol and improve mental clarity.

--- Result 5 ---
Photosynthesis enables plants to produce energy by converting sunlight.


In [12]:
# Visual divider between the two result listings: a blank line, a row of
# 150 asterisks, then two more newlines.
print("", "*" * 150, "", sep="\n")


******************************************************************************************************************************************************



In [13]:
# Step 9: Show the merged, de-duplicated hits from the paraphrased queries,
# numbered from 1 in the order they were collected.
print("### Multi-Query Retriever (Simulated) Results ###")
for i, doc in enumerate(multiquery_results):
    # One call emits the header line followed by the document text.
    print(f"\n--- Result {i+1} ---", doc.page_content, sep="\n")

### Multi-Query Retriever (Simulated) Results ###

--- Result 1 ---
Drinking sufficient water throughout the day helps maintain metabolism and energy.

--- Result 2 ---
Photosynthesis enables plants to produce energy by converting sunlight.

--- Result 3 ---
Consuming leafy greens and fruits helps detox the body and improve longevity.

--- Result 4 ---
Mindfulness and controlled breathing lower cortisol and improve mental clarity.

--- Result 5 ---
Regular walking boosts heart health and can reduce symptoms of depression.

--- Result 6 ---
The solar energy system in modern homes helps balance electricity demand.
