In [None]:
"""
This script demonstrates a local, open-source contextual document retriever using FAISS (the embedding model is downloaded from the Hugging Face Hub on first run).
It embeds a list of sample documents using Hugging Face's MiniLM model (CPU-based).
FAISS is used to store and retrieve similar documents based on a user query.
The script simulates contextual compression by filtering document content using keyword matching.
A simple `keyword_compressor` function keeps only the lines of each retrieved doc that contain a keyword.
The goal is to mimic LLM-powered compression without using external APIs or cloud models.
It retrieves up to 5 of the most similar documents (the sample set has only 4) and then compresses them using predefined keywords.
The final results display only lines that mention concepts related to "photosynthesis."
This approach is suitable for lightweight applications where LLM usage is not feasible.
All components run locally on CPU and need no API keys; the only requirements are the pip-installed open-source packages.
"""

In [1]:
!pip install langchain langchain-community faiss-cpu sentence-transformers

Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.0-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 k

In [2]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [3]:
# Simulated text compression (very basic keyword filter)
def keyword_compressor(documents, keywords):
    """Keep only the lines of each document that mention at least one keyword.

    Matching is a case-insensitive substring test applied line by line to
    ``doc.page_content``. Documents with no matching line are dropped.

    Args:
        documents: Iterable of ``Document`` objects to filter.
        keywords: Iterable of keyword strings. A single string is treated as
            one keyword rather than being iterated character by character.
            Empty keywords are ignored, because ``"" in line`` is always true
            and would keep every line.

    Returns:
        A list of new ``Document`` objects, in input order. Each has the
        matching lines (whitespace-stripped) joined by newlines as its
        ``page_content`` and a shallow copy of the source document's
        ``metadata``.
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    # Lower-case the keywords once instead of once per line per keyword.
    needles = [kw.lower() for kw in keywords if kw]

    compressed_docs = []
    if not needles:
        return compressed_docs

    for doc in documents:
        matched_lines = []
        # splitlines() also recognises "\r\n" and "\r" line endings.
        for line in doc.page_content.splitlines():
            lowered = line.lower()
            if any(needle in lowered for needle in needles):
                matched_lines.append(line.strip())
        if matched_lines:
            compressed_docs.append(Document(
                page_content="\n".join(matched_lines),
                # Copy so that editing the compressed document's metadata
                # does not silently change the retrieved document's metadata.
                metadata=dict(doc.metadata or {}),
            ))
    return compressed_docs

In [4]:
# Step 1: Define documents
# Each (source, text) pair below becomes one Document tagged with its source id.
# Doc1, Doc2 and Doc4 each hide one photosynthesis-related sentence among
# unrelated ones; Doc3 contains none. The continuation lines inside the
# triple-quoted strings keep their leading spaces as part of the text.
docs = [
    Document(page_content=text, metadata={"source": source})
    for source, text in [
        ("Doc1", """The Grand Canyon is one of the most visited natural wonders in the world.
        Photosynthesis is the process by which green plants convert sunlight into energy.
        Millions of tourists travel to see it every year. The rocks date back millions of years."""),
        ("Doc2", """In medieval Europe, castles were built primarily for defense.
        The chlorophyll in plant cells captures sunlight during photosynthesis.
        Knights wore armor made of metal. Siege weapons were often used to breach castle walls."""),
        ("Doc3", """Basketball was invented by Dr. James Naismith in the late 19th century.
        It was originally played with a soccer ball and peach baskets. NBA is now a global league."""),
        ("Doc4", """The history of cinema began in the late 1800s. Silent films were the earliest form.
        Thomas Edison was among the pioneers. Photosynthesis does not occur in animal cells.
        Modern filmmaking involves complex CGI and sound design."""),
    ]
]

In [5]:
# Step 2: Use HuggingFaceEmbeddings (local, CPU)
# Instantiating this downloads the model files from the Hugging Face Hub if
# they are not already available locally.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    # Pin inference to the CPU so the notebook behaves as described even when
    # the runtime has a GPU that would otherwise be picked up automatically.
    model_kwargs={"device": "cpu"},
)

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Step 3: Create FAISS vector store
# Embed every sample document with the embedding model and index the vectors.
vectorstore = FAISS.from_documents(
    documents=docs,
    embedding=embedding_model,
)

In [7]:
# Step 4: Create base retriever
# Request up to 5 nearest documents per query.
base_retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=5),
)

In [8]:
# Step 5: Simulate contextual compression (filter by keywords)
# The query drives the similarity search; the keywords decide which lines of
# the retrieved documents survive the compression step.
query = "What is photosynthesis?"
keywords = [
    "photosynthesis",
    "chlorophyll",
    "sunlight",
    "plant",
]

In [9]:
# Fetch candidate documents for the query, then keep only their keyword-bearing lines.
retrieved_docs = base_retriever.invoke(query)
compressed_results = keyword_compressor(
    documents=retrieved_docs,
    keywords=keywords,
)

In [10]:
# Step 6: Show compressed results
# Print a numbered banner followed by the compressed text of each surviving document.
for i, doc in enumerate(compressed_results):
    print(f"\n--- Result {i+1} ---", doc.page_content, sep="\n")


--- Result 1 ---
Photosynthesis is the process by which green plants convert sunlight into energy.

--- Result 2 ---
The chlorophyll in plant cells captures sunlight during photosynthesis.

--- Result 3 ---
Thomas Edison was among the pioneers. Photosynthesis does not occur in animal cells.
