In [None]:
"""
This script demonstrates how to create a local vector store using LangChain and FAISS.
It encodes IPL player profiles into embeddings using a Hugging Face model that runs on CPU.
The script stores these embeddings in a FAISS index, which is saved locally to disk.
It supports adding documents, saving and reloading the index, and performing similarity search.
FAISS offers no in-place update of a stored document, and metadata filtering and deletion are limited, so such changes are handled manually.
A simulation of document update is included by rebuilding the index with modified content.
The HuggingFace embedding model used is 'all-MiniLM-L6-v2', suitable for semantic similarity.
This example is useful for small RAG applications, search tools, or offline LLM pipelines.
After a one-time download of the embedding model from the Hugging Face Hub, it requires no internet access or external APIs, making it completely open-source and portable.
The script prints top search results and metadata after creation and update of the index.
"""

In [1]:
pip install langchain faiss-cpu sentence-transformers langchain-community

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.0-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 k

In [2]:
# NOTE(review): langchain_community's HuggingFaceEmbeddings appears to be
# deprecated in favour of the separate `langchain-huggingface` package; it is
# kept here because that package is not installed by this notebook.
from langchain_community.embeddings import HuggingFaceEmbeddings
# Import FAISS and Document from the packages that define them rather than
# through the deprecated `langchain.vectorstores` / `langchain.schema` paths.
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

In [3]:
# Step 1: Create Documents
# Each (profile text, team) pair below becomes one Document: the text is the
# page content and the team name is stored as metadata under the "team" key.
docs = [
    Document(page_content=profile, metadata={"team": team})
    for profile, team in [
        (
            "Virat Kohli is one of the most successful and consistent batsmen in IPL history. Known for his aggressive batting style and fitness, he has led the Royal Challengers Bangalore in multiple seasons.",
            "Royal Challengers Bangalore",
        ),
        (
            "Rohit Sharma is the most successful captain in IPL history, leading Mumbai Indians to five titles. He's known for his calm demeanor and ability to play big innings under pressure.",
            "Mumbai Indians",
        ),
        (
            "MS Dhoni, famously known as Captain Cool, has led Chennai Super Kings to multiple IPL titles. His finishing skills, wicketkeeping, and leadership are legendary.",
            "Chennai Super Kings",
        ),
        (
            "Jasprit Bumrah is considered one of the best fast bowlers in T20 cricket. Playing for Mumbai Indians, he is known for his yorkers and death-over expertise.",
            "Mumbai Indians",
        ),
        (
            "Ravindra Jadeja is a dynamic all-rounder who contributes with both bat and ball. Representing Chennai Super Kings, his quick fielding and match-winning performances make him a key player.",
            "Chennai Super Kings",
        ),
    ]
]

In [4]:
# Step 2: Initialize HuggingFace Embeddings (CPU)
# The first run downloads the model files from the Hugging Face Hub.
# model_kwargs is forwarded to the underlying sentence-transformers model;
# setting device="cpu" makes the "(CPU)" intent explicit instead of leaving the
# device to be auto-selected (which would pick a GPU when one is available).
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Step 3: Create FAISS vector store
# Embeds every document in `docs` with `embedding_model` and builds the
# in-memory FAISS index from the resulting vectors.
vector_store = FAISS.from_documents(
    documents=docs,
    embedding=embedding_model,
)

In [6]:
# Step 4: Save the FAISS index locally
# Writes the index into the "faiss_ipl_index" folder, relative to the current
# working directory.
vector_store.save_local(folder_path="faiss_ipl_index")

In [10]:
# Step 5: Perform similarity search
# Embed the question and print the two closest stored profiles together with
# their metadata.
print("\nTop 2 similar documents for query: 'Who among these are a bowler?'")
results = vector_store.similarity_search("Who among these are a bowler?", k=2)
for rank, match in enumerate(results, start=1):
    print(f"\n--- Result {rank} ---\n{match.page_content}\nMetadata: {match.metadata}")


Top 2 similar documents for query: 'Who among these are a bowler?'

--- Result 1 ---
Jasprit Bumrah is considered one of the best fast bowlers in T20 cricket. Playing for Mumbai Indians, he is known for his yorkers and death-over expertise.
Metadata: {'team': 'Mumbai Indians'}

--- Result 2 ---
Rohit Sharma is the most successful captain in IPL history, leading Mumbai Indians to five titles. He's known for his calm demeanor and ability to play big innings under pressure.
Metadata: {'team': 'Mumbai Indians'}


In [11]:
# NOTE:
# There is no in-place "update" for a document already stored in the index.
# NOTE(review): the LangChain FAISS wrapper does appear to offer
# `delete(ids=...)`, `add_documents(...)` and a `filter=` argument on
# similarity_search -- confirm against the installed version. This notebook
# takes the simplest route and rebuilds the whole index from a modified list.

# Step 6: Manual "update" simulation by recreating the index with the updated document
# Work on a shallow copy so the original `docs` list keeps the old profile,
# then swap in the rewritten Virat Kohli profile at position 0.
updated_docs = list(docs)
updated_docs[0] = Document(
    metadata={"team": "Royal Challengers Bangalore"},
    page_content="Virat Kohli, the former captain of Royal Challengers Bangalore (RCB), is renowned for his aggressive leadership and consistent batting performances. He holds the record for the most runs in IPL history, including multiple centuries in a single season. Despite RCB not winning an IPL title under his captaincy, Kohli's passion and fitness set a benchmark for the league. His ability to chase targets and anchor innings has made him one of the most dependable players in T20 cricket.",
)

In [12]:
# Rebuild FAISS index with updated document
# `vector_store` is rebound to a brand-new index; every entry in
# `updated_docs` is embedded again, not just the changed one.
vector_store = FAISS.from_documents(
    documents=updated_docs,
    embedding=embedding_model,
)

In [13]:
# Step 7: Save updated index
# Same folder as the Step 4 save, so the rebuilt index takes the place of the
# one written earlier.
vector_store.save_local(folder_path="faiss_ipl_index")

In [14]:
# Step 8: Display final index
# This is a top-2 similarity search on the team name, not a listing of every
# stored document; no metadata filter is applied, so the ranking comes purely
# from embedding similarity to the query text.
print("\nFinal Document Set After Update:")
results = vector_store.similarity_search("Royal Challengers Bangalore", k=2)
for position, hit in enumerate(results, start=1):
    print(f"\n--- Doc {position} ---\n{hit.page_content}\nMetadata: {hit.metadata}")


Final Document Set After Update:

--- Doc 1 ---
Ravindra Jadeja is a dynamic all-rounder who contributes with both bat and ball. Representing Chennai Super Kings, his quick fielding and match-winning performances make him a key player.
Metadata: {'team': 'Chennai Super Kings'}

--- Doc 2 ---
Virat Kohli, the former captain of Royal Challengers Bangalore (RCB), is renowned for his aggressive leadership and consistent batting performances. He holds the record for the most runs in IPL history, including multiple centuries in a single season. Despite RCB not winning an IPL title under his captaincy, Kohli's passion and fitness set a benchmark for the league. His ability to chase targets and anchor innings has made him one of the most dependable players in T20 cricket.
Metadata: {'team': 'Royal Challengers Bangalore'}
