### Vector Store Retriever

In [None]:
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS

# Sample corpus: two sentences about programming languages, three about animals
docs = [
    Document(page_content="Python is a programming language", metadata={"id": 1}),
    Document(page_content="Java is also a programming language", metadata={"id": 2}),
    Document(page_content="Cats sleeps often during the day", metadata={"id": 3}),
    Document(page_content="Dogs barks at strangers", metadata={"id": 4}),
    Document(page_content="Birds fly at the sky", metadata={"id": 5}),
]

# Embedding model used for both the documents and the query
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

# Embed the documents and index them in a FAISS vector store
vector_store = FAISS.from_documents(docs, embeddings)

# Create retriever: the default search type is plain similarity search, so this
# returns the same top-2 documents as vector_store.similarity_search(query, k=2)
# while exposing the standard retriever interface (.invoke).
retriever = vector_store.as_retriever(search_kwargs={"k": 2})

# Query
query = "Which animals are domestic pets?"
out_docs = retriever.invoke(query)

print("Retriever results:")

for doc in out_docs:
    print(f"ID: {doc.metadata.get('id')}, Content: {doc.page_content}")


##### Using MMR (Maximal Marginal Relevance)

In [None]:
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS

# Sample corpus; ids are assigned 1..5 in listing order
docs = [
    Document(page_content=text, metadata={"id": doc_id})
    for doc_id, text in enumerate(
        [
            "Python is a programming language",
            "Java is also a programming language",
            "Cats sleeps often during the day",
            "Dogs barks at strangers",
            "Birds fly at the sky",
        ],
        start=1,
    )
]

# Embedding model used for both the documents and the query
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

# Embed the documents and index them in a FAISS vector store
vector_store = FAISS.from_documents(docs, embeddings)

# MMR retriever: fetch_k candidates are considered and k of them are returned;
# lambda_mult is the relevance/diversity trade-off knob of the MMR search.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "fetch_k": 5, "lambda_mult": 0.5},
)

# Run the query through the retriever
query = "Which animals are domestic pets?"
out_docs = retriever.invoke(query)

print("Retriever results:")

for doc in out_docs:
    print(f"ID: {doc.metadata.get('id')}, Content: {doc.page_content}")


### BM25 Retriever

In [None]:
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever

# Sample corpus; the trailing comment on each line lists the query terms that
# the document contains (compared case-insensitively).
docs = [
    Document(page_content="The sky is blue during the day", metadata={"id": 1}), # sky, blue
    Document(page_content="At night stars light up the sky", metadata={"id": 2}), # sky
    Document(page_content="Blue whales are the largest animal on Earth", metadata={"id": 3}), # blue, animal
    Document(page_content="Birds can fly high up in the sky", metadata={"id": 4}), # sky
    Document(page_content="Deep sea creatures live in the darkness", metadata={"id": 5}), # no matching
]


def lowercase_tokenize(text):
    """Lowercase the text and split it on whitespace.

    BM25Retriever's default tokenizer is a plain, case-sensitive whitespace
    split, so "Blue" in document 3 would never match the query term "blue".
    The same function is applied to the documents and to the query.
    """
    return text.lower().split()


# Build BM25 retriever returning the 3 best-scoring documents
bm_25_retriever = BM25Retriever.from_documents(
    docs, k=3, preprocess_func=lowercase_tokenize
)

# Run query
query = "sky blue animal"
out_docs = bm_25_retriever.invoke(query)

print("Retriever results:")

for doc in out_docs:
    print(f"ID: {doc.metadata.get('id')}, Content: {doc.page_content}")

### Ensemble / Hybrid Retriever

In [None]:
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever

# Sample corpus; ids are assigned 1..5 in listing order
docs = [
    Document(page_content=text, metadata={"id": doc_id})
    for doc_id, text in enumerate(
        [
            "Python is a programming language",
            "Java is also a programming language",
            "Cats sleeps often during the day",
            "Dogs barks at strangers",
            "Birds fly at the sky",
        ],
        start=1,
    )
]

# Embedding model used for both the documents and the query
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

# Embed the documents and index them in a FAISS vector store
vector_store = FAISS.from_documents(docs, embeddings)

# Dense (embedding-based) retriever returning the top 2 documents
vector_retriever = vector_store.as_retriever(search_kwargs={"k": 2})

# Sparse (keyword-based) BM25 retriever returning the top 2 documents
bm25_retriever = BM25Retriever.from_documents(docs, k=2)

# Hybrid retriever: one weight per retriever, listed in the same order as
# `retrievers` (BM25 first, vector second); both contribute equally here.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever, vector_retriever],
)

# Run the query through the combined retriever
query = "Which animals are domestic pets?"
out_docs = ensemble_retriever.invoke(query)

print("Ensemble Retriever results:")
for doc in out_docs:
    print(f"ID: {doc.metadata.get('id')}, Content: {doc.page_content}")

### Contextual Compression Retriever

In [None]:
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as "Document N:" followed by a blank line and the
    document's ``page_content``; consecutive entries are separated by a rule
    of 100 dashes on its own line.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    separator = f"\n{'-' * 100}\n"
    sections = []
    for i, doc in enumerate(docs):
        sections.append(f"Document {i + 1}:\n\n" + doc.page_content)
    print(separator.join(sections))

In [5]:
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_classic.retrievers import ContextualCompressionRetriever
from langchain_classic.retrievers.document_compressors import EmbeddingsFilter

# Sample corpus: four sentences about programming languages plus one unrelated
# sentence about cooking; ids are assigned 1..5 in listing order
docs = [
    Document(page_content=text, metadata={"id": doc_id})
    for doc_id, text in enumerate(
        [
            "Python is a programming language used for web development and data science",
            "Javascript runs in the browser, useful for interactive websites",
            "C+ is a programming language used for system programming and games",
            "Rust aims to be fast and secure, used in system software",
            "Cooking a recipe often receives precise measurements and timing",
        ],
        start=1,
    )
]

# Embed the documents, index them in FAISS, and expose a top-3 retriever
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")
vs = FAISS.from_documents(docs, embedding=embeddings)
base_retriever = vs.as_retriever(search_kwargs={"k": 3})

# Post-filter for the retrieved documents, configured with the same embedding
# model and a similarity threshold of 0.5
compressor = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.5)

# Chain the base retriever with the filter
cc_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever,
)

query = "Which programming languages are used for web development?"
out_docs = cc_retriever.invoke(query)

pretty_print_docs(out_docs)

Document 1:

Python is a programming language used for web development and data science
----------------------------------------------------------------------------------------------------
Document 2:

C+ is a programming language used for system programming and games
----------------------------------------------------------------------------------------------------
Document 3:

Javascript runs in the browser, useful for interactive websites


### MultiQuery Retriever

In [7]:
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_classic.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# Sample corpus: four sentences about programming languages plus one unrelated
# sentence about cooking; ids are assigned 1..5 in listing order
docs = [
    Document(page_content=text, metadata={"id": doc_id})
    for doc_id, text in enumerate(
        [
            "Python is a programming language used for web development and data science",
            "Javascript runs in the browser, useful for interactive websites",
            "C+ is a programming language used for system programming and games",
            "Rust aims to be fast and secure, used in system software",
            "Cooking a recipe often receives precise measurements and timing",
        ],
        start=1,
    )
]

# Embed the documents, index them in FAISS, and expose a top-3 retriever
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")
vs = FAISS.from_documents(docs, embedding=embeddings)
base_retriever = vs.as_retriever(search_kwargs={"k": 3})

# Chat model handed to the multi-query retriever
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.7)

# Multi-query retriever built on the base retriever; include_original=True
# keeps the user's original query in the set of queries that are run.
multi_query = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    include_original=True,
)

query = "Which programming languages are used for web development?"
out_docs = multi_query.invoke(query)

pretty_print_docs(out_docs)

Document 1:

Python is a programming language used for web development and data science
----------------------------------------------------------------------------------------------------
Document 2:

Javascript runs in the browser, useful for interactive websites
----------------------------------------------------------------------------------------------------
Document 3:

C+ is a programming language used for system programming and games
