# Multi-Vector Retrievers

In [1]:
from langchain.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load both markdown sources into one flat list of documents.
loaders = [
    TextLoader(path)
    for path in ("data/langchain.md", "data/langchain2.md")
]
raw_docs = [page for loader in loaders for page in loader.load()]

# Split the loaded text into the large "parent" documents (chunk_size=10000).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(raw_docs)

# Vector index that will hold the embedded child chunks of `docs`.
embeddings = OllamaEmbeddings(model='snowflake-arctic-embed:33m')
vectorstore = Chroma(collection_name="full_documents", embedding_function=embeddings)

## Smaller chunks

In [2]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever

# Metadata key under which each child chunk records the id of its parent.
id_key = "doc_id"
# In-memory byte store that keeps the parent documents.
store = InMemoryByteStore()

# Nothing is indexed yet; documents are added in the cells below.
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)

# One random UUID per parent document, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]

In [3]:
# Splitter that produces the small chunks (chunk_size=400) which get embedded.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    children = child_text_splitter.split_documents([parent_doc])
    # Tag every child with its parent's id so the two can be linked later.
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs.extend(children)

In [4]:
# Embed and index the small chunks in the vector store.
retriever.vectorstore.add_documents(sub_docs)
# Store each parent document under its id so it can be fetched by that id.
parent_pairs = [(doc_id, parent) for doc_id, parent in zip(doc_ids, docs)]
retriever.docstore.mset(parent_pairs)

In [5]:
# Query the vector store directly: the hits are the small child chunks.
child_hits = retriever.vectorstore.similarity_search("LangChain")
child_hits[0]

Document(metadata={'doc_id': '621845ba-bbca-4f97-a170-f4c2d87bcb97', 'source': 'data/langchain2.md'}, page_content='This and other tutorials are perhaps most conveniently run in a Jupyter notebook. See [here](https://jupyter.org/install) for instructions on how to install.\n\n### Installation[\u200b](https://python.langchain.com/docs/tutorials/summarization/#installation "Direct link to Installation")\n\nTo install LangChain run:\n\n*   Pip\n*   Conda')

In [6]:
# Query through the retriever and measure the length of the first result's text.
parent_hits = retriever.invoke("LangChain")
len(parent_hits[0].page_content)



9931

In [7]:
from langchain.retrievers.multi_vector import SearchType

# Switch the retriever's search type to MMR, then repeat the same query.
retriever.search_type = SearchType.mmr

mmr_hits = retriever.invoke("LangChain")
len(mmr_hits[0].page_content)

9931

## Associating summaries with a document for retrieval

In [8]:
import getpass
import os
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
# Chat model served by a local Ollama instance.
# OpenAI alternative: llm = ChatOpenAI(model="gpt-4o-mini")
llm = ChatOllama(model='llama3.2:1b')



In [9]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Summarization pipeline: take a Document's text, fill the prompt, call the
# model, and parse the reply with StrOutputParser.
summary_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document:\n\n{doc}"
)
chain = (
    {"doc": lambda document: document.page_content}
    | summary_prompt
    | llm
    | StrOutputParser()
)

In [10]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# `chain` is already built by the identical definition in the previous cell
# (In [9]). The verbatim duplicate that used to live here has been dropped so
# the summarization pipeline is defined in exactly one place.

In [11]:
# Run the summarization chain over every parent document, with the batch
# concurrency capped at five.
batch_config = {"max_concurrency": 5}
summaries = chain.batch(docs, batch_config)


In [12]:
# The vectorstore to use to index the summaries.
# Embed with the same local Ollama model as the "full_documents" collection:
# the rest of the notebook runs on Ollama, and no OpenAI API key is set up in
# the cells shown, so OpenAIEmbeddings() would fail on a fresh kernel.
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OllamaEmbeddings(model='snowflake-arctic-embed:33m'),
)
# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)
# Fresh ids for this retriever, one per parent document, in `docs` order.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# One summary Document per parent, tagged with that parent's id.
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

# Index the summaries, then store the full parents under the same ids.
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [13]:
# # Optionally, the original chunks can also be added to the vectorstore:
# for i, doc in enumerate(docs):
#     doc.metadata[id_key] = doc_ids[i]
# retriever.vectorstore.add_documents(docs)

In [None]:
# Searching the vector store directly returns the indexed summaries.
sub_docs = retriever.vectorstore.similarity_search("LangChain")

# Show the top hit as the cell's last expression (repr display) rather than
# print(), matching the earlier similarity_search cell.
sub_docs[0]

Document(metadata={'doc_id': 'ed4f36d0-1118-4dec-b0fc-549844c8418f'}, page_content='LangChain is an open-source AI development environment that provides a wide range of features and tools for building, training, and deploying machine learning models. It includes built-in support for natural language processing (NLP) tasks such as text classification, sentiment analysis, and document similarity.\n\nBelow are some key points about the code snippet provided:\n\n1.  **Retrieval using raw input query**: The original code snippet is using the `raw_input_query` to retrieve documents from a vector store.\n2.  **Query analysis**: The updated code snippet includes a line of code that adds metadata filters to the documents in the vector store based on their sections.\n\nHere\'s a more detailed breakdown of how this might work:\n\n*   When building an NLP model, you typically need to preprocess and transform your data into a format suitable for training the model. In LangChain, this is often done 

In [15]:
# Query through the retriever and measure the first returned document's text.
retrieved_docs = retriever.invoke("LangChain")

top_doc = retrieved_docs[0]
len(top_doc.page_content)

9960