# Retrievers

In [7]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings

In [4]:
# Load the raw sample files as plain text, one Document per file.
# The encoding is passed explicitly: without it TextLoader falls back to the
# platform default, which breaks on non-UTF-8 locales.
# NOTE(review): assumes both sample files are UTF-8 — confirm.
# NOTE(review): TextLoader keeps HTML markup and CSS verbatim, so they are
# indexed too (the search output further down returns a CSS rule); an
# HTML-aware loader may be a better fit — verify against the intended demo.
loaders = [
    TextLoader("data/sample.html", encoding="utf-8"),
    TextLoader("data/sample.md", encoding="utf-8"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [8]:
# Parent documents are kept in an in-memory docstore.
store = InMemoryStore()
# Child chunks (chunk_size=400) are what gets embedded and indexed.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
vectorstore = Chroma(
    collection_name="full_documents",
    embedding_function=OllamaEmbeddings(model="snowflake-arctic-embed:33m"),
)
# No parent_splitter is given here, so each loaded document is stored whole.
retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [9]:
# Index the documents; ids is left at its default (None).
retriever.add_documents(docs)

In [10]:
# Show the keys currently held in the parent docstore.
[*store.yield_keys()]


['752f8db9-de94-4a25-b593-88a9af801b6a',
 '3d7854bf-2545-441c-9426-934c1b225352']

In [11]:
# Query the vector store directly: this searches the indexed child chunks.
# NOTE(review): the query text looks unrelated to the sample files — confirm.
sub_docs = vectorstore.similarity_search(query="justice breyer")


In [12]:
# Print the text of the top-ranked child chunk.
best_chunk = sub_docs[0]
print(best_chunk.page_content)



#main-header ul {
            list-style: none;
            padding: 0;
        }

        #main-header li {
            display: inline;
            margin: 0 10px;
        }


In [14]:
# Query through the retriever, which returns documents from the parent
# docstore rather than the small child chunks.
retrieved_docs = retriever.invoke(input="justice breyer")
first_parent = retrieved_docs[0]
# Length of the first returned document (displayed as the cell result).
len(first_parent.page_content)



4662

## Retrieving larger chunks

In [16]:
# Fresh docstore for the parent chunks of this section.
store = InMemoryStore()
# Child chunks are indexed in a collection of their own ("split_parents").
vectorstore = Chroma(
    collection_name="split_parents",
    embedding_function=OllamaEmbeddings(model="snowflake-arctic-embed:33m"),
)
# Two splitters: one producing the parent chunks (chunk_size=2000) and one
# producing the child chunks (chunk_size=400). The child size should stay
# smaller than the parent size.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

In [17]:
# Same retriever as before, now with a parent_splitter so that parent chunks
# (instead of whole documents) go into the docstore.
retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vectorstore,
)

In [19]:
# Index the documents through the two-level splitter setup.
retriever.add_documents(docs)
# Count the entries now held in the parent docstore.
sum(1 for _ in store.yield_keys())




8

In [20]:
# Search the child-chunk index directly and print the top-ranked chunk.
sub_docs = vectorstore.similarity_search(query="justice breyer")
best_chunk = sub_docs[0]
print(best_chunk.page_content)



#main-header ul {
            list-style: none;
            padding: 0;
        }

        #main-header li {
            display: inline;
            margin: 0 10px;
        }


In [22]:
# Retrieve through the retriever and print the text of the first result.
retrieved_docs = retriever.invoke("justice breyer")

# Only the text is printed: a bare len(...) expression placed before the
# print is evaluated and discarded (it is not the cell's last statement),
# so it is not kept here.
print(retrieved_docs[0].page_content)




<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>HTML Element Selection Example</title>
    <style>
        /* General Styles */
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            line-height: 1.6;
            color: #333;
            background-color: #f9f9f9;
        }

        h1, h2 {
            color: #2c3e50;
        }

        p {
            margin: 10px 0;
        }

        a {
            color: #3498db;
            text-decoration: none;
        }

        a:hover {
            text-decoration: underline;
        }

        /* Header */
        #main-header {
            background-color: #b7c2ce;
            color: white;
            padding: 20px;
            text-align: center;
            border-radius: 5px;
        }

        #main-header ul {
            list-style: none;
            padding: 0;
        }

        #main