# Parent Child Retriever

In [3]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_core.stores import InMemoryStore
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# load docs
text_dir = "./Korea info"
files = os.listdir(text_dir)
txt_files = [file for file in files if file.endswith(".txt")]

docs = []
print(files)
for file in txt_files:
    docs.extend(TextLoader(text_dir + "/" + file).load())

# create embedding API and llm
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Connect database
pc = Pinecone(embedding=embedding, api_key="{YOUR_PINECONE_APIKEY}")
pc.create_index(
    name="terry-korea",
    dimension=1536, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)
index = pc.Index("terry-korea")
vectordb = PineconeVectorStore(index=index, embedding=embedding)

# create text splitter
child_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=400,
    chunk_overlap=20,
    length_function=len,
)

# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectordb,
    docstore=store,
    child_splitter=child_splitter,
)
retriever.add_documents(docs, ids=None)

['Korea Transport.txt', 'Korean food.txt', 'Korean tourist destination.txt', 'Korea things todo.txt']


In [4]:
# retrieve chunk
from IPython.display import JSON

query = "What I can do in Korea ?"
sub_docs = vectordb.similarity_search(query)
print(sub_docs)

[Document(id='afa5bc6e-eee6-45e3-bd73-a74979ee73ff', metadata={'doc_id': '00d526b4-ca98-4e32-9ff6-e4ce6b6c33d1', 'source': './Korea info/Korea things todo.txt'}, page_content='18. Learn Korean Culture - Dive into cultural immersion by learning age old handicrafts like pottery making or folk crafts like knotting fabric at camps designed to teach traditional practices at locales like Korean Folk Villages with talented elder masters imparting insightful customs through these time treasured arts.'), Document(id='a24a570c-7cf6-4df9-b08f-2e0605cb3272', metadata={'doc_id': 'ca5b6891-3fdd-41ff-87f3-eeddb0045ee1', 'source': './Korea info/Korea Transport.txt'}, page_content="I'm happy to provide any other public transportation information to help visitors get around South Korea's amazing sites with ease and enjoyment! Let me know if you need any other recommendations or advice."), Document(id='850b0d55-48b0-448d-aa9f-38bdd783cdbf', metadata={'doc_id': '00d526b4-ca98-4e32-9ff6-e4ce6b6c33d1', 'sou

In [6]:
# Retrieve full document
retrieved_docs = retriever.invoke("query")
JSON(retrieved_docs)

<IPython.core.display.JSON object>

# Large chunk retriever

In [7]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_core.stores import InMemoryStore
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

# load docs
text_dir = "./Korea info"
files = os.listdir(text_dir)
txt_files = [file for file in files if file.endswith(".txt")]

docs = []
print(files)
for file in txt_files:
    docs.extend(TextLoader(text_dir + "/" + file).load())

# create embedding API and llm
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Connect database
pc = Pinecone(embedding=embedding, api_key="{YOUR_PINECONE_APIKEY}")
index = pc.Index("terry-korea")
vectordb = PineconeVectorStore(index=index, embedding=embedding)

# create text splitter
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=400,
    chunk_overlap=20,
    length_function=len,
)

# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectordb,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
retriever.add_documents(docs, ids=None)

['Korea Transport.txt', 'Korean food.txt', 'Korean tourist destination.txt', 'Korea things todo.txt']


In [8]:
query = "What I can do in Korea ?"
sub_docs = vectordb.similarity_search(query)
JSON(sub_docs)

<IPython.core.display.JSON object>

In [30]:
retrieved_docs = retriever.get_relevant_documents("query")
JSON(retrieved_docs)

<IPython.core.display.JSON object>

In [9]:
print(len(retrieved_docs[0].page_content))

14958
