In [None]:
import os
from bs4 import SoupStrainer, BeautifulSoup
import json
from langchain import hub
from langchain_chroma import Chroma
from langchain.docstore.document import Document
from langchain_community.document_loaders import WebBaseLoader,  WikipediaLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

from dotenv import load_dotenv
# Load environment variables via python-dotenv before any API client is built.
# NOTE(review): presumably this supplies the OpenAI credentials used by the
# embedding and chat models in later cells — confirm against the .env file.
load_dotenv()

# Marker output showing that the import cell ran to completion.
print("imported")

In [None]:
# Read the source records from documents.json. The encoding is given explicitly
# so decoding does not depend on the platform's default locale encoding.
with open('documents.json', 'r', encoding='utf-8') as f:
    loaded_docs = json.load(f)

# Report a summary instead of dumping every record's full text into the output.
print(f"Loaded {len(loaded_docs)} source documents")

# Wrap each record in a Document. Every record must provide "content" and
# "title" keys; the title is kept as metadata on the resulting chunks.
json_docs = [
    Document(page_content=doc["content"], metadata={"title": doc["title"]})
    for doc in loaded_docs
]

# Split the documents into overlapping chunks (sizes are in characters for this
# splitter's default length function — TODO confirm if a custom one is added).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1300, chunk_overlap=200, add_start_index=True)
json_documents = text_splitter.split_documents(json_docs)

# Quick sanity check: chunk count, then the size and text of the first chunk.
print(len(json_documents))
if json_documents:
    # Guarded so an empty corpus reports "0" above instead of raising IndexError.
    print(len(json_documents[0].page_content))
    print(json_documents[0].page_content)

In [None]:
class SingletonChroma:
    """Keeps one shared Chroma vector store at class level.

    The store lives in ``SingletonChroma._instance``. That attribute holds the
    object returned by ``Chroma.from_documents`` rather than a
    ``SingletonChroma`` object; the constructor exists only to build and
    register the store.
    """

    # Shared vector store; stays None until the first successful construction.
    _instance = None

    @staticmethod
    def get_instance(documents=None):
        """Return the shared vector store, building it on first use.

        Args:
            documents: Documents to index. Required while no store exists;
                ignored once a store has been registered.

        Returns:
            The object held in ``SingletonChroma._instance``.

        Raises:
            ValueError: If no store exists yet and ``documents`` is None.
        """
        existing = SingletonChroma._instance
        if existing is not None:
            return existing
        if documents is None:
            raise ValueError("Documents must be provided for the first initialization.")
        # Constructing the class registers the new store as a side effect.
        SingletonChroma(documents)
        return SingletonChroma._instance

    def __init__(self, documents):
        """Build the vector store from ``documents`` and register it on the class.

        Raises:
            Exception: If a store has already been registered.
        """
        if SingletonChroma._instance is not None:
            raise Exception("This class is a singleton!")
        embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
        store = Chroma.from_documents(documents=documents, embedding=embedding_model)
        SingletonChroma._instance = store

    
# Build the shared vector store from the chunked documents, or reuse it if it
# already exists.
singleton_chroma = SingletonChroma.get_instance(documents=json_documents)

# Similarity-search retriever configured with k=6 results per query.
retriever = singleton_chroma.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

In [None]:
# Same calls as the previous cell. get_instance returns the already-registered
# store when one exists, so the documents argument is ignored in that case.
singleton_chroma = SingletonChroma.get_instance(documents=json_documents)

# Rebind the retriever with the same similarity search settings (k=6).
retriever = singleton_chroma.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

In [None]:
# Retrieval spot check for a history-focused query.
retrieved_docs = retriever.invoke("Anne Boleyn's relationship with Henry VIII")

# Print the metadata and text of every hit. Iterating over the result, rather
# than indexing positions 0..5, avoids an IndexError when the retriever
# returns fewer than six chunks.
for retrieved_doc in retrieved_docs:
    print(retrieved_doc.metadata)
    print(retrieved_doc.page_content)

In [None]:
# Retrieval spot check for a query about the musical.
retrieved_docs = retriever.invoke("Who was Anne Boleyn in Six the musical?")

# Print the metadata and text of every hit. Iterating over the result, rather
# than indexing positions 0..5, avoids an IndexError when the retriever
# returns fewer than six chunks.
for retrieved_doc in retrieved_docs:
    print(retrieved_doc.metadata)
    print(retrieved_doc.page_content)

In [None]:
# Chat model that produces the final answer in the RAG chain.
llm = ChatOpenAI(model_name="gpt-4o-mini")
# Prompt fetched from the LangChain hub by its "rlm/rag-prompt" identifier.
# NOTE(review): presumably it expects "context" and "question" inputs, matching
# the dict fed into it by rag_chain — confirm against the hub entry.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Concatenate the text of ``docs`` into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line;
        an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    chunks = [item.page_content for item in docs]
    return separator.join(chunks)

# Assemble the RAG pipeline by piping stages together with `|`:
#   1. the input dict routes the incoming value to the retriever, whose hits
#      are joined by format_docs into "context", and forwards the same value
#      as "question" via RunnablePassthrough;
#   2. the prompt receives that dict;
#   3. the chat model receives the prompt's output;
#   4. StrOutputParser post-processes the model response
#      (presumably into a plain string — confirm).
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the full pipeline for a sample question; as the cell's last expression,
# the result is shown as the cell output.
rag_chain.invoke("Who was Anne Boleyn in the context of Six the Musical?")