In [None]:
# Notebook dependencies, installed into the kernel's environment via %pip.
# LangChain itself, the community integrations (document loaders) and the Chroma vector-store integration:
%pip install --quiet --upgrade langchain langchain-community langchain-chroma
# OpenAI chat-model and embeddings integration:
%pip install -qU langchain-openai
# Python bindings for jq; presumably needed by JSONLoader to evaluate its jq_schema — confirm.
%pip install jq

In [None]:
# Workaround to get Chroma working in GitHub Codespaces: install pysqlite3-binary
# so it can stand in for the standard-library sqlite3 module (see the manual step below).
%pip install pysqlite3-binary

# Manual step: paste the 3 lines below into chromadb's __init__.py, located at
# /workspaces/codespaces-jupyter/.venv/lib/python3.12/site-packages/chromadb/__init__.py
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [3]:
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Merge variables from a local .env file (when one exists) into the process environment.
load_dotenv()

# The key is expected under the lowercase name "open_ai_api_key"; fall back to the
# standard OPENAI_API_KEY so an environment that already exports it keeps working.
openai_api_key = os.getenv("open_ai_api_key") or os.getenv("OPENAI_API_KEY")

# Fail early with an actionable message: assigning None into os.environ would
# otherwise raise an opaque "TypeError: str expected, not NoneType".
if not openai_api_key:
    raise RuntimeError(
        "OpenAI API key not found: define 'open_ai_api_key' (or 'OPENAI_API_KEY') "
        "in the environment or in a .env file before running this cell."
    )

# Expose the key under the name the OpenAI integrations read by default.
os.environ["OPENAI_API_KEY"] = openai_api_key

# Chat model shared by the cells below.
llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import JSONLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the JSON records, split them into overlapping chunks and index the chunks in Chroma.
file_path='data/sampicaQuest_data.json'
# jq_schema '.[]' selects every element of the file's top-level value (presumably a
# JSON array of records — confirm); each record's 'entry_parsed' field becomes the document text.
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.[]',
    content_key='entry_parsed'
)

docs = loader.load()

# Split into chunks of at most 1000 with an overlap of 200 between neighbours
# (measured in characters, assuming the splitter's default length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# No persist_directory is passed, so nothing is saved to disk and the chunks are
# embedded through OpenAI again each time this cell runs.
# NOTE(review): re-running this cell in the same kernel may add duplicate chunks to
# Chroma's default collection — confirm.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Expose the index as a retriever and pull the "rlm/rag-prompt" prompt from the LangChain hub.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Concatenate the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# Inputs handed to the prompt: "context" is the retrieved documents joined by
# format_docs, "question" is the caller's input passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build the prompt inputs, render the prompt, call the chat model,
# then reduce the model's reply to a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Example query; the answer is displayed as the cell's output.
sample_question = "Who is Fowler Merrybeard and give me some information about him."
rag_chain.invoke(sample_question)