# RAG

## Document Loader

In [None]:
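# Install the packages this notebook imports (uncomment the lines you need)
# %pip install langchain_community
# %pip install unstructured
# %pip install pypdf langchain-text-splitters langchain-openai langchain-chroma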

In [None]:
# 1. PDF document loader

from langchain_community.document_loaders import PyPDFLoader

# Placeholder path: point this at a local copy of the PDF before running.
loader = PyPDFLoader("path/to/file/attention_is_all_you_need.pdf")
data = loader.load()  # Parse the PDF into a list of Document objects
print(data[0])  # First loaded Document (text plus metadata)
# The loader object has no `.text` attribute; the extracted text lives on each Document.
print(data[0].page_content)


In [None]:
# 2. CSV document loader

from langchain_community.document_loaders.csv_loader import CSVLoader

# Point the loader at the CSV file, naming the argument explicitly.
loader = CSVLoader(file_path="fifa_countries_audience.csv")

# Read the CSV into memory and show the first resulting document.
data = loader.load()
print(data[0])

In [None]:
# 3. HTML document loader

from langchain_community.document_loaders import UnstructuredHTMLLoader

# The loader is given a local file path here; the earlier experiment that passed a
# Wikipedia URL is kept below for reference only.
# loader = UnstructuredHTMLLoader("https://en.wikipedia.org/wiki/Deep_learning")
# File name now carries the .html extension, matching the splitting section further down.
loader = UnstructuredHTMLLoader("white_house_executive_order_nov_2023.html")

data = loader.load()  # Read the HTML file into a list of Document objects
print(data[0])  # Print the first document

# Print the first document's metadata
print(data[0].metadata)

## Document Splitting

In [2]:
# Sample text for the splitting demos, plus the splitter settings reused by later cells.
chunk_size, chunk_overlap = 24, 3

# Two sentences separated by a newline; the first line ends with a trailing space.
quote = (
    "One machine can do the work of fifty ordinary humans. \n"
    "But no machine can do the work of one extraordinary human."
)

print(quote)
print(len(quote))  # total number of characters in the quote

One machine can do the work of fifty ordinary humans. 
But no machine can do the work of one extraordinary human.
113


In [None]:
# Split the quote with CharacterTextSplitter
'''This method splits based on the separator first, then evaluates chunk_size and chunk_overlap to check if it's satisfied.'''
from langchain_text_splitters import CharacterTextSplitter

# Split on full stops, using the chunk settings defined alongside the quote.
ct_splitter = CharacterTextSplitter(separator=".", chunk_size=chunk_size, chunk_overlap=chunk_overlap)

docs = ct_splitter.split_text(quote)
print(docs)
print(list(map(len, docs)))  # character count of each chunk

Created a chunk of size 52, which is longer than the specified 24


['One machine can do the work of fifty ordinary humans', 'But no machine can do the work of one extraordinary human']
[52, 57]


We have a problem: each of these chunks contains more characters than our specified `chunk_size`. `CharacterTextSplitter` splits on the separator in an attempt to make chunks smaller than `chunk_size`, but in this case, splitting on the separator was unable to return chunks below our `chunk_size`.

In [None]:
# Recursive Character TextSplitter to split documents
'''splitting the document using each separator in turn, and seeing if these chunks can be combined while remaining under chunk_size.'''
from langchain_text_splitters import RecursiveCharacterTextSplitter

rc_splitter = RecursiveCharacterTextSplitter(
    # Tried in order: blank line, line break, space, then individual characters.
    # The first entry used to be "\n\n," (a comma typed inside the string), which
    # would only match a blank line followed by a literal comma.
    separators = ["\n\n", "\n", " ", ""],
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)

docs = rc_splitter.split_text(quote)
print(docs)
print([len(doc) for doc in docs])  # character count of each chunk

Created a chunk of size 52, which is longer than the specified 24


['One machine can do the work of fifty ordinary humans', 'But no machine can do the work of one extraordinary human']
[52, 57]


This recursive implementation may work better on larger documents.

In [None]:
# Recursive Character TextSplitter with HTML
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = UnstructuredHTMLLoader("white_house_executive_order_nov_2023.html")  # Local HTML file
data = loader.load()  # Read the HTML file into a list of Document objects

# chunk_size and chunk_overlap are the values set in the quote cell earlier in the notebook.
rc_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap,
    separators = ["."]
)

# split_documents is fed the loaded Documents (split_text above was fed a plain string).
docs = rc_splitter.split_documents(data)
print(docs)
# Measure each chunk through its text: calling len() on the chunk object itself fails.
print([len(doc.page_content) for doc in docs])

## Vector Store

In [None]:
# setting up a chroma vector database
import os

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# create an embedding model from openai
# The key is read from the OPENAI_API_KEY environment variable so that no credential is
# written into the notebook; a missing variable fails immediately with a KeyError.
embedding_function = OpenAIEmbeddings(api_key = os.environ["OPENAI_API_KEY"], model = 'text-embedding-3-small')

# create a chroma vector store
# to create a Chroma database from a set of documents, call the .from_documents() method on the Chroma class, passing the documents and embedding function to use.
vectorstore = Chroma.from_documents(
    docs,  # chunks produced in the document-splitting section
    embedding = embedding_function,
    persist_directory = "path/to/directory"  # placeholder: replace with a real directory
)

# integrate the database with other LangChain components
retriever = vectorstore.as_retriever(
    search_type = "similarity", # perform similarity search
    search_kwargs = {"k":2} # return the top 2 most similar documents
)

In [None]:
# Building a prompt template
from langchain_core.prompts import ChatPromptTemplate

# {guidelines} and {copy} are the placeholders filled in when the chain runs.
# The "Guidelines:" heading was previously misspelled in the text sent to the model.
message = '''
Review and fix the following TechStack marketing copy with the following guidelines in consideration:

Guidelines:
{guidelines}

Copy:
{copy}

Fixed Copy:
'''

# Wrap the message string in a single human-role chat message.
prompt_template = ChatPromptTemplate.from_messages([("human", message)])

In [None]:
# chaining it all together
import os

from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Chat model that completes the prompt. `llm` was referenced by the chain but never
# created in this notebook; the key comes from the OPENAI_API_KEY environment variable.
llm = ChatOpenAI(model = "gpt-4o-mini", api_key = os.environ["OPENAI_API_KEY"])

# The retriever supplies {guidelines}; RunnablePassthrough() (an instance, not the bare
# class) forwards the invoke() input unchanged into {copy}.
rag_chain = ({"guidelines": retriever, "copy": RunnablePassthrough()}
             | prompt_template
             | llm)

# invoke() needs an input: the marketing copy to review (example text).
rag_chain.invoke("Here at techstack, our users are the best in the world!")

## Testing the chain

In [None]:
# preparing the documents and vector database
import os  # needed for os.getcwd() and os.environ below; not imported elsewhere in the notebook

loader = PyPDFLoader('rag_vs_fine_tuning.pdf')
data = loader.load()

# Split the document using RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(
    # First separator corrected from "\n\n," (comma typed inside the string) to a blank line.
    separators = ["\n\n", "\n", " ", ""],
    chunk_size = 300,
    chunk_overlap = 50
)
docs = splitter.split_documents(data)

# Embed the documents in a persistent Chroma vector database
# The key is read from the OPENAI_API_KEY environment variable rather than a literal.
embedding_function = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"], model='text-embedding-3-small')
vectorstore = Chroma.from_documents(
    docs,
    embedding=embedding_function,
    persist_directory=os.getcwd()  # store the database in the current working directory
)

# Configure the vector store as a retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3}  # return the 3 most similar chunks
)

In [None]:
# Prompt for answering a question from retrieved context
# {context} and {question} are the placeholders filled in when the chain runs.
message = '''
Answer the following question using the context provided:

Context:
{context}

Question:
{question}

Answer:
'''

# Wrap the message string in a single human-role chat message.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("human", message),
    ]
)

In [None]:
# creating a RAG chain
import os  # needed for os.getcwd() and os.environ below

from langchain_openai import ChatOpenAI

# NOTE(review): this repeats the vector-store setup from the preparation cell. Running both
# against the same persist_directory presumably stores the chunks twice — confirm, and drop
# one of the two if so.
vectorstore = Chroma.from_documents(
    docs,
    # Key read from the OPENAI_API_KEY environment variable rather than a literal.
    embedding=OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"], model='text-embedding-3-small'),
    persist_directory=os.getcwd()
)

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}  # return the 3 most similar chunks
)

# Chat model that completes the prompt; `llm` was used by the chain but never created
# in this notebook.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.environ["OPENAI_API_KEY"])

# Create a chain to link retriever, prompt_template, and llm:
# the retriever supplies {context}, and the invoke() input is passed through as {question}.
rag_chain = ({"context": retriever, "question": RunnablePassthrough()}
            | prompt_template
            | llm)

# Invoke the chain
response = rag_chain.invoke("Which popular LLMs were considered in the paper?")
print(response.content)