# RAG Langchain PDF Example - Council Financial Plans
This example loads one or more PDF documents, splits the contents into chunks, loads these into a vector store, then uses a retriever 
so that we can ask natural language questions about them.

This uses LangChain, a popular package for chaining all these operations together on top of an underlying Generative AI model, in this case OpenAI.

In [None]:
import os

# Load the .env file so that the variables defined in it can be used as
# environment variables by the rest of the notebook.
# NOTE(review): presumably this is where the OpenAI API key comes from - confirm.
from dotenv import load_dotenv
load_dotenv() # load the .env file

In [3]:
import textwrap

These are the imports that are required for LangChain to work.

In [None]:
from langchain import hub
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


Get the PDF file(s) that we want to analyse.

In [None]:
# Resolve the PDF folder (relative to the current working directory) to an
# absolute path, then echo it as the cell output.
relative_pdf_folder = "./pdf/"
pdf_dir = os.path.abspath(relative_pdf_folder)
pdf_dir

Load all PDFs in the directory

In [None]:
# Import the loaders from langchain_community, the same package this notebook
# already uses for its other document loader, instead of the legacy
# `langchain.document_loaders` path.
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Match every *.pdf file under pdf_dir (the "**/" part of the glob also covers
# sub-folders) and parse each match with PyPDFLoader.
directory_loader = DirectoryLoader(pdf_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = directory_loader.load()

# Preview the first three loaded documents as the cell output.
documents[:3]

Use the ChatOpenAI class to create a language model

In [None]:
from langchain_openai import ChatOpenAI

# Chat model that generates the final answer at the end of the RAG chain.
chat_model_name = "gpt-4o-mini"
llm = ChatOpenAI(model=chat_model_name)

# Echo the configured model object as the cell output.
llm

Split the text into smaller chunks based on sentences or characters

In [None]:
# Import from the standalone langchain_text_splitters package, matching the
# import already used at the top of this notebook, rather than the legacy
# `langchain.text_splitter` path.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 1000 units (characters,
# with the splitter's default length function), overlapping by 100 so that
# text cut at a chunk boundary still appears intact in a neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

# Preview the first three chunks as the cell output.
chunks[:3]

In [None]:
# Embed every chunk with OpenAI embeddings and index the results in a Chroma
# vector store; `chunks` must already have been produced by the text splitter.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=chunks, embedding=embedding_model)

# Echo the vector store object as the cell output.
vectorstore

Create the RAG chain

In [None]:
# Expose the vector store through the retriever interface so it can be piped
# into the chain below.
retriever = vectorstore.as_retriever()
# Pull the prompt published as "rlm/rag-prompt" through langchain's hub module.
# NOTE(review): presumably this prompt expects "context" and "question" inputs,
# as those are the keys the chain supplies - confirm against the hub entry.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` of every document, in order, separated by one
        blank line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    blank_line = "\n\n"
    return blank_line.join(page_texts)

# Inputs handed to the prompt: the question is passed through unchanged, and
# is also sent to the retriever, whose documents are flattened into one
# context string by format_docs.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build the prompt inputs, fill the prompt, call the chat model,
# then parse the model output into a plain string.
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()

# Echo the composed chain as the cell output.
rag_chain

In [14]:
# Some example prompts for the RAG chain.
# The council's name is spelled "Middlesbrough" (no second "o"); the earlier
# "Middlesborough" spelling is corrected in both prompts.
# NOTE(review): presumably the PDFs use the official spelling - confirm.
test_prompt1 = "Summarise Middlesbrough council financial plan in two paragraphs?"
test_prompt2 = "In what ways are the financial plans of Middlesbrough council and Derbyshire Dales similar?"

In [None]:
# Ask the question through the RAG chain; the chain ends in StrOutputParser,
# so the response is a plain string.
response = rag_chain.invoke(test_prompt2)

# Wrap to 120 columns for readability. textwrap.fill() applied to the whole
# string would replace newlines with spaces and merge a multi-paragraph answer
# into one block, so each line is wrapped separately and the original line
# breaks (including blank lines between paragraphs) are kept.
wrapped_response = "\n".join(
    textwrap.fill(line, width=120) for line in response.splitlines()
)
print(wrapped_response)

Some old code below here - ignore

This code is useful for loading a single PDF.  However, we are loading all PDF files in a folder, so this is commented out.

In [None]:
#from langchain.document_loaders import PyPDFLoader
# pdf_file = pdf_dir + "/DAX Resources.pdf"
# pdf_file
# # Load a PDF file
# loader = PyPDFLoader(pdf_file)

# # Load pages of the document into text chunks
# documents = loader.load()

# documents

In [None]:
# from langchain.chains.summarize import load_summarize_chain
# from langchain.llms import OpenAI

# # Load an OpenAI LLM
# llm = OpenAI(model="gpt-4", api_key=os.getenv("OPENAI_API_KEY"))

# # Use a summarization chain
# summarize_chain = load_summarize_chain(llm)

# # Summarize the chunks of the document
# summaries = summarize_chain.run(chunks)
