Importing all necessary packages

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain import hub

Setting OpenAI API Key

In [None]:
# Read the OpenAI API key from the local `constants` module, which keeps the
# key itself out of this notebook. `api_key` is passed to the LLM constructor
# in a later cell.
import constants
api_key = constants.openai_api_key

Loading the pdf document using PyPDFLoader from langchain

In [None]:
# Parse the book into one Document per PDF page (text only; embedded images
# are not extracted).
pdf_path = 'crime-and-punishment.pdf'
loader = PyPDFLoader(pdf_path, extract_images=False)
pages = loader.load()

# Keep everything except the first page.
# NOTE(review): presumably the first page is a cover/title page — confirm.
data = list(pages[1:])

# Optional sanity check of the total text size:
# len(' '.join(doc.page_content for doc in data))



Split the loaded document into chunks

In [None]:
# Chunking configuration: chunks of up to ~1500 characters (measured with
# `len`) with 150 characters of overlap between consecutive chunks.
# NOTE(review): the splitter tries separators in the listed order, so "\n \n"
# (listed after "\n") looks unreachable — confirm against the splitter docs
# before reordering, since that would change the chunk boundaries.
splitter_options = {
    "separators": [".", ",", "\n", "\n \n", ";"],
    "chunk_size": 1500,
    "chunk_overlap": 150,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every loaded page into chunk-sized Documents for the summarize chain.
chunks = text_splitter.split_documents(data)
# len(chunks)


Using RefineDocumentsChain from langchain

In [None]:
# Initialize the LLM used for both the initial summary and every refine step.
llm = OpenAI(temperature=0.25, openai_api_key=api_key)

# Prompt for the initial summary: applied to the first chunk only.
prompt_template = """Write a concise summary of the following:
{text}
CONCISE SUMMARY:"""

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

# Prompt for the refinement steps: applied to each subsequent chunk together
# with the summary produced so far ({existing_answer}).
# The fragments below are joined by implicit string concatenation, so every
# fragment that does not end in "\n" must end with a space; otherwise words
# run together in the text sent to the model.
refine_template = (
    "Your job is to produce a final summary.\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "It's eventually going to be a precise 20-page summary of the complete book.\n"
    "We have the opportunity to refine the existing summary "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary. There might be events narrated in the book, take care of that.\n"
    "If the context isn't of great use, do not add anything from it."
)

refine_prompt = PromptTemplate(template=refine_template, input_variables=["existing_answer", "text"])

# Build the "refine" summarize chain: summarize the first chunk with `prompt`,
# then fold in each remaining chunk with `refine_prompt`.
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

# Run the chain over all chunks. `invoke` replaces the deprecated direct call
# `chain({...})` and returns the same result dict.
# NOTE(review): this makes one LLM call per chunk, sequentially — expect a long
# run time and noticeable API cost for a full novel.
result = chain.invoke({"input_documents": chunks})

# Final refined summary. Because return_intermediate_steps=True, the per-chunk
# summaries are expected under result["intermediate_steps"] as well.
output_text = result["output_text"]

A few of the main reasons for using LangChain for this task:

Prompt Management and Optimization: Langchain offers tools for managing and optimizing prompts, which is crucial for generating accurate and relevant summaries.

Data-Augmented Generation: This feature allows LangChain to fetch relevant data from external sources before generating summaries, ensuring that the output is both comprehensive and contextually accurate.

Chains and Agents: Langchain's architecture supports the creation of chains and agents, which can perform a series of LLM calls or actions based on the data retrieved, leading to more dynamic and intelligent summarization processes.