### Data Ingestion

In [54]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import tiktoken
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [55]:
load_dotenv()

True

In [4]:
file_path = os.path.join(os.getcwd(),"data","sample.pdf")

In [6]:
loader = PyPDFLoader(file_path=file_path)

In [7]:
documents = loader.load()

In [10]:
len(documents) # no. of pages, only extracts text from the file

77

In [11]:
documents[:10]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'd:\\LLMOps_projects\\Document-Portal\\notebook\\data\\sample.pdf', 'total_pages': 77, 'page': 0, 'page_label': '1'}, page_content='Llama 2: Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗ Louis Martin† Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Ko

In [12]:
### This is an experimental value
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
    length_function=len
)

In [13]:
docs = text_splitter.split_documents(documents=documents)

In [14]:
len(docs)

765

In [15]:
docs[0].page_content

'Llama 2: Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗ Louis Martin† Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev'

In [16]:
docs[0].metadata

{'producer': 'pdfTeX-1.40.25',
 'creator': 'LaTeX with hyperref',
 'creationdate': '2023-07-20T00:30:36+00:00',
 'author': '',
 'keywords': '',
 'moddate': '2023-07-20T00:30:36+00:00',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
 'subject': '',
 'title': '',
 'trapped': '/False',
 'source': 'd:\\LLMOps_projects\\Document-Portal\\notebook\\data\\sample.pdf',
 'total_pages': 77,
 'page': 0,
 'page_label': '1'}

In [29]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [30]:
vector_store = FAISS.from_documents(documents=docs,embedding=embedding_model)

In [39]:
### Retrieval Process
relevant_doc = vector_store.similarity_search("What is the abstract of this paper?", k = 6)

In [41]:
relevant_doc

[Document(id='4918ac5b-cb87-47dc-a083-25f61cb11127', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'd:\\LLMOps_projects\\Document-Portal\\notebook\\data\\sample.pdf', 'total_pages': 77, 'page': 39, 'page_label': '40'}, page_content='Dawn Song. The false promise of imitating proprietary llms.arXiv preprint arXiv:2305.15717, 2023.\nUditGupta,MariamElgamal,GageHills,Gu-YeonWei,Hsien-HsinSLee,DavidBrooks,andCarole-JeanWu.\nAct: designing sustainable computer systems with an architectural carbon modeling tool. InProceedings of\nthe 49th Annual International Symposium on Computer Architecture, pages 784–799, 2022a.\nUdit Gupta, Young Guen Kim, Sylvia Lee, Jordan Tse, Hsien-Hsin Sean Lee

In [None]:
retriever = vector_store.as_retriever(search_kwargs={k:})

In [None]:
retriever.invoke()

In [43]:
prompt_template = """
        Answer the question based on the context provided below. 
        If the context does not contain sufficient information, respond with: 
        "I do not have enough information about this."

        Context: {context}

        Question: {question}

        Answer:"""

In [45]:
prompt=PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

In [46]:
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\n        Answer the question based on the context provided below. \n        If the context does not contain sufficient information, respond with: \n        "I do not have enough information about this."\n\n        Context: {context}\n\n        Question: {question}\n\n        Answer:')

In [48]:
parser=StrOutputParser()

In [49]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

In [56]:
llm=ChatGroq(model="deepseek-r1-distill-llama-70b")

In [57]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [58]:
rag_chain.invoke("can you provide a summary of this paper?")

"<think>\nOkay, so I need to figure out how to answer the question based on the provided context. The question is asking for a summary of the paper. Let me read through the context carefully.\n\nThe context starts with a citation by Thomas Scialom and others about a paper on abstractive summarization using discriminative adversarial search. Then there's a paragraph mentioning that the paper contributes a thorough description of their fine-tuning methodology for improving LLM safety. They hope this openness will help the community reproduce and improve these models, leading to more responsible development. They also share new observations from developing Llama 2 and Llama 2-Chat, like tool usage and knowledge organization.\n\nThen there are more citations from other authors, but those seem unrelated to the main paper in question. The last part lists acknowledgments, which probably aren't part of the main content.\n\nSo, the main paper discussed in the context is about their approach to 