In [23]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

True

In [24]:
file_path = r'Pesreport.pdf'

if not os.path.exists(file_path):
    print(f"Error: The file '{file_path}' does not exist.")
elif not os.path.isfile(file_path):
    print(f"Error: '{file_path}' is not a file (it might be a directory or something else).")
else:
    print(f"Success: The file '{file_path}' exists and is a file.")

if os.path.isfile(file_path):
    from langchain_community.document_loaders import PyPDFLoader
    loader = PyPDFLoader(file_path)
    document = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = text_splitter.split_documents(document)
    print(f"Number of chunks: {len(final_documents)}")

Success: The file 'Pesreport.pdf' exists and is a file.
Number of chunks: 219


In [None]:
len(final_documents)

219

In [26]:
### Embedding
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)



In [27]:
import numpy as np
np.array(embedding.embed_query(final_documents[0].page_content))


array([-2.62791421e-02, -4.86245006e-02, -4.91976850e-02,  2.55007558e-02,
       -8.38903412e-02,  8.51731822e-02, -3.87536027e-02, -2.17713192e-02,
       -1.25799730e-01, -2.12396309e-02,  2.53782086e-02, -1.18406355e-01,
        2.70975325e-02, -6.42396584e-02, -3.52003947e-02, -1.40098296e-02,
        1.80908665e-02,  4.38999990e-03,  1.05664141e-01, -6.51316792e-02,
        1.26087619e-02,  7.73497969e-02,  7.64155164e-02, -5.30320872e-03,
       -1.28276693e-02, -3.54953222e-02,  2.87768133e-02,  5.75752556e-03,
       -1.78227518e-02, -1.71144437e-02,  3.02665997e-02,  5.83838522e-02,
        7.05711320e-02,  6.33044615e-02,  6.22752607e-02, -7.40776360e-02,
       -1.55478474e-02,  7.71644786e-02,  8.65241587e-02, -3.78231890e-02,
       -2.84527410e-02, -5.93127757e-02,  1.69618744e-02, -3.17490175e-02,
        2.17909012e-02,  5.08087203e-02, -7.25434395e-04, -4.49001091e-03,
       -2.66485307e-02,  7.63845146e-02,  1.25481386e-03,  7.02266246e-02,
       -1.11619104e-02,  

In [28]:
vectorstore = FAISS.from_documents(
    documents=final_documents[:120],
    embedding=embedding
)

In [34]:
query = "WHAT IS PES REPORT?"
retriever = vectorstore.similarity_search(query, k=3)
print(retriever[0].page_content)

CHAPTER-1 
 
INTRODUCTION 
 
The data collected through any field inquiry is subjected to certain amount of error, that 
normally creeps in due to the error committed by the investigator or the respondent. A massive 
operation like the population census is no exception, where some amount of error is inevitable  
considering the fact that a large number of enumerators and supervisors are engaged in the 
collection of data,  in spite of the best of the intentions and efforts to collect the accurate data. 
Post Enumeration Survey (PES) is a sample survey conducted immediately after the census in 
order to assess the coverage and quality of the census enumeration. A large number of countries 
carry out a Post Enumeration Survey (PES) after the completion of the census to s cientifically 
measure the degree of accuracy. The Post Enumeration Check (PEC), as it used to be called 
earlier and renamed as Post Enumeration Survey (PES) in the 2001 Census, has become an


In [35]:
from langchain_groq import ChatGroq
from langchain.agents import initialize_agent, AgentType

# Step 1: Load Groq LLM
llm = ChatGroq(
    model="gemma2-9b-it", 
    temperature=0.1,
)

response = llm.invoke("What is PES report?")
print(response)


content="PES stands for **Physical Education and Sports**. \n\nA PES report is likely a report card or assessment document related to a student's performance in Physical Education and Sports classes. \n\nHere's what it might include:\n\n* **Academic Performance:** Grades or marks on tests, quizzes, and assignments related to the theoretical aspects of PE, like anatomy, physiology, or sports rules.\n* **Physical Skills:** Evaluation of the student's abilities in various physical activities, such as running, jumping, throwing, catching, and team sports.\n* **Fitness Levels:** Assessment of the student's cardiovascular endurance, muscular strength, flexibility, and body composition.\n* **Sportsmanship:** Evaluation of the student's behavior and attitude towards themselves, teammates, opponents, and officials.\n* **Participation:**  Record of the student's attendance and engagement in class activities and sports events.\n\nThe specific content and format of a PES report will vary depending

In [36]:
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [37]:
prompt=PromptTemplate(template=prompt_template,input_variables=["context","question"])

In [39]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [42]:
query="""Net Difference Rate (NDR) and the Index of Inconsistency for persons enumerated by
age groups"""

In [43]:
# Call the QA chain with our query.
result = retrievalQA.invoke({"query": query})
print(result['result'])

The provided text states:

"The Net Difference Rate (NDR) and the Index of Inconsistency for persons enumerated by age groups and residence are given in Statement 4.2." 

Therefore, the answer to your question can be found in **Statement 4.2**. 



