In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## PDF Query Using Langchain

In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken



In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
os.environ["OPENAI_API_KEY"] = "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"
#os.environ["SERPAPI_API_KEY"] = ""

In [None]:
# provide the path of  pdf file/files.
pdfreader = PdfReader('/content/rag-knowledge-doc.pdf')

In [None]:
from typing_extensions import Concatenate
# read text from pdf
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [None]:
raw_text

"# About Pan Card \n \n### What is Pan card? \n \nThe PAN card is a unique ten-digit alphanumeric identification number that is issued by the Income Tax D\nepartment of India to track the tax-related transactions of individuals and entities. The PAN card is manda\ntory for any financial transaction in India, including opening a bank account, buying or selling property, an\nd filing income tax returns. \n \n### Who needs a Pan card? \n \nAll individuals/non-individuals (including foreign citizens/entities) earning taxable income in India must hav\ne a PAN card. \n \n### Types of PAN cards \n \nIn India, two types of PAN cards are available: e-PAN card and physical PAN card. \n \n1. e-PAN card: An e-PAN card is a digitally-signed PAN card issued in electronic format. It contains the sa\nme PAN details as a physical PAN card but is available in a digital format. It can be downloaded online an\nd used as a valid identification document for various purposes. The e-PAN card is usually issued

In [None]:
# We need to split the text using Character Text Split such that it sshould not increse token size
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 200,
    chunk_overlap  = 20,
    length_function = len,
)
texts_char = text_splitter.split_text(raw_text)



In [None]:
len(texts_char)

119

In [None]:
# Download embeddings from OpenAI

embeddings = OpenAIEmbeddings()

query_result = embeddings.embed_query("Hello world")
len(query_result)

1536

In [None]:
document_search = FAISS.from_texts(texts_char, embeddings)

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [None]:
query = "What are the documents required to apply for the new PAN?"
docs = document_search.similarity_search(query, k=3 )
chain.run(input_documents=docs, question=query)

' The documents required to apply for a new PAN card are a copy of the existing PAN card, passport, PIO card issued by the Government of India, and OCI card issued by the Government of India.'

In [None]:
print(len(docs))
print(docs)


3
[Document(page_content='### Documents required to update the details on PAN Card \n \nTo update the information on the PAN card, kindly keep these documents ready. \n \n- Copy of Existing Pan card', metadata={}), Document(page_content='Here are the necessary documents that are supposed to be submitted along with PAN Card Form 49AA \n \n1. Passport \n2. PIO card issued by Government of India \n3. OCI card issued by Government of India', metadata={}), Document(page_content='can apply for a reprint through ABC. We will guide you through the process and help you obtain a new co\npy of your PAN card. \n \n### Documents required for reprinting the lost PAN card', metadata={})]


In [None]:
query = "What is the cost/fees of a PAN card?"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The cost/fees of a PAN card depends on whether you are requesting an e-PAN or physical PAN card. The charges for an e-PAN are INR 2500 and for a physical PAN card are INR 3700. Additionally, there is a cost of INR 2000 to link PAN with Aadhaar. Delivery of a physical PAN card will cost an additional Rs 1200.'

In [276]:
questions_df = pd.read_excel('/content/SampleQuestions.xlsx')
questions_list = questions_df['Question'].tolist()
g_ans = questions_df['Ideal Answer'].tolist()  #the gold (ideal) answers

In [None]:
import time


In [None]:
answers_qa = []
count = 0
for question in questions_list:
    docs = document_search.similarity_search(query)
    answer = chain.run(input_documents=docs, question=question)
    count+=1
    answers_qa.append(answer)
    time.sleep(30)
    print(count)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


In [None]:
answers_qa

[' To apply for a new PAN card, you need to submit a valid identity proof, address proof, and a photograph.',
 ' The cost/fees for a PAN card is INR 2500 for an e-PAN Card and INR 3700 for a physical PAN Card.',
 ' Yes, you can take the delivery of PAN Card at your Indian address, it will cost Rs 1200 extra for physical delivery to your address.',
 ' It usually takes around 10-15 working days to receive the PAN card after applying.',
 ' To apply for a PAN card, you can either go to the official website of the Income Tax Department of India and fill out the PAN card application form or you can go to an authorized PAN card agent for assistance. You will need to provide documents to prove your identity and address, and pay the applicable fee for the PAN card. The fee is INR 2500 for e-PAN and INR 3700 for physical PAN card.',
 ' The process to apply for a PAN Card is to fill up an application form, submit necessary documents, pay the applicable fee and wait for the PAN Card to be issued. 

### Method 2:

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = TextLoader("/content/rag-knowledge.txt")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
texts_rec = text_splitter.split_documents(documents)

In [None]:
print(texts_rec[0])
print(documents[0])
print(len(documents))
print(len(texts_rec))

page_content='# About Pan Card\n\n### What is Pan card?' metadata={'source': '/content/rag-knowledge.txt'}
page_content="# About Pan Card\n\n### What is Pan card?\n\nThe PAN card is a unique ten-digit alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.\n\n### Who needs a Pan card?\n\nAll individuals/non-individuals (including foreign citizens/entities) earning taxable income in India\xa0must have a PAN card.\n\n### Types of PAN cards\n\nIn India, two types of PAN cards are available: e-PAN card and physical PAN card.\n\n1. e-PAN card: An e-PAN card is a digitally-signed PAN card issued in electronic format. It contains the same PAN details as a physical PAN card but is available in a digital format. It can be downloaded online a

In [None]:
from langchain.vectorstores import Chroma
db = Chroma.from_documents(documents, embeddings)

#search_kwargs={"k":2}
retriever = db.as_retriever()
# create a chain to answer questions


In [None]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo")
from langchain.chains import RetrievalQA
#db.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm,retriever=retriever)
question = "What are the documents required to apply for the new PAN?"
result = qa_chain({"query": question})



In [None]:
print(len(db.similarity_search(question)))



1


In [None]:
result

{'query': 'What are the documents required to apply for the new PAN?',
 'result': "The documents required to apply for a new PAN card are as follows:\n\n1. If you have an Aadhaar card:\n   - No other document is required. You can apply for a PAN card using your Aadhaar card in 10 minutes.\n\n2. If you don't have an Aadhaar card:\n   - Passport (any country) or OCI card\n   - Passport-sized photograph\n   - Overseas address proof with zip code (Supporting documents - Indian NRO/NRE account statement, overseas bank statement, or utility bill)\n\nThese documents are required to complete the application process for a new"}

In [None]:
# Build prompt
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
#llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
qa_chain_prompt = RetrievalQA.from_chain_type(llm,
                                       retriever=retriever,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

result = qa_chain_prompt({"query": question})
result["result"]



'The documents required to apply for a new PAN card are a passport (any country) or OCI card, a passport size photograph, and an overseas address proof with a zip code, such as an Indian NRO/NRE account statement or overseas bank statement or utility bill. Thanks for asking!'

In [None]:
def ask_question_retqa(question):
    ans = qa_chain_prompt({"query": question})
    return ans


In [273]:
#answers_ret_qa = []
count = 0
time.sleep(60)
for question in questions_list:
    time.sleep(30)
    #docs = document_search.similarity_search(query)
    answer = ask_question_retqa(question)
    count+=1
    answers_ret_qa.append(answer)
    time.sleep(30)
    print(count)



30




RateLimitError: ignored

In [274]:
answers_ret_qa

[{'query': 'What are the documents required to apply for the new pan',
  'result': 'The documents required to apply for a new PAN card are a passport (any country) or OCI card, a passport-size photograph, and an overseas address proof with a zip code (such as an Indian NRO/NRE account statement, overseas bank statement, or utility bill). Thanks for asking!'},
 {'query': 'What is the cost/fees of a PAN card?',
  'result': 'The cost of a PAN card is INR 2500 for an e-PAN card and INR 3700 for a physical PAN card. Thanks for asking!'},
 {'query': 'Can I take the delivery of Pan card at Indian address',
  'result': 'Yes, you can take the delivery of your PAN card at an Indian address mentioned in your Aadhaar card.'},
 {'query': 'How long does it usually take to receive the PAN card after applying?',
  'result': 'Once the application for a PAN card is submitted, it takes approximately 3 weeks to receive the PAN card. Thanks for asking!'},
 {'query': 'How to apply for PAN card',
  'result':

In [275]:

len(answers_ret_qa)

30

# Method 3

In [None]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain



In [None]:
qa = ConversationalRetrievalChain.from_llm(llm, db.as_retriever())

In [None]:
chat_history = []
result = qa({"question": question, "chat_history": chat_history})



In [None]:
result['answer']


'The documents required to apply for a new PAN card are as follows:\n\n1. Passport (any country) or OCI card\n2. Passport size photograph\n3. Overseas address proof with zip code (supporting documents can include an Indian NRO/NRE account statement, overseas bank statement, or utility bill)'

In [None]:
chat_history = [(query, result["answer"])]
query = "what can the supporting documents be?"
result = qa({"question": query, "chat_history": chat_history})



In [None]:
result

{'question': 'what can the supporting documents be?',
 'chat_history': [('What is the cost/fees of a PAN card?',
   'The documents required to apply for a new PAN card are as follows:\n\n1. Passport (any country) or OCI card\n2. Passport size photograph\n3. Overseas address proof with zip code (supporting documents can include an Indian NRO/NRE account statement, overseas bank statement, or utility bill)')],
 'answer': 'The following documents can be used as supporting documents for a PAN card application:\n\n1. Passport (Any Country) / OCI Card\n2. Passport Size Photograph\n3. Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)\n4. Copy of Existing PAN card (for PAN card correction/updation)\n5. Bank Account Statement in the country of residence\n6. NRE Bank Account Statement in India\n7. Residential Permit\n8. Visa granted and Copy of appointment letter/contract from Indian Company & Certificate (in

In [None]:
def ask_question_retqa(question):
    ans = qa({"question": query, "chat_history": chat_history})
    return ans

## Evaluation

In [279]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB

In [280]:
from sentence_transformers import SentenceTransformer, util
eval_model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [288]:
import numpy
cos_similarity = []
for i in range(len(g_ans)):
    #print(i)
    sent1 = answers_qa[i]
    sent2 = g_ans[i]
    if sent1 == " I don't know.":
        continue
    else:
        embeddings1 = eval_model.encode(sent1, convert_to_tensor=True)
        embeddings2 = eval_model.encode(sent2, convert_to_tensor=True)

        #Compute cosine-similarities
        cosine_scores = util.cos_sim(embeddings1, embeddings2)
        print(cosine_scores)
        cosine_scores.numpy()[0][0]
        cos_similarity.append(cosine_scores.numpy()[0][0])

tensor([[0.5711]])
tensor([[0.7324]])
tensor([[0.8376]])
tensor([[0.8054]])
tensor([[0.6898]])
tensor([[0.5881]])
tensor([[0.9021]])
tensor([[0.8216]])
tensor([[0.9171]])
tensor([[0.8329]])
tensor([[0.9172]])
tensor([[0.8509]])
tensor([[0.7330]])
tensor([[0.7992]])
tensor([[0.8688]])
tensor([[0.7645]])
tensor([[0.6726]])
tensor([[0.6509]])
tensor([[0.6283]])
tensor([[0.6085]])
tensor([[0.5303]])
tensor([[0.4838]])
tensor([[0.9494]])
tensor([[0.7338]])
tensor([[0.7262]])
tensor([[0.7005]])
tensor([[0.6042]])
tensor([[0.9657]])
tensor([[1.0000]])
tensor([[0.8461]])
tensor([[0.8243]])
tensor([[0.9058]])


In [291]:
average_similarity = numpy.average(cos_similarity)
average_similarity

0.7644384

#### RetrievalQA Evaluation

In [299]:
ret_ans = answers_ret_qa[0]['result']
ret_ans

'The documents required to apply for a new PAN card are a passport (any country) or OCI card, a passport-size photograph, and an overseas address proof with a zip code (such as an Indian NRO/NRE account statement, overseas bank statement, or utility bill). Thanks for asking!'

In [310]:
cleaned_ans = []
for i in range(len(answers_ret_qa)):
    ans = answers_ret_qa[i]['result']
    #removing "thanks for asking!"" from every answer
    sub_list = ["Thanks",  "for", "asking!"]
    b = ans.split()
    x = []
    for i in b:
        if i not in sub_list:
            x.append(i)
    res = " ".join(x)
    #print(res)
    cleaned_ans.append(res)

In [316]:

#Evaluating the first 20 answers as after this the API started running out
import numpy
cos_similarity_ret = []
for i in range(21):
    #print(i)
    sent1 = cleaned_ans[i]
    sent2 = g_ans[i]

    embeddings1 = eval_model.encode(sent1, convert_to_tensor=True)
    embeddings2 = eval_model.encode(sent2, convert_to_tensor=True)

    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    print(cosine_scores)
    cosine_scores.numpy()[0][0]
    cos_similarity_ret.append(cosine_scores.numpy()[0][0])


tensor([[0.6889]])
tensor([[0.7468]])
tensor([[0.9384]])
tensor([[0.8084]])
tensor([[0.7426]])
tensor([[0.7253]])
tensor([[0.9066]])
tensor([[0.9555]])
tensor([[0.8824]])
tensor([[0.9886]])
tensor([[0.8844]])
tensor([[0.8616]])
tensor([[1.]])
tensor([[0.7992]])
tensor([[0.8601]])
tensor([[0.8054]])
tensor([[0.7461]])
tensor([[0.8190]])
tensor([[0.7153]])
tensor([[0.5007]])
tensor([[0.7573]])


In [317]:
avg_similarity = numpy.average(cos_similarity_ret)
avg_similarity

0.8158429