## PDF Query Using LangChain

In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

Collecting langchain
  Downloading langchain-0.0.338-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.2-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.65-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langch

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
import os
from getpass import getpass

# SECRET_KEY must exist before it is used here and in later cells. Take it from the
# environment when OPENAI_API_KEY is already exported; otherwise prompt for it so the
# key is never written into the notebook itself.
SECRET_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

os.environ["OPENAI_API_KEY"] = SECRET_KEY
# NOTE(review): SERPAPI_API_KEY receives the same value as the OpenAI key, and no cell
# visible in this notebook uses SerpAPI; confirm whether this variable is still needed.
os.environ["SERPAPI_API_KEY"] = SECRET_KEY

In [None]:
# Path of the PDF to index; change this constant to query a different document.
PDF_PATH = '/content/TIAA_govt_schemes.pdf'
pdfreader = PdfReader(PDF_PATH)

In [None]:
from typing_extensions import Concatenate
# Collect the text of every page; pages for which extract_text() returns nothing are skipped.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

# Stop here with a clear message instead of failing later in the pipeline on empty input.
if not raw_text.strip():
    raise ValueError("No text could be extracted from the PDF; the following cells need non-empty text.")

In [None]:
# Preview only the first 1,000 characters; displaying the whole string bloats the saved notebook.
raw_text[:1000]

"The government introduces new investment plans to increase its residents’ income and \nfinancial status. These new government schemes are available to everyone who wants to \nparticipate, regardless of gender, marital status, socioeconomic status, location, et c. However, \nit is up to the residents to analyze several plans and select the one that best meets their \nrequirements to maximize their income flow.  \nInvesting in new government schemes provides the most significant benefit of being free of \nrisk and easy. Post offices and banks all around India make it possible for anyone interested \nin signing up for any government program they want.  Government  investment  \nschemes  typically result in tax breaks for the government and the investor. Investors would \nbe well to compare the various strategies before deciding on the one that promises the highest \nreturn.  \nWhat are Government Investment Schemes?  \nIndia's government and various public sector financial organizations 

In [None]:
# Split the extracted text into overlapping chunks on newline boundaries so that each piece
# stays small. Lengths are measured with len(), i.e. in characters rather than tokens.
CHUNK_SIZE = 800      # target chunk length
CHUNK_OVERLAP = 200   # amount of text shared between neighbouring chunks
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Number of chunks produced by the splitter.
len(texts)

26

In [None]:
# OpenAI embedding client used to vectorise the text chunks; authenticated with SECRET_KEY.
embeddings = OpenAIEmbeddings(openai_api_key=SECRET_KEY)

In [None]:
# Build a FAISS vector store from the text chunks, using `embeddings` to vectorise them.
document_search = FAISS.from_texts(texts, embeddings)

In [None]:
# Display the vector store object to confirm it was created.
document_search


<langchain.vectorstores.faiss.FAISS at 0x7aa7ce0e1f90>

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# Question-answering chain of type "stuff", backed by an OpenAI LLM authenticated with SECRET_KEY.
llm = OpenAI(openai_api_key=SECRET_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Fetch the chunks most similar to the query, then have the QA chain answer from them.
query = "Sovereign gold bonds"
docs = document_search.similarity_search(query)
chain.run(question=query, input_documents=docs)

' Sovereign Gold Bonds are issued by the Reserve Bank of India on behalf of the Indian Government. They are gold-backed government bonds and provide a set interest of 2.5% yearly on the issue price, in addition to the price fluctuation gain. They are a paper-based instrument and can be used for secured loans with them as collateral. They provide a similar Loan to Value ratio as a loan secured by actual gold and redemption is permitted after the fifth year.'

In [None]:
# Same retrieve-then-answer flow for a second question against the FAISS store.
query = "who is the national pension scheme for?"
docs = document_search.similarity_search(query)
chain.run(question=query, input_documents=docs)

' The National Pension Scheme (NPS) is available to all Indians including NRIs (Non-Resident Indians) between the age of 18 to 60.'

In [None]:
from langchain.document_loaders import OnlinePDFLoader

In [None]:
# Remote PDF to load; change this constant to point the loader at a different document.
PENSION_PDF_URL = "https://pensionersportal.gov.in/Document/Retirement_benefits_in_one_click.pdf"
loader = OnlinePDFLoader(PENSION_PDF_URL)

In [None]:
!pip install unstructured



In [None]:
!pip install pdf2image pdfminer.six



In [None]:
!pip install unstructured_pytesseract
!pip install unstructured_inference

Collecting unstructured_inference
  Downloading unstructured_inference-0.7.14-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting layoutparser[layoutmodels,tesseract] (from unstructured_inference)
  Downloading layoutparser-0.3.4-py3-none-any.whl (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-multipart (from unstructured_inference)
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting onnx (from unstructured_inference)
  Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?2

In [None]:
from PIL import Image

import pytesseract

In [None]:
from pdfminer.utils import open_filename
# Load the PDF referenced by `loader`; the result is a list of Document objects.
data = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Inspect the loaded documents.
# NOTE(review): this displays the full page_content of every document; consider previewing a slice instead.
data

[Document(page_content='GOVERNMENT OF INDIA DEPARTMENT OF PENSION & PENSIONERS’ WELFARE\n\nRetirement Benefits Retirement Benefits in One Click in One Click Our Efforts Towards Dignified Retirement\n\nm«r�,\n\n�, mtli �lcfillM a-m �r-ntrrnc:J,\n\nsf o � fti1cu;il, �- �. �\xad � Dr. Kshatrapati Secretary Tel.: 011-23742133 Fax: 011-23742546 Email :\n\nShivaji,\n\nIAS\n\nsecy-arpg@nic.in\n\n� � � cfi�IOI fu\'mlT, Mlcfi-tl�cfi \'qcR, "{g\'R llffig, � �-110003\n\nGOVERNMENT OF PERSONNEL,\n\nOF INDIA, PUDLIC GRIEVANCES\n\nMINISTRY\n\n& PENSIONS,\n\nDEPARTMENT OF PENSION & PENSIONERS\'\n\nWELFARE\n\nLOK NAYAK BHAWAN, KHAN MARKET,\n\nNEW DELHl-110003\n\nFOREWORD\n\nDepartment\n\nof Pension\n\ntechnology\n\nWelfare has been leveraging for the elderly towards\n\nyear after\n\nand Pensioners\n\nyear, to make the system taking other steps for bringing for them and their dependents. Service include Department This is considered be given from the comfort of one\'s home also.\n\nPensioners, life of 

In [None]:
# Re-create the OpenAI embedding client, authenticated with SECRET_KEY.
# NOTE(review): this repeats the earlier embeddings cell, and no later cell visible here
# references `embeddings`; the cell may be redundant.
embeddings = OpenAIEmbeddings(openai_api_key=SECRET_KEY)

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.17-py3-none-any.whl (496 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m496.8/496.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.104.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.24.0.post1-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting posthog>=2.4.0 (from chromadb)
  Downloading posth

In [None]:
from langchain.indexes import VectorstoreIndexCreator
# Build a queryable vector index straight from the loader, using the creator's default settings.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders([loader])

In [None]:
# Ask the index about a specific provision of the pension document.
query = "Explain me about Right of President to withhold or withdraw pension"
index.query(query)

' The President reserves the right to withhold or withdraw a pension or gratuity, either in full or in part, or to withdraw a pension in full or in part, whether permanently or for a specified period. This can be done if the pensioner is found guilty of grave misconduct or negligence during the period of service, including service rendered upon re-employment after retirement. The Union Public Service Commission must be consulted before any final orders are passed, and the amount of pension withheld or withdrawn must not reduce the amount of minimum family pension.'