# 1. Import Libraries

In [1]:
import openai
import langchain

from langchain.document_loaders import PyPDFDirectoryLoader #PDF Loader
from langchain.text_splitter import RecursiveCharacterTextSplitter # Text Splitting/Chunking
from langchain_openai import OpenAIEmbeddings #Embedding

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

# 2. Load and Read Document

In [3]:
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    docs = file_loader.load()
    return docs

doc = read_doc('documents/')

In [4]:
len(doc)

58

# 3. Divide the docs into chunks

In [5]:
def chunk_data(docs, chunk_size=800,chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return docs
    
documents = chunk_data(docs=doc)

In [6]:
len(documents)

58

# 4. Embedding Technique and Vector Search DB in Pinecone

In [7]:
import os
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

In [10]:
vectors = embeddings.embed_query("How are you?")
print(len(vectors))

1536


In [9]:
import os
from langchain_pinecone import PineconeVectorStore

os.environ["PINECONE_API_KEY"]='pcsk_4tfXCG_4DhfeBtcSKsKxLHtYvUbXosV8A8PuXCTWuVc6tQ7HA5Gtok9hJ9NVp6gzYvLZCC'

index_name = "ragpinecone"

vs = PineconeVectorStore(index_name=index_name,embedding = embeddings)
vs.add_documents(documents)

['70a044f2-3a3a-424c-a2da-3db6c0841cbc',
 '49b7fd72-82b4-446b-87a3-86d11f148b18',
 '8d7c5e3c-2f3a-4c2d-a099-08211861c90d',
 '4f4745e5-3834-4ebd-ab9a-18085f62d45f',
 'bab0c038-d81e-4ebd-99c5-d3d9fe601dbd',
 '083cee6b-d984-41a6-b76a-4de365887d22',
 'd35ef291-92b5-4b65-b07a-4dbdd3458a7d',
 '57093e5f-8979-4c5b-851a-93f0ebbf6ad0',
 '246e2607-a6d2-42a5-b6a7-a176dba5605e',
 '0b29bf42-32a2-4b3a-a9b6-1f3b0faaa8a8',
 'ba4c4add-9eb3-4b4d-867d-f8331fbf02c0',
 '64da1cab-a15d-40eb-b2d6-2310bee8cff0',
 '2c353416-3a05-4863-9d3e-71c4100d4062',
 '00de21ce-544f-45f5-b7e9-f086009a0f15',
 'f283ee39-8cf9-4540-8e23-e5058757d525',
 '9ae18c1f-9b96-4cc9-8d85-876692678990',
 '85dee83a-097e-4476-8da7-ffccf15e4224',
 'f528987f-126f-463f-944a-11e43e964f0d',
 '5279e8ce-65ff-4375-9489-ad2b7ee26e16',
 '355f2fb2-dd10-45d3-8719-cfaeb90fc3e3',
 '8be928e4-5f4c-448a-bf36-a5146b6c6a69',
 '3e94d06a-654a-4fb3-b4db-306de215b0e7',
 'd4667d27-ad71-4a15-a547-55efe0f7ff87',
 '87a1c73b-cf41-4d2b-9fc3-ac509e853f33',
 '81e7771c-50c9-

# 5. Query and LLM

In [11]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

llm=OpenAI(model_name="gpt-3.5-turbo-instruct",temperature=0.5)
chain=load_qa_chain(llm,chain_type="stuff")

In [12]:
## Cosine Similarity Retreive Results from VectorDB
def retrieve_query(query,k=10):
    matching_results=vs.similarity_search(query,k=k)
    return matching_results

## Search answers from VectorDB
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response

In [13]:
our_query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(our_query)
print(answer)

[Document(id='ba4c4add-9eb3-4b4d-867d-f8331fbf02c0', metadata={'creationdate': '2023-02-01T05:28:04+05:30', 'creator': 'Adobe Acrobat Pro 10.1.16', 'moddate': '2023-02-01T08:28:21+05:30', 'page': 10.0, 'page_label': '11', 'producer': 'Adobe Acrobat Pro 10.1.16', 'source': 'documents/budget_speech.pdf', 'title': '', 'total_pages': 58.0}, page_content="7 \n \n \n \nfarmers in contributing to the health of fellow citizens by growing these \n‘Shree Anna’.  \n22. Now to make India a global hub for ' Shree Anna', the Indian Institute \nof Millet Research, Hyderabad will be supported as the Centre of Excellence \nfor sharing best practices, research and technologies at the international \nlevel.    \nAgriculture Credit  \n23. The agriculture credit target will be increased  \nto ` 20 lakh crore with focus on animal husbandry, dairy and fisheries.  \nFisheries \n24. We will launch a new sub-scheme of PM Matsya Sampada Yojana \nwith targeted investment of ` 6,000 crore to further enable activit

  response=chain.run(input_documents=doc_search,question=query)


 The agriculture credit target will be increased to ` 20 lakh crore.
