1. The PDF acts as the corpus for the LLM.
2. The PDF is then converted into text chunks.
3. OpenAI embeddings then convert these text chunks into vectors.
4. A vector search database stores these vectors for efficient querying.

## Importing Libraries

In [11]:
import openai
import langchain
from langchain.document_loaders import PyPDFDirectoryLoader # to load pdf from directory
from langchain.text_splitter import RecursiveCharacterTextSplitter # for text splitting to remain inside the token size
from langchain.embeddings.openai import OpenAIEmbeddings # converting chunks into vectors
from langchain_pinecone import PineconeVectorStore,Pinecone # for creating a vector store
from langchain_openai import OpenAI

## Loading Environment variables

In [13]:
import os
from dotenv import load_dotenv
load_dotenv()

True

## Reading the PDF file

In [17]:
def read_doc(directory):
    """Load the PDFs in ``directory`` with ``PyPDFDirectoryLoader``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call (the loaded documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [18]:
# Load everything under the relative "documents" folder.
doc = read_doc('documents')

## Converting Document to Chunks

In [19]:
def chunk_data(docs, chunk_size=300, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        docs: Documents to split (e.g. the value returned by ``read_doc``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)`` on the configured splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [20]:
# Split the loaded PDFs using chunk_data's default size and overlap.
documents = chunk_data(doc)

In [21]:
# Number of chunks produced by chunk_data for the loaded PDFs.
len(documents)

6939

## Embedding the documents

In [22]:
# Embedding client. The API key is read from the OPENAI_KEY environment
# variable; os.environ[...] raises KeyError if that variable is not set.
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_KEY'])

In [8]:
# Embed one sample string; the next cell inspects the length of the result.
vectors=embeddings.embed_query("Hari Bol")

In [9]:
# Length of the sample embedding (the recorded output for this cell is 1536).
len(vectors)

1536

In [16]:
# Credentials come from the environment; each is None when its variable is unset.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_KEY')

# Name of the Pinecone index used by the vector store.
index_name = "bhagwat-gita-search"

In [26]:
# One-time ingestion (uncomment to embed and upload the chunks to the index):
# vectorStore = PineconeVectorStore.from_documents(documents, embeddings, index_name=index_name)

# Connect to the existing index. PineconeVectorStore is used rather than the
# `Pinecone` name exported by langchain_pinecone, which is a deprecated alias
# and is easy to confuse with the Pinecone SDK client class.
# NOTE(review): no Pinecone key is passed here, so the library presumably
# reads PINECONE_API_KEY from the environment — confirm.
vectorStore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

## Retrieve results from VectorDB

In [27]:
def retrieve_query(query, k=10):
    """Run a similarity search against the module-level ``vectorStore``.

    Args:
        query: Text to search for.
        k: Forwarded to ``similarity_search`` as ``k`` (defaults to 10).

    Returns:
        The result of ``vectorStore.similarity_search(query, k=k)``.
    """
    return vectorStore.similarity_search(query, k=k)

In [29]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

In [28]:
# Build the completion model once and reuse it for the chain. The original
# created `llm` but then constructed a second, separate OpenAI() for the chain.
# The key is passed explicitly so the LLM uses the same OPENAI_KEY variable as
# the embeddings, instead of silently depending on OPENAI_API_KEY being set.
llm = OpenAI(openai_api_key=openai_api_key)

# "stuff" chain type for the Q&A application.
chain = load_qa_chain(llm=llm, chain_type="stuff")

## Making Queries to VectorDB

In [30]:
def retrieve_answers(query):
    """Answer ``query`` using chunks retrieved from the vector store.

    Fetches matches with ``retrieve_query``, prints them, then runs the
    module-level ``chain`` with those matches as ``input_documents`` and
    ``query`` as ``question``.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``chain.run``.
    """
    matches = retrieve_query(query)
    print(matches)  # show the retrieved chunks alongside the answer
    return chain.run(input_documents=matches, question=query)

In [32]:
# Example question; retrieve_answers prints the retrieved chunks itself,
# then the answer is printed below.
query = "In how many yog in Bhagwat Gita Divided?"
answer = retrieve_answers(query)
print(answer)

[Document(page_content='Śrīmad-Bhagavad-gītā\tin\tthe\tmatter\tof\tthe\tDivisions\tof\tFaith.', metadata={'page': 818.0, 'source': 'documents\\English-Bhagavad-Gita.pdf'}), Document(page_content='BG-17\nCHAPTER\tSEVENTEEN\nThe\tDivisions\tof\tFaith\nTEXT\t1\nअ ज Ʋƨन\tउ वा च\t।\nŏ\tशा Źe वe धमƲü सƼßय\tयज ĭú\tǪĚया ि ĭवता ः\t।\núष Ɗ\te न űा \tतƲ\tक ा\tक ƺŲण\tसĄवमा हो \tर ज ŵतमः\t॥१॥\narjuna\tuvāca\nye\tśāstra-vidhim\tutsṛjya\nyajante\tśraddhayānvitāḥ\nteṣāṁ\tniṣṭhā\ttu\tkā\tkṛṣṇa\nsattvam\tāho\trajas\ttamaḥ', metadata={'page': 792.0, 'source': 'documents\\English-Bhagavad-Gita.pdf'}), Document(page_content='divisible\t into\t an\t eightfold\t procedure\t called\t yama,\t niyama,\t āsana,\nprāṇāyāma,\t pratyāhāra,\t dhāraṇā,\t dhyāna,\t and\t samādhi.\t In\t the\t Sixth\nChapter\tthe\tsubject\tof\t yoga\tis\texplicitly\tdetailed,\tand\tat\tthe\tend\tof\tthe\tFifth', metadata={'page': 324.0, 'source': 'documents\\English-Bhagavad-Gita.pdf'}), Document(page_content='in\tthe\tBhāgavatam\tthe\