In [None]:
pip install langChain OpenAI faiss-cpu tiktoken

In [None]:
pip install python-dotenv

In [None]:
pip install langchain-community

In [None]:
pip install langchain-openai

In [None]:
pip install pypdf

In [4]:
from dotenv import load_dotenv
import os

# Read variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Prefer this notebook's variable name, then fall back to the standard
# OPENAI_API_KEY name so either spelling works.
api_key = os.getenv("OPEN_AI_KEY") or os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail here with a clear message instead of much later, when the first
    # OpenAI-backed object is constructed with api_key=None.
    raise RuntimeError(
        "No OpenAI API key found: set OPEN_AI_KEY (or OPENAI_API_KEY) "
        "in the environment or in a .env file."
    )

In [21]:
import openai, langchain, faiss, tiktoken, pypdf

from langchain_community.document_loaders import DirectoryLoader, TextLoader, UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

from langchain.llms import OpenAI

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA


### Import your own PDF files

In [22]:
# Load every PDF matched by "./*.pdf" under the "data" directory,
# parsing each file with PyPDFLoader.
loader = DirectoryLoader(
    path="data",
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
document = loader.load()

## split the text 

In [23]:
# Break the loaded documents into overlapping chunks
# (size 1000, overlap 200 — presumably measured in characters; confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents=document)

In [None]:
# Sanity-check the splitter output: container type first.
type(texts)

In [None]:
# Number of chunks produced by the splitter.
len(texts)

In [None]:
# Peek at a single chunk.
# NOTE(review): index 100 is hard-coded and will fail when fewer than 101 chunks exist.
texts[100]

In [None]:
# Metadata attached to that chunk.
texts[100].metadata

In [None]:
# The 'source' metadata entry — presumably the originating file path; confirm against the loader.
texts[100].metadata['source']

### OpenAI embeddings and vector DB

In [33]:
# Embedding client, authenticated with `api_key`.
embeddings = OpenAIEmbeddings(api_key=api_key)

In [34]:
# Build a FAISS vector store from the chunks, using `embeddings` to vectorize them.

docsearch = FAISS.from_documents(texts, embeddings)

In [35]:
# Example question used by the similarity-search cells that follow.
query1 = "what are edges and pulses"

In [36]:
# Look up the chunks most similar to the question.
answer = docsearch.similarity_search(query1)

In [None]:
# Show the text of the hit at index 3.
# NOTE(review): fails if fewer than four hits come back — confirm index 3 is intended rather than answer[0].
print(answer[3].page_content)

In [None]:
# Same search, but each hit comes with its score.
answer_score = docsearch.similarity_search_with_score(query1) # closer to 0 is better

In [None]:
# First entry of the scored results — presumably a (document, score) pair; confirm.
answer_score[0]

### Import the question-answering chains

In [45]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

In [51]:
## Instantiate the LLM used by the QA chain and the agent (temperature fixed at 0)

llm = OpenAI(
    api_key=api_key,
    temperature=0,
)

### get answers from your own docs 

In [76]:
def parse_response(response):
    """Print a QA-chain answer followed by the documents it was drawn from.

    Args:
        response: Mapping returned by the QA chain. ``response['result']`` is
            the answer text. ``response['source_documents']``, when present,
            is an iterable of objects exposing a ``metadata`` dict.

    Prints the answer, then a ``Source:`` header, then one line per source
    document in the form ``<source> page #: <page>``. A metadata entry that
    is missing is printed as ``n/a`` instead of raising ``KeyError``; a
    response without source documents prints only the header.
    """
    print(response['result'])
    print('\n\nSource:')
    # The key is absent when the chain was not asked to return its sources.
    for doc in response.get('source_documents') or []:
        metadata = doc.metadata
        print(metadata.get('source', 'n/a'), "page #:", metadata.get('page', 'n/a'))

In [57]:
# Expose the FAISS vector store as a retriever for the QA chain.
# `as_retriever` takes search options (search_type / search_kwargs); the
# `include_metadata` / `metadata_key` keywords passed here previously are not
# part of its API and had no effect, so they are dropped. Retrieved documents
# carry their metadata regardless.

retriever = docsearch.as_retriever()

In [69]:
# Build the RetrievalQA chain ("stuff" chain type) on top of the retriever.
# return_source_documents=True makes the response include the retrieved documents.

qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
    chain_type="stuff",
)

In [70]:
query = "what are edges and pulses"

In [71]:
response = qa_chain(query)

In [None]:
# Check what kind of object the chain returned.
type(response)

In [None]:
# Full raw response.
response

In [None]:
# Print the answer followed by its source documents.
parse_response(response)

In [None]:
# The documents the chain returned alongside the answer.
response['source_documents']

### Use the vector store agent

In [91]:
# import the dependencies

from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo
)

In [80]:
# Describe the FAISS store for the agent toolkit: the store itself plus a name and description.

vectorstore_info = VectorStoreInfo(
    vectorstore=docsearch,
    name="pdf_vectorstore",
    description="pdf vectorstore",
)

In [102]:
# Wrap the described vector store in a toolkit, then build an agent around it.
# The same LLM is handed to both the toolkit and the agent.

toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info, llm=llm)
agent_executor = create_vectorstore_agent(
    toolkit=toolkit,
    llm=llm,
    verbose=False,
)

In [None]:
# Extend the question so the agent is asked to cite its sources.
# Guarded so that re-running this cell does not append the suffix a second time.

source_request = " List the sources."
if not query.endswith(source_request):
    query = query + source_request
print(query)

In [113]:
# Run the agent through `invoke` (`run` is deprecated in LangChain) and keep
# only the final answer text, so `response` is still a plain string.

response = agent_executor.invoke({"input": query})["output"]

In [None]:
# Inspect the agent's reply: its type first...
type(response)

In [None]:
# ...then the reply itself.
response