# LangChain Pinecone OpenAI - Query Multiple PDF Files + Cite Source Documents 

#### This notebook walks through the basics of using Pinecone, OpenAI and LangChain to query your own text document 


## pip install dependencies

In [None]:
pip install langChain

In [None]:
pip install OpenAI

In [None]:
pip install pinecone-client

In [None]:
pip install tiktoken

### Set environment variables and keys

In [1]:
# KEYS, MODELS and ENV Related Settings 

import os
os.environ["OPENAI_API_KEY"] = ""
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

embed_model = "text-embedding-ada-002"

os.environ["PINECONE_API_KEY"] = ""
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_ENV = "us-west4-gcp-free"


### Import required modules

In [2]:
import openai, langchain, pinecone


from langchain.document_loaders import DirectoryLoader, TextLoader,UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

from langchain.llms import OpenAI

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

  from tqdm.autonotebook import tqdm


### Import your own text file

In [3]:
# Open the data file and read its content

loader = DirectoryLoader('../data/GenAIReports', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()


### Split the text using RecursiveCharacterTextSplitter to be able to work with the 4096 OpenAI token limit

In [4]:
# Set up the RecursiveCharacterTextSplitter, then Split the documents

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [5]:
type(texts)

list

In [12]:
len(texts)

336

In [7]:
texts[100]

Document(page_content='How customer operations \ncould be transformed\nCustomer self-service interactions\nCustomer interacts with a humanlike chatbot that \ndelivers immediate, personalized responses to complex inquiries, ensuring a consistent brand voice regardless of customer language or location. \nCustomer–agent interactions\nHuman agent uses AI-developed call scripts and receives real-time assistance and suggestions for responses during phone conversations, instantly accessing relevant customer data for tailored and real-time information delivery. \nAgent self-improvement\nAgent receives a summarization of the conversation in a few succinct points to create a record of customer complaints and actions taken.\nAgent uses automated, personalized insights generated \nby AI, including tailored follow-up messages or personalized coaching suggestions. \n14 The economic potential of generative AI: The next productivity frontier', metadata={'source': '../data/GenAIReports/McKinsey-the-eco

In [8]:
texts[100].metadata

{'source': '../data/GenAIReports/McKinsey-the-economic-potential-of-generative-ai-the-next-productivity-frontier-vf.pdf',
 'page': 15}

In [9]:
texts[100].metadata['source']

'../data/GenAIReports/McKinsey-the-economic-potential-of-generative-ai-the-next-productivity-frontier-vf.pdf'

### Pinecone and OpenAI Embedding setup

In [10]:
# Pinecone related setup

pinecone.init(
        api_key = PINECONE_API_KEY,
        environment = PINECONE_ENV
)

# Set the index name for this project in pinecone first

index_name = 'generative-ai-reports'


In [11]:
# Examine pinecone index. Delete all vectors, if you want to start fresh

index = pinecone.Index(index_name)
index.describe_index_stats()
#index.delete(deleteAll='true', namespace='')

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 336}},
 'total_vector_count': 336}

In [13]:
# Prepare the embedding so that we can pass it to the pinecone call in the next step

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [14]:
# Create the vector store from the texts

docsearch = Pinecone.from_documents(texts, embeddings, index_name = index_name)

# for existing an vector store, use Pinecone.from_existing_index(index_name, embeddings)

In [15]:
# Don't run this cell

if index_name not in pinecone.list_indexes():
    print("Index does not exist: ", index_name)
    # docsearch = Pinecone.from_documents(texts, embeddings, index_name = index_name)
else:
    print("Index exists: ", index_name)
    # docsearch = Pinecone.from_existing_index(index_name, embeddings)


Index exists:  generative-ai-reports


In [16]:
type(docsearch)

langchain.vectorstores.pinecone.Pinecone

### Import  load_qa_chain from LangChain

In [17]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

In [18]:
# set up the llm model to use with our chain/agent

llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

### Ask questions to your document and get the answer

In [19]:
# Helper function to process the response from the QA chain 
# and isolate result and source docs and page numbers

def parse_response(response):
    print(response['result'])
    print('\n\nSources:')
    for source_name in response["source_documents"]:
        print(source_name.metadata['source'], "page #:", source_name.metadata['page'])

In [20]:
# Set up the retriever on the pinecone vectorstore
# Make sure to set include_metadata = True

retriever = docsearch.as_retriever(include_metadata=True, metadata_key = 'source')

In [21]:
# Set up the RetrievalQA chain with the retriever
# Make sure to set return_source_documents = True

qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)


In [22]:
# Let's set up the query 

query = "List all the use cases for Generative AI?"


In [23]:
# Call the QA chain to get the response

response = qa_chain(query)

In [24]:
type(response)

dict

In [26]:
response

{'query': 'List all the use cases for Generative AI?',
 'result': ' Generative AI can have applications in customer self-service, IT, audit, human resources, operations, and many more business functions.',
 'source_documents': [Document(page_content='The following are examples of the operational improvements generative AI can have for \nspecific use cases:\n —Cu\nstomer self-service. Generative AI–fueled chatbots can give immediate and \npersonalized responses to complex customer inquiries regardless of the language or location of the customer. By improving the quality and effectiveness of interactions via automated channels, generative AI could automate responses to a higher percentage of customer inquiries, enabling customer care teams to take on inquiries that can only be resolved by a human agent. Our research found that roughly half of customer contacts made by banking, telecommunications, and utilities companies in North America are already handled by machines, including but not 

In [27]:
parse_response(response)

 Generative AI can have applications in customer self-service, IT, audit, human resources, operations, and many more business functions.


Sources:
../data/GenAIReports/McKinsey-the-economic-potential-of-generative-ai-the-next-productivity-frontier-vf.pdf page #: 16.0
../data/GenAIReports/McKinsey-the-economic-potential-of-generative-ai-the-next-productivity-frontier-vf.pdf page #: 16.0
../data/GenAIReports/KPMG-generative-ai-models-the-risks-and-potential-rewards-in-business.pdf page #: 6.0
../data/GenAIReports/KPMG-generative-ai-models-the-risks-and-potential-rewards-in-business.pdf page #: 6.0


In [28]:
response["source_documents"]

[Document(page_content='The following are examples of the operational improvements generative AI can have for \nspecific use cases:\n —Cu\nstomer self-service. Generative AI–fueled chatbots can give immediate and \npersonalized responses to complex customer inquiries regardless of the language or location of the customer. By improving the quality and effectiveness of interactions via automated channels, generative AI could automate responses to a higher percentage of customer inquiries, enabling customer care teams to take on inquiries that can only be resolved by a human agent. Our research found that roughly half of customer contacts made by banking, telecommunications, and utilities companies in North America are already handled by machines, including but not exclusively AI. We estimate that generative AI could further reduce the volume of human-serviced contacts by up to 50 percent, depending on a company’s existing level of automation. \n —R', metadata={'page': 16.0, 'source': '..

### Let's accomplish the same task using a vectorstore agent

In [29]:
# Import the dependencies

from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo,
)

In [30]:
# Set up the vectorstore info

vectorstore_info = VectorStoreInfo(
    name="Generative AI Reports",
    description="Reports on the State and Trends in Generative AI",
    vectorstore= docsearch,
)

In [31]:
# Setup the VectorStoreToolkit and VectorStore Agent

toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)
agent_executor = create_vectorstore_agent(llm=llm, toolkit=toolkit, verbose=False)

In [32]:
# Add the string to ask for source

query = query + " List the sources."
print (query)

List all the use cases for Generative AI? List the sources.


In [33]:
# Run the agent

response = agent_executor.run(query)

In [34]:
type(response)

str

In [35]:
response

'Generative AI can be used for customer self-service, IT, audit, human resources, operations, and other business functions. Sources: McKinsey-the-economic-potential-of-generative-ai-the-next-productivity-frontier-vf.pdf, KPMG-generative-ai-models-the-risks-and-potential-rewards-in-business.pdf'