## Personal Knowledge Base Expert

In [2]:
import os
from dotenv import load_dotenv, find_dotenv

In [3]:
# Load settings from a .env file into the process environment. The Pinecone
# helpers in this notebook read PINECONE_API_KEY and PINECONE_ENV from os.environ.
load_dotenv(find_dotenv(), override=True)

True

### 1. Prepare the Document

#### 1.1 Data Loading

In [4]:
"""
Loads a PDF as an array of LangChain documents given a file_path
"""
def load_pdf(file_path):
    """
    Read the PDF at ``file_path`` and return it as LangChain documents.

    Args:
        file_path: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        The result of ``PyPDFLoader(file_path).load()``.
    """
    from langchain.document_loaders import PyPDFLoader

    # Announce the file first so a slow load is attributable to a specific path.
    print(f"Loading {file_path}")
    return PyPDFLoader(file_path).load()

"""
Loads load_max_docs number of Wikipedia pages relevant to query in lang as an array of LangChain documents
"""
def load_from_wikipedia(query, lang="en", load_max_docs=2):
    """
    Fetch Wikipedia pages matching ``query`` as LangChain documents.

    Args:
        query: Search text handed to ``WikipediaLoader``.
        lang: Language code handed to ``WikipediaLoader`` (default "en").
        load_max_docs: Maximum number of pages to load (default 2).

    Returns:
        The result of ``WikipediaLoader(...).load()``.
    """
    from langchain.document_loaders import WikipediaLoader

    loader_options = {"query": query, "lang": lang, "load_max_docs": load_max_docs}
    return WikipediaLoader(**loader_options).load()

In [5]:
# Load the source PDF; the slice previews the first few loaded documents.
documents = load_pdf("../data/us_constitution.pdf")
documents[:5]

Loading ../data/us_constitution.pdf


[Document(page_content=' \nNATIONAL  CONSTITUTION  CENTER   \n   \n \n \n \n \n  \n \nTHE  \nCONSTITUTION  \nof the United  States \n \n \n \n \n \n  \n \n  \n \n   \n \n  \n \n  \n \n  \n \n  \n \n  \n \n   \n ', metadata={'source': '../data/us_constitution.pdf', 'page': 0}),
 Document(page_content='C O N S T I T U T I O N O F T H E U N I T E D S T A T E S   \n \n \n \nWe the People of the United States, in Order to form a \nmore perfect Union, establish Justice, insure domestic \nTranquility, provide for the common defence, promote \nthe general  Welfare, and secure the Blessings of Liberty to \nourselves  and our Posterity,  do ordain  and establish  this \nConstitution for the United States of America  \n \n \nArticle.   I. \nSECTION.  1 \nAll legislative Powers herein granted shall be vested in a \nCongress of the United States, which shall consist of a Sen-  \nate and House of Representatives. \nSECTI ON. 2 \nThe House of Representatives shall be composed of Mem-  \nbers chosen e

In [6]:
# Number of documents returned by the loader.
len(documents)

19

#### 1.2 Data Splitting

In [7]:
"""
Given an array of documents, creates a new array of documents with the specified chunk_size
"""
def chunk_documents(documents, chunk_size=256, chunk_overlap=0):
    """
    Split documents into smaller documents with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split (e.g. the output of a LangChain loader).
        chunk_size: ``chunk_size`` given to the splitter (default 256).
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 0,
            which is the value this function previously hard-coded.

    Returns:
        The list of chunked documents produced by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    # split_documents: splits each document in an array into new documents of the
    # defined chunk size (documents -> documents).
    # create_documents would be the counterpart for raw text (text -> documents).
    return text_splitter.split_documents(documents)

In [8]:
# Split the loaded documents with chunk_documents' default chunk_size (256);
# the slice previews the first few chunks.
chunked_documents = chunk_documents(documents)
chunked_documents[:5]

[Document(page_content='NATIONAL  CONSTITUTION  CENTER   \n   \n \n \n \n \n  \n \nTHE  \nCONSTITUTION  \nof the United  States', metadata={'source': '../data/us_constitution.pdf', 'page': 0}),
 Document(page_content='C O N S T I T U T I O N O F T H E U N I T E D S T A T E S   \n \n \n \nWe the People of the United States, in Order to form a \nmore perfect Union, establish Justice, insure domestic \nTranquility, provide for the common defence, promote', metadata={'source': '../data/us_constitution.pdf', 'page': 1}),
 Document(page_content='the general  Welfare, and secure the Blessings of Liberty to \nourselves  and our Posterity,  do ordain  and establish  this \nConstitution for the United States of America  \n \n \nArticle.   I. \nSECTION.  1', metadata={'source': '../data/us_constitution.pdf', 'page': 1}),
 Document(page_content='All legislative Powers herein granted shall be vested in a \nCongress of the United States, which shall consist of a Sen-  \nate and House of Representati

In [9]:
# Number of chunks produced by the split.
len(chunked_documents)

247

#### 1.3 Chunk Embedding & Storage

In [10]:
"""
Given an array of documents, calculates the cost of embedding using ada-002
"""
def print_embedding_cost(documents, price_per_1k_tokens=0.0004):
    """
    Print the token count and estimated embedding cost for ``documents``.

    Tokens are counted with the tiktoken encoding for "text-embedding-ada-002".

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        price_per_1k_tokens: USD price per 1,000 tokens. Defaults to 0.0004,
            the rate this function previously hard-coded. Provider prices
            change over time, so pass the current rate for an accurate estimate.

    Returns:
        None. The two result lines are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model("text-embedding-ada-002")
    total_tokens = sum(len(enc.encode(document.page_content)) for document in documents)
    print(f"Total tokens: {total_tokens}")
    print(f"Embedding cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}")

In [11]:
# Estimate the embedding cost of the chunks before they are embedded and stored.
print_embedding_cost(chunked_documents)

Total tokens: 12912
Embedding cost in USD: 0.005165


In [12]:
"""
Creates a Pinecone index of index_name and a vector store
"""
def create_index_and_embeddings(index_name, documents):
    """
    Return a LangChain Pinecone vector store backed by the index ``index_name``.

    If the index already exists, the store is attached to it as-is and
    ``documents`` is not used. Previously this branch called ``from_documents``,
    which embedded and uploaded every document again on each run. If the index
    does not exist, it is created and ``documents`` are embedded and uploaded.

    Args:
        index_name: Name of the Pinecone index to load or create.
        documents: Documents to embed and upload when the index is new.

    Returns:
        The ``Pinecone`` vector store for ``index_name``.
    """
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_ENV"))

    if index_name in pinecone.list_indexes():
        print(f"{index_name} already exists. Loading embeddings...")
        # Attach to the stored vectors instead of re-embedding and re-uploading.
        return Pinecone.from_existing_index(index_name, embeddings)

    print(f"Creating {index_name} and loading embeddings...")
    # The dimension must match the embedding model's vector size; 1536 is the
    # size of OpenAI's text-embedding-ada-002 vectors.
    pinecone.create_index(index_name, dimension=1536, metric="cosine")
    return Pinecone.from_documents(documents, index_name=index_name, embedding=embeddings)

"""
Deletes a Pinecone index
"""
def delete_index(index_name="all"):
    """
    Delete one Pinecone index, or every index when ``index_name`` is "all".

    WARNING: the default is "all", so calling ``delete_index()`` with no
    argument deletes every index returned by ``pinecone.list_indexes()``.

    Args:
        index_name: Name of the index to delete, or "all" (default).

    Returns:
        None. Progress messages are printed.
    """
    import pinecone

    pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_ENV"))

    if index_name == "all":
        print("Deleting all indexes...")
        for index in pinecone.list_indexes():
            pinecone.delete_index(index)
    else:
        print(f"Deleting {index_name}...")
        # Use the requested name: the loop variable `index` from the "all"
        # branch does not exist here, so the old call raised NameError.
        pinecone.delete_index(index_name)
    print("Done.")

In [13]:
# WARNING: called without an argument, delete_index uses index_name="all" and
# deletes every index returned by pinecone.list_indexes().
delete_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes...
Done.


In [15]:
# Name of the Pinecone index that backs this notebook's vector store.
index_name = "ask-a-document"
vector_store = create_index_and_embeddings(index_name, chunked_documents)

ask-a-document already exists. Loading embeddings...


### 2. Search

#### 2.1 Embed user query

#### 2.2 Use query embedding to find similar vector embeddings in storage

### 3. Ask

#### 3.1 Insert the question and the relevant chunks into a message to the GPT model

#### 3.2 Return the GPT model's answer

In [19]:
def ask(vector_store, query, k=3, temperature=1):
    """
    Answer ``query`` with a RetrievalQA chain over ``vector_store``.

    A new gpt-3.5-turbo chat model and a new chain are built on every call.

    Args:
        vector_store: Object exposing ``as_retriever`` (e.g. the store returned
            by ``create_index_and_embeddings``).
        query: The question to answer.
        k: Number of similar chunks to retrieve as context. Defaults to 3,
            the value this function previously hard-coded.
        temperature: Sampling temperature for the chat model. Defaults to 1,
            the value this function previously hard-coded.

    Returns:
        The value returned by ``chain.run(query)``.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=temperature)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    return chain.run(query)

In [20]:
# One-off question answered through ask() using the vector store built above.
query = "What is the whole document about?"
answer = ask(vector_store, query)
answer

'The document in question is the Constitution of the United States. It establishes the framework of the federal government and outlines the powers and limitations of its three branches: the legislative, executive, and judicial branches. The Constitution also guarantees certain rights and freedoms to the people of the United States, as further detailed in the Bill of Rights, which is a set of amendments to the Constitution.'

In [18]:
import time

# Interactive question loop: keep answering until the user types "q".
i = 1  # 1-based number shown in the prompt
print("Enter q to quit")
while True:
    query = input(f"Question #{i}: ")
    if query.lower() == "q":
        print("Quitting... Bye!")
        time.sleep(2)
        break
    answer = ask(vector_store, query)
    print(answer)
    print("-" * 50)
    # Advance the counter; without this every prompt read "Question #1".
    i += 1

Enter q to quit
Based on the context provided, some of the rights mentioned include:

1. The right to free speech and expression.
2. The right to peacefully assemble and petition the government.
3. The right to bear arms.
4. The right to a fair trial and due process.
5. The right to be secure in one's person and property, protected from unreasonable searches and seizures.
6. The right to not be forced to incriminate oneself.
7. The right to protection against cruel and unusual punishment.
8. The right to not have one's private property taken for public use without just compensation.
9. The right to not have soldiers quartered in one's home during times of peace without consent.
10. The right to have the elected President of the United States preserve, protect, and defend the Constitution. 

Please note that the above-listed rights are not exhaustive, and there are more rights protected under the United States Constitution.
--------------------------------------------------
Based on the

### 4. Add memory

In [25]:
def ask_with_memory(vector_store, query, history=None):
    """
    Answer ``query`` with a ConversationalRetrievalChain, carrying chat history.

    Args:
        vector_store: Object exposing ``as_retriever``.
        query: The question to answer.
        history: List of ``(question, answer)`` tuples from earlier turns. When
            omitted, a fresh empty list is used for this call. A list passed in
            is appended to in place, as before.

    Returns:
        A ``(result, history)`` tuple: the chain's result (its "answer" key
        holds the reply) and the history including this turn.
    """
    from langchain.chains import ConversationalRetrievalChain  # Built on RetrievalQA, but adds history to the context
    from langchain.chat_models import ChatOpenAI

    # A mutable default ([]) is created once and shared by every call that omits
    # `history`, which leaked earlier conversations into unrelated ones.
    if history is None:
        history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    crc = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
    result = crc({"question": query, "chat_history": history})
    history.append((query, result["answer"]))

    return result, history


In [30]:
# Start a conversation with an empty history; ask_with_memory returns the
# result together with the history that now includes this turn.
history = []
question = "How many amendments are there in the constitution?"
result, history = ask_with_memory(vector_store, question, history)

print(result["answer"])
print(history)

There are currently 27 amendments in the Constitution of the United States.
[('How many amendments are there in the constitution?', 'There are currently 27 amendments in the Constitution of the United States.')]


In [31]:
# Follow-up that refers back to the previous answer ("that"), so the history
# from the previous turn is passed in again.
question = "Multiply that by 2"
result, history = ask_with_memory(vector_store, question, history)

print(result["answer"])
print(history)

The number of amendments in the Constitution is 27. If you multiply this number by 2, the result is 54.
[('How many amendments are there in the constitution?', 'There are currently 27 amendments in the Constitution of the United States.'), ('Multiply that by 2', 'The number of amendments in the Constitution is 27. If you multiply this number by 2, the result is 54.')]
