In [11]:
#materials
#https://python.langchain.com/docs/modules/data_connection/document_loaders

## PROJECT: Question-Answering on Private Documents

In [1]:
import os 
from dotenv import load_dotenv, find_dotenv
# Populate os.environ from the .env file that find_dotenv() locates; the Pinecone helpers
# further down read PINECONE_API_KEY and PINECONE_ENV from os.environ.
# NOTE(review): override=True presumably lets .env values replace variables that are
# already set in the process — confirm against the python-dotenv documentation.
load_dotenv(find_dotenv(), override = True)

True

In [2]:
#pip install pypdf

In [3]:
#pip install Docx2txt -q

## PDF Loader only

In [2]:
### To keep the code modular, start by defining a function
## Define a function that only loads PDFs
def load_document_pdf(file):
    """Load a PDF with PyPDFLoader and return what the loader's load() produces.

    Args:
        file: Location of the PDF; handed unchanged to PyPDFLoader.

    Returns:
        The result of PyPDFLoader(file).load(). The notebook indexes it by
        position and reads .page_content / .metadata on each item.
    """
    # Imported inside the function, as in the rest of this notebook's helpers.
    from langchain.document_loaders import PyPDFLoader

    print(f'Loading {file}')  # progress message for the user
    return PyPDFLoader(file).load()

In [3]:
## Try the PDF loader on the local file files/us_constitution.pdf
data = load_document_pdf('files/us_constitution.pdf')
# The result can be indexed by position; print the text of the item at index 10
print(data[10].page_content)

Loading files/us_constitution.pdf
C O N S T I T U T I O N O F T H E U N I T E D S T A T E S  DELAWARE  
Geo:  Read 
Gunning Bedford  jun 
John Dickinson 
Richard Bassett  
Jaco: Broom  
MARYLAND  
James  McHenry  
Dan of St. Thos.  Jenifer 
Danl Carroll  
VIRGINIA  
John Blair - 
James  Madison  Jr. 
NORTH  CAROLINA  
Wm.  Blount  
Richd.  Dobbs  Spaight 
Hu Williamson  
SOUTH  CAROLINA  
J. Rutledge  
Charles  Cotesworth  Pinckney 
Charles Pinckney  
Pierce Butler  
GEORGIA  
William  Few 
Abr Baldwin  
 
Attest  William  Jackson Secretary  In Convention Monday 
September 17th, 1787. 
Present  
The States  of 
New  Hampshire,  Massachusetts,  Connecticut,  Mr. Ham - 
ilton from  New York, New Jersey, Pennsylvania, Delaware, 
Maryland, Virginia, North Carolina, South Carolina and Georgia.
 
Resolved,  
That the preceeding Constitution be laid before the United 
States  in Congress  assembled,  and that it is the Opinion 
of this Convention, that it should afterwards be submitted  
to a

In [5]:
## Show the metadata of one loaded item; index 1 is the second item because indexing starts at 0
print(data[1].metadata)

{'source': 'files/us_constitution.pdf', 'page': 1}


## Other Transform loaders

In [50]:
## Define a function that loads both PDF and Word (.docx) documents; more extensions can be added, e.g. csv, html
def load_document(file):
    """Load a document with the loader that matches its file extension.

    Supported extensions are .pdf (PyPDFLoader) and .docx (Docx2txtLoader).
    The extension is compared case-insensitively, so 'REPORT.PDF' is handled
    exactly like 'report.pdf'.

    Args:
        file: Path or URL of the document; passed unchanged to the loader.

    Returns:
        The value returned by the selected loader's load(), or None when the
        extension is not supported (a message is printed in that case).
    """
    import os

    # Lower-case the extension so upper/mixed-case names are not rejected.
    extension = os.path.splitext(file)[1].lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader as loader_cls
    else:
        print('Document format is not supported!')
        return None

    print(f'Loading {file}')  # progress message for the user
    return loader_cls(file).load()

In [7]:
# load_document is also given a URL here: a remote PDF address is passed instead of a local path
remotdata = load_document('https://www.uscis.gov/sites/default/files/document/guides/M-654.pdf')

Loading https://www.uscis.gov/sites/default/files/document/guides/M-654.pdf


In [8]:
# Print the text of the item at index 10 of the remotely loaded PDF
print(remotdata[10].page_content)

 Journal, and proceed to reconsider it. If after such 
Reconsideration two thirds of that house shall 
agree to pass the Bill, it shall be sent, together with the Objections, to the other house, by which it 
shall likewise be reconsidered, and if approved by two thirds of that house, it shall become a Law. But 
in all such Cases the Votes of both houses shall be 
determined by yeas and Nays, and the Names of 
the Persons voting for and against the Bill shall be entered on the Journal of each house respectively . 
If any Bill shall not be returned by the President 
within ten Days (Sundays excepted) after it shall 
have been presented to him, the Same shall be a 
Law, in like Manner as if he had signed it, unless 
the Congress by their Adjournment prevent its 
Return, in which Case it shall not be a Law.
every Order, Resolution, or Vote to which 
the Concurrence of the Senate and house of 
Representatives may be necessary (except on a 
question of Adjournment) shall be presented to 

In [9]:
# How many items (pages) were loaded, and how many characters the item at index 13 holds
print(f'You have {len(data)} pages in the data')
print(f'There are {len(data[13].page_content)} character in pg 13')

You have 19 pages in the data
There are 3509 character in pg 13


## Public Loaders

In [6]:
#pip install wikipedia -q

In [10]:
## A function that loads Wikipedia articles;
# load_max_docs limits the number of documents to load
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia documents for a query through WikipediaLoader.

    Args:
        query: Search text forwarded as the loader's `query`.
        lang: Language code forwarded as the loader's `lang` (default 'en').
        load_max_docs: Forwarded as the loader's `load_max_docs` (default 2).

    Returns:
        Whatever WikipediaLoader(...).load() returns.
    """
    from langchain.document_loaders import WikipediaLoader

    wiki_loader = WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    )
    return wiki_loader.load()

In [11]:
# Load documents for the query 'GPT-4' (default lang and load_max_docs) and print the first one.
# Note: this rebinds `data`, which earlier cells had bound to the loaded PDF.
data = load_from_wikipedia('GPT-4')
print(data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was initially released on March 14, 2023, and has been made publicly available via the paid chatbot product ChatGPT Plus, and via OpenAI's API.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4 is also capable of taking images as input on ChatGPT. OpenAI has declined to reveal various technical details and statistics about GPT-4, such as the precise siz

In [10]:
#data = load_from_wikipedia('GPT-4', 'de') # german language code as second argument
#print(data[0].page_content)

## Chunking or Splitting Document

In [51]:
## A function that splits documents into chunks, to regularise the texts returned by our AI model
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split; passed to
            RecursiveCharacterTextSplitter.split_documents.
        chunk_size: Forwarded as the splitter's chunk_size (default 256).
        chunk_overlap: Forwarded as the splitter's chunk_overlap. Defaults to
            0, the value this function used before the parameter existed, so
            existing calls behave exactly as before.

    Returns:
        Whatever split_documents returns for `data`.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [52]:
# Reload the PDF with the generic load_document function so chunk_data can be tested on it
# (this rebinds `data`, which the Wikipedia cell had pointed at other documents)

data = load_document('files/us_constitution.pdf')
print(f'You have {len(data)} pages in the data')
print(f'There are {len(data[13].page_content)} character in pg 13')

Loading files/us_constitution.pdf
You have 19 pages in the data
There are 3509 character in pg 13


In [14]:
# Split the loaded document with chunk_data (default chunk_size) and inspect the result:
# the number of chunks, then the text of the chunk at index 10.
# `chunks` is a module-level name that insert_or_fetch_embeddings reads later.
chunks = chunk_data(data)
print(f'You have {len(chunks)} chunks')
print(chunks[10].page_content)

You have 247 chunks
Jersey four, Pennsylvania eight, Delaw are one, Maryland  
six, Virginia ten, North Carolina five, South Carolina five, 
and Georgia three.  
When vacancies happen in the Representation from any


In [15]:
# Function that calculates the embedding cost beforehand
def print_embedding_cost(texts, price_per_1k_tokens=0.0004):
    """Print the token count of `texts` and the estimated embedding cost.

    Tokens are counted with the tiktoken encoding for 'text-embedding-ada-002'.

    Args:
        texts: Iterable of objects exposing a `page_content` string.
        price_per_1k_tokens: Price in USD charged per 1000 tokens. Defaults to
            0.0004, the value that used to be hard-coded here; pass the current
            price if it differs.

    Returns:
        None. The two result lines are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    # Generator expression: no need to build an intermediate list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000*price_per_1k_tokens:.6f}')

In [16]:
# Estimate the embedding cost of the chunks before sending anything to the embedding API
print_embedding_cost(chunks)

Total Tokens: 12912
Embedding Cost in USD: 0.005165


In [30]:
# Embed the document and upload the vectors to a database such as Pinecone

### Embedding and Uploading to a Vector Databases (Pinecone)

In [17]:
#A function that creates embedding and uploads vectors into pinecone db
def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for `index_name`, creating it if needed.

    If the index already exists it is opened with Pinecone.from_existing_index.
    Otherwise a new index is created and filled from `chunks` via
    Pinecone.from_documents.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the module-level `chunks` variable is used, which is what
            this function always did before the parameter existed. Passing the
            documents explicitly avoids depending on that hidden state.

    Returns:
        The vector store produced by from_existing_index / from_documents.

    Raises:
        ValueError: If a new index is required but no chunks are available.
            This is raised before the index is created, so no empty index is
            left behind.
    """
    import pinecone  # Pinecone client library
    from langchain.vectorstores import Pinecone  # Pinecone vector-store class in LangChain
    from langchain.embeddings.openai import OpenAIEmbeddings  # embeds text into vectors

    embeddings = OpenAIEmbeddings()

    # Credentials come from the environment (`os` is imported at the top of the notebook).
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
        return vector_store

    if chunks is None:
        # Legacy behaviour: fall back to the notebook-level `chunks` variable.
        chunks = globals().get('chunks')
    if chunks is None:
        raise ValueError(
            f'Cannot create index {index_name}: no chunks were passed and no '
            'module-level `chunks` variable is defined'
        )

    print(f'Creating index {index_name} and embeddings ...', end='')
    # NOTE(review): dimension=1536 presumably matches the OpenAI embedding size — confirm.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Ok')
    return vector_store

In [18]:
## Function that deletes indexes; as a single account user, I cannot use more than 1 index at once

def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when index_name is 'all'.

    Args:
        index_name: Name of the index to delete. The default 'all' deletes
            every index reported by pinecone.list_indexes().

    Returns:
        None. Progress messages are printed.
    """
    import pinecone

    # Credentials come from the environment (`os` is imported at the top of the notebook).
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    # Single-index case first, so the bulk deletion reads as the main path below.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end=' ')
        pinecone.delete_index(index_name)
        print('Ok')
        return

    existing = pinecone.list_indexes()
    print('Deleting all indexes...')
    for name in existing:
        pinecone.delete_index(name)
    print('Ok')
        

In [19]:
## Start clean: called without an argument, the default index_name='all' applies, so every existing index is deleted
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes...
Ok


In [53]:
# Create the 'askadocument' index, or reuse it if it already exists.
# Note: when a new index has to be built, insert_or_fetch_embeddings reads the
# module-level `chunks`, so the chunking cell must have been run first.
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index askadocument and embeddings ...Ok


### Asking and getting Answers

In [27]:
## function that queries the vectors to get answers from loaded document
def ask_and_get_answers(vector_store, q):
    """Answer a question with a RetrievalQA chain over `vector_store`.

    Args:
        vector_store: Object providing as_retriever(); used as the retrieval
            backend of the chain.
        q: The question text passed to the chain's run().

    Returns:
        The value returned by the chain's run(q).
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    # Keyword arguments are evaluated left to right: the LLM is built first,
    # then the retriever (similarity search, k=3).
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3}),
    )
    return qa_chain.run(q)

In [28]:
# Ask a single question against the vector store and print the chain's answer
q = 'What is the whole document about?'
answer = ask_and_get_answers(vector_store, q)
print(answer)

The document mentioned is the United States Constitution, which establishes the framework of the government and the rights and freedoms of its citizens. It consists of a preamble and several articles that outline the powers and responsibilities of the three branches of government (legislative, executive, and judicial) and the relationship between the federal government and the states. The document also includes the Bill of Rights, which is a set of amendments that guarantee individual liberties and limit the power of the government.


In [32]:
# Interactive loop: keep answering questions about the document until the user types quit/exit
import time

i = 1
print('Write Quit or Exit to quit')
while True:
    # Strip surrounding whitespace so ' quit ' still exits and blank input is detectable
    q = input(f'Question #{i}: ').strip()
    if not q:
        # Nothing typed: prompt again without querying the chain or advancing the counter
        continue
    i = i+1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    answer = ask_and_get_answers(vector_store, q)
    print(f'\nAnswer: {answer}') ## customised for readability
    print(f'\n{"-" * 50} \n')

Write Quit or Exit to quit


Question #1:  What is the first Amendment of the US Constitution?



Answer: The first Amendment of the US Constitution states that Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof; or abridging the freedom of speech, or of the press; or the right of the people peaceably to assemble, and to petition the Government for a redress of grievances.

-------------------------------------------------- 



Question #2:  Explain the concept of 'Federalism' as it is presented in the US Constitution



Answer: Federalism is the concept embodied in the US Constitution that establishes a division of powers between the national government and the state governments. It is the principle that there are certain powers and responsibilities that are specifically granted to the national government, while others are reserved for the individual states. This division of powers aims to strike a balance between a strong central government and the autonomy of individual states. The US Constitution establishes the federal government as a separate and distinct entity, with its own powers and authorities, while also recognizing the importance of state governments and their ability to handle local issues.

-------------------------------------------------- 



Question #3:  Describe the Bill of Rights



Answer: The Bill of Rights refers to the first ten amendments to the Constitution of the United States. These amendments were ratified on December 15, 1791. The purpose of the Bill of Rights is to establish and protect individual rights and freedoms. It includes provisions such as freedom of speech, religion, and the press (First Amendment), the right to bear arms (Second Amendment), protection against unreasonable searches and seizures (Fourth Amendment), the right to a fair trial (Sixth Amendment), and protection against cruel and unusual punishment (Eighth Amendment), among others. The Bill of Rights is a crucial component of the Constitution that ensures the protection of individual liberties.

-------------------------------------------------- 



Question #4:  Q1: how does the constitution address the issue of Presidential succession? Q2: Describe the bill of rights. Answer both questions



Answer: Q1: The Constitution addresses the issue of Presidential succession in Article II, Section 1. It establishes that in the event of the President's death, resignation, or removal from office, the Vice President will assume the role of President.

Q2: The Bill of Rights is a collection of the first ten amendments to the United States Constitution. It was added to the Constitution to protect individual rights and limit the power of the federal government. The Bill of Rights guarantees essential freedoms such as freedom of speech, religion, and the press, the right to bear arms, and the right to a fair trial. It also protects against unreasonable searches and seizures, cruel and unusual punishment, and excessive bail or fines.

-------------------------------------------------- 



Question #5:  quit


Quitting ... bye bye!


In [49]:
# Called without an argument, so the default index_name='all' deletes every index
delete_pinecone_index() ## doing this so I can create another index to pick from another document source

Deleting all indexes...
Ok


In [39]:
## Testing another document source: Wikipedia
# Loads the Romanian ('ro') article; my native language's short code doesn't exist yet.
# This cell must run before the next question: every index was deleted above, so
# `chunks` and `vector_store` have to be rebuilt from the new documents.
data = load_from_wikipedia('ChatGPT', 'ro')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index chatgpt and embeddings ...Ok


In [40]:
# Ask a question in Romanian ("What is ChatGPT?").
# NOTE(review): this expects `vector_store` to have been rebuilt from the Romanian
# Wikipedia documents (index 'chatgpt'); otherwise it still refers to the previous index.
q = 'Ce este Chatgpt?'
answer = ask_and_get_answers(vector_store, q)
print(answer)

ChatGPT este un membru al familiei de modele de limbaj generative pre-antrenate (GPT). A fost reglat fin (o abordare a transferului de învățare) peste o versiune îmbunătățită a GPT-3 a OpenAI, cunoscută sub numele de „GPT-3.5”. Este un sistem care poate genera texte și poate fi folosit pentru a desfășura conversații cu utilizatori într-o varietate de domenii. Este utilizat în diverse scopuri, inclusiv pentru asistență și educație.


## Add Memory to App

In [64]:
def ask_with_memory(vector_store, question, chat_history=None):
    """Ask a question through a ConversationalRetrievalChain, tracking history.

    Args:
        vector_store: Object providing as_retriever(); used as the retrieval
            backend of the chain.
        question: The question text.
        chat_history: List of (question, answer) tuples from earlier turns.
            When omitted, a fresh empty list is created for this call. The old
            `[]` default was a single list shared by every call, so history
            leaked between unrelated conversations. A list passed in by the
            caller is appended to in place, as before.

    Returns:
        A tuple (result, chat_history): the chain's result mapping (its
        'answer' entry is what gets recorded) and the updated history list.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question':question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history

In [65]:
## Asking with memory: start from an empty history and keep the list that comes back
chat_history = []
question = 'How many amendments are in U.S. Constitution?'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
# `result` is indexed by 'answer'; the history now holds this (question, answer) pair
print(result['answer'])
print(chat_history)

There are 27 amendments in the U.S. Constitution.
[('How many amendments are in U.S. Constitution?', 'There are 27 amendments in the U.S. Constitution.')]


In [66]:
# Follow-up question that only makes sense with the previous turn; the history
# returned by the last call is passed back in
question = 'Multiply that number by 2?'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

I'm sorry, but I don't have enough information to answer your question. Could you please provide more context or specify which number you are referring to?
[('How many amendments are in U.S. Constitution?', 'There are 27 amendments in the U.S. Constitution.'), ('Multiply that number by 2?', "I'm sorry, but I don't have enough information to answer your question. Could you please provide more context or specify which number you are referring to?")]
