In [1]:
from dotenv import load_dotenv, find_dotenv

# Populate os.environ from the .env file located by find_dotenv(); later cells read
# PINECONE_API_KEY and PINECONE_ENV from os.environ.
load_dotenv(find_dotenv(), override=True)

True

#### Function that loads a document

In [3]:
def load_document(file):
    """Read the PDF at ``file`` with PyPDFLoader and return the result of its ``load()`` call."""
    # Function-scope import, matching how the other helpers in this notebook import langchain.
    from langchain.document_loaders import PyPDFLoader

    print(f'Loading {file}...')
    return PyPDFLoader(file).load()

#### Load data from a PDF file

In [6]:
pip install pypdf

Collecting pypdf
  Obtaining dependency information for pypdf from https://files.pythonhosted.org/packages/74/a9/5ccde1312650dd03e65350224fea85d9a430c182a01f056599cbb76f7390/pypdf-3.17.0-py3-none-any.whl.metadata
  Using cached pypdf-3.17.0-py3-none-any.whl.metadata (7.5 kB)
Using cached pypdf-3.17.0-py3-none-any.whl (277 kB)
Installing collected packages: pypdf
Successfully installed pypdf-3.17.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Load the PDF and print the text of the first returned document (page index 0).
data = load_document('./files/us_constitution.pdf')
print(data[0].page_content)

Loading ./files/us_constitution.pdf...
The
United
States
Constitution
W e
the
People
of
the
United
States,
in
Order
to
form
a
more
perfect
Union,
establish
Justice,
insure
domestic
T ranquility ,
provide
for
the
common
defence,
promote
the
general
W elfare,
and
secure
the
Blessings
of
Liberty
to
ourselves
and
our
Posterity ,
do
ordain
and
establish
this
Constitution
for
the
United
States
of
America.
The
Constitutional
Con v ention
Article
I
Section
1:
Congress
All
legislative
Powers
herein
granted
shall
be
vested
in
a
Congress
of
the
United
States,
which
shall
consist
of
a
Senate
and
House
of
Representatives.
Section
2:
The
House
of
Representatives


In [8]:
print(data[0].metadata)

{'source': './files/us_constitution.pdf', 'page': 0}


In [9]:
print(f'You have {len(data)} pages in your pdf')

You have 41 pages in your pdf


#### Extend `load_document` to also load .docx files

In [13]:
pip install docx2txt

Collecting docx2txt
  Using cached docx2txt-0.8-py3-none-any.whl
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:
def load_document(file):
    """Load a document from disk, choosing the loader from the file extension.

    Supported extensions are ``.pdf`` (PyPDFLoader) and ``.docx`` (Docx2txtLoader).
    Matching is case-insensitive, so ``report.PDF`` is handled like ``report.pdf``.

    Args:
        file: Path to the document.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None`` when the
        extension is not supported (a message is printed in that case).
    """
    import os

    # Normalise case so '.PDF' / '.Docx' are not rejected as unsupported.
    extension = os.path.splitext(file)[1].lower()
    print(f'Loading {file}...')
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader = Docx2txtLoader(file)
    else:
        print('Document format is not supported')
        return None

    return loader.load()

In [16]:
# Exercise the .docx branch of load_document and print the first document's metadata.
data = load_document('files/the_great_gatsby.docx')
print(data[0].metadata)

Loading files/the_great_gatsby.docx...
{'source': 'files/the_great_gatsby.docx'}


In [15]:
print(len(data))

1


### Loading from Wikipedia

In [18]:
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia content for ``query`` through WikipediaLoader.

    ``lang`` and ``load_max_docs`` are forwarded unchanged to the loader; the
    return value is the result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    loader_options = {'query': query, 'lang': lang, 'load_max_docs': load_max_docs}
    return WikipediaLoader(**loader_options).load()

In [20]:
pip install wikipedia

Collecting wikipedia
  Using cached wikipedia-1.4.0-py3-none-any.whl
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [65]:
data = load_from_wikipedia('GPT-4')

In [66]:
print(data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was initially released on March 14, 2023, and has been made publicly available via the paid chatbot product ChatGPT Plus, and via OpenAI's API.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4 is also capable of taking images as input on ChatGPT. OpenAI has declined to reveal various technical details and statistics about GPT-4, such as the precise siz

In [67]:
# Same query as the two preceding cells, run as a single cell. `data` now holds the
# Wikipedia documents, replacing the .docx data loaded earlier.
data = load_from_wikipedia('GPT-4')
print(data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its series of GPT foundation models. It was initially released on March 14, 2023, and has been made publicly available via the paid chatbot product ChatGPT Plus, and via OpenAI's API.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4 is also capable of taking images as input on ChatGPT. OpenAI has declined to reveal various technical details and statistics about GPT-4, such as the precise siz

#### Chunking

In [28]:
def chunk_data(data, chunk_size=256):
    """Split ``data`` with RecursiveCharacterTextSplitter, using ``chunk_size`` and zero chunk overlap."""
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    return RecursiveCharacterTextSplitter(
        chunk_overlap=0,
        chunk_size=chunk_size,
    ).split_documents(data)

In [68]:
# Split the currently loaded `data` using chunk_data's default chunk_size and report the chunk count.
chunks = chunk_data(data)
print(len(chunks))

39


#### Calculating embedding cost

In [30]:
pip install tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [57]:
def print_embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to select the tiktoken encoding.
        usd_per_1k_tokens: Price in USD per 1000 tokens. The default keeps the
            rate this notebook was written with.
            NOTE(review): confirm against current OpenAI pricing before relying on it.

    Returns:
        None. The two summary lines are printed.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a throwaway list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

In [69]:
print_embedding_cost(chunks)

Total tokens: 1733
Embedding Cost in USD: 0.000693


### Embedding and uploading to Pinecone

In [59]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

# Notebook-level embeddings object; insert_or_fetch_index reads this global when it
# connects to or populates a Pinecone index.
embeddings = OpenAIEmbeddings()

  from tqdm.autonotebook import tqdm


#### Insert or fetch index

In [77]:
import os
def insert_or_fetch_index(index_name, documents=None):
    """Return a vector store for ``index_name``, creating and populating the index if needed.

    Args:
        index_name: Name of the Pinecone index to connect to or create.
        documents: Documents to embed and upload when the index has to be
            created. When omitted, the notebook-level ``chunks`` variable is
            used, which is what this function always did before the parameter
            existed.

    Returns:
        The ``Pinecone`` vector store, from ``from_existing_index`` when the
        index already exists and from ``from_documents`` otherwise.

    Raises:
        NameError: If the index must be created, ``documents`` is omitted, and
            no global ``chunks`` has been defined yet.
    """
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )
    if index_name in pinecone.list_indexes():
        print('Index already exists. Fetching...')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
    else:
        # Resolve the documents before creating anything, so a missing `chunks`
        # fails here instead of leaving an empty index behind.
        docs = chunks if documents is None else documents
        print(f'Creating index: {index_name}')
        # NOTE(review): 1536 presumably matches the vector size produced by
        # `embeddings`; confirm if the embedding model changes.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(docs, embeddings, index_name=index_name)
    print('Done')
    return vector_store

#### Delete an index

In [62]:
def delete_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'`` (the default)."""
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )
    # Both cases reduce to "delete each name in a list"; only the list differs.
    targets = pinecone.list_indexes() if index_name == 'all' else [index_name]
    for target in targets:
        print(f'Deleting {target}...')
        pinecone.delete_index(target)
        print('Deleted')

In [79]:
# WARNING: called without arguments, delete_index uses index_name='all' and deletes every
# index returned by pinecone.list_indexes().
delete_index()

Deleting gpt4...
Deleted


In [80]:
# Connect to the 'gpt4' index if it exists; otherwise create it and upload the
# notebook-level `chunks` to it.
index_name = 'gpt4'
vector_store = insert_or_fetch_index(index_name)

Creating index: gpt4
Done


#### Ask and get an answer

In [86]:
def ask_and_get_answer(vector_store, question):
    """Answer ``question`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``; it is asked for a
            similarity retriever with ``k=3``.
        question: The question text passed to the chain.

    Returns:
        The value returned by ``chain.run(question)``.
    """
    from langchain.chat_models import ChatOpenAI
    from langchain.chains import RetrievalQA

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    # The keyword was previously misspelled 'seatch_type', so the intended
    # search type was never passed under the name as_retriever expects.
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

    return chain.run(question)

In [87]:
answer = ask_and_get_answer(vector_store, "What this document is about?")

In [88]:
print(answer)

The document provides information about background, training, and capabilities.


#### Asking in a loop

In [91]:
import time

# Interactive loop: keep answering questions until the user types quit/exit (any letter case).
print('Write Quit or Exit to end the conversation')
i = 1
while True:
    question = input(f'Questions #{i}:')
    i += 1
    if question.lower() not in ('quit', 'exit'):
        answer = ask_and_get_answer(vector_store, question)
        print(answer)
        print('-' * 50)
        continue
    print('Goodbye')
    time.sleep(2)
    break

Write Quit or Exit to end the conversation


Questions #1: Summarize the document in under 20 words.


GPT-4 is an AI system that assists in coding tasks and has potential for error.
--------------------------------------------------


Questions #2: Who created GPT-4?


OpenAI created GPT-4.
--------------------------------------------------


Questions #3: What is the source of this document?


I'm sorry, but I don't have access to that information.
--------------------------------------------------


Questions #4: quit


Goodbye


#### Ask with memory

In [96]:
def ask_with_memory(vector_store, question, chat_history=None):
    """Ask ``question`` through a ConversationalRetrievalChain, carrying chat history.

    Args:
        vector_store: Object exposing ``as_retriever``; it is asked for a
            similarity retriever with ``k=3``.
        question: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            A list passed in is appended to in place and returned. When
            omitted, a fresh list is created for this call.

    Returns:
        Tuple ``(result, chat_history)``: the chain's result mapping (its
        ``'answer'`` entry is what gets recorded) and the updated history.
    """
    from langchain.chat_models import ChatOpenAI
    from langchain.chains import ConversationalRetrievalChain

    # A `[]` default would be created once and shared by every call that omits
    # the argument, leaking history between unrelated conversations.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history

In [97]:
# Start a conversation with an empty history; ask_with_memory returns the updated
# history, which the following cells pass back in.
chat_history = []
q1 = 'How many lines are in the document?'
result, chat_history = ask_with_memory(vector_store, q1, chat_history)
print(f'answer: {result["answer"]}')

answer: The given context does not provide any information about the number of lines in the document.


In [98]:
# Second turn: passes the chat_history returned by the previous cell.
q1 = 'Which year did they publish the document?'
result, chat_history = ask_with_memory(vector_store, q1, chat_history)
print(f'answer: {result["answer"]}')

answer: The document titled "Improving Language Understanding by Generative Pre-Training" was published in 2018.


In [99]:
# Third turn: the question refers to "that", and the accumulated chat_history
# (containing the previous question and answer) is passed along with it.
q1 = 'Add two more years to that?'
result, chat_history = ask_with_memory(vector_store, q1, chat_history)
print(f'answer: {result["answer"]}')

answer: Yes, if we add two years to the publication year of "Improving Language Understanding by Generative Pre-Training," it would be 2020.
