### Build ChatGPT to Answer Questions on Your Financial Data

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI 
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PagedPDFSplitter

In [2]:
import requests
import tqdm
from typing import List

# financial reports of amamzon, but can be replaced by any URLs of pdfs
urls = ['https://s2.q4cdn.com/299287126/files/doc_financials/Q1_2018_-_8-K_Press_Release_FILED.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/Q2_2018_Earnings_Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Q318-Amazon-Earnings-Press-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/AMAZON.COM-ANNOUNCES-FOURTH-QUARTER-SALES-UP-20-TO-$72.4-BILLION.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/Q119_Amazon_Earnings_Press_Release_FINAL.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q2-2019-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Q3-2019-Amazon-Financial-Results.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q4-2019-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/Q1/AMZN-Q1-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q2/Q2-2020-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q4/Amazon-Q4-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q1/Amazon-Q1-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q2/AMZN-Q2-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q3/Q3-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q4/business_and_financial_update.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q1/Q1-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q2/Q2-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q3/Q3-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q4/Q4-2022-Amazon-Earnings-Release.pdf'
        ]

def load_reports(urls: List[str]) -> List[str]:
    """ Load pages from a list of urls"""
    pages = []

    for url in tqdm.tqdm(urls):
        r = requests.get(url)
        path = url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(r.content)
        loader = PagedPDFSplitter(path)
        local_pages = loader.load_and_split()
        pages.extend(local_pages)
    return pages

pages = load_reports(urls)

100%|██████████| 19/19 [01:20<00:00,  4.24s/it]


In [3]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(pages)

embeddings = OpenAIEmbeddings()

username = "mbilalshahid" # replace with your username from app.activeloop.ai
db = DeepLake(dataset_path=f"hub://{username}/amazon_earnings", embedding_function=embeddings)
db.add_documents(texts)

Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


-

Dataset(path='hub://mbilalshahid/amazon_earnings', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
 embedding  embedding  (414, 1536)  float32   None   
    id        text      (414, 1)      str     None   
 metadata     json      (414, 1)      str     None   
   text       text      (414, 1)      str     None   


 

['91728210-5564-11ee-b60b-d83bbf21f498',
 '91728211-5564-11ee-98a7-d83bbf21f498',
 '91728212-5564-11ee-b082-d83bbf21f498',
 '91728213-5564-11ee-b8f9-d83bbf21f498',
 '91729bd3-5564-11ee-9aea-d83bbf21f498',
 '91729bd4-5564-11ee-8297-d83bbf21f498',
 '91729bd5-5564-11ee-9092-d83bbf21f498',
 '91729bd6-5564-11ee-bdca-d83bbf21f498',
 '91729bd7-5564-11ee-a0b8-d83bbf21f498',
 '9172a991-5564-11ee-9f19-d83bbf21f498',
 '9172a992-5564-11ee-99ea-d83bbf21f498',
 '9172a993-5564-11ee-bac4-d83bbf21f498',
 '9172a994-5564-11ee-b229-d83bbf21f498',
 '9172a995-5564-11ee-b51c-d83bbf21f498',
 '9172a996-5564-11ee-9369-d83bbf21f498',
 '9172a997-5564-11ee-bdf0-d83bbf21f498',
 '9172a998-5564-11ee-9e80-d83bbf21f498',
 '9172a999-5564-11ee-96ea-d83bbf21f498',
 '9172a99a-5564-11ee-a556-d83bbf21f498',
 '9172a99b-5564-11ee-b910-d83bbf21f498',
 '9172a99c-5564-11ee-ba1e-d83bbf21f498',
 '9172a99d-5564-11ee-8dfa-d83bbf21f498',
 '9172a99e-5564-11ee-a04c-d83bbf21f498',
 '9172a99f-5564-11ee-88e9-d83bbf21f498',
 '9172a9a0-5564-

In [7]:
model = ChatOpenAI(model='gpt-3.5-turbo')
retriever = db.as_retriever()

qa = RetrievalQA.from_chain_type(llm=model, chain_type='stuff', retriever=retriever)

In [8]:
qa.run("Combine total revenue in 2020?")

'To calculate the total revenue for 2020, you need to add up the net product sales and net service sales for each quarter. \n\nFrom the given information, the net product sales for each quarter in 2020 are as follows:\n\nQ1: $41,841 million\nQ2: $50,244 million\nQ3: $52,774 million\nQ4: $71,056 million\n\nThe net service sales for each quarter in 2020 are as follows:\n\nQ1: $33,611 million\nQ2: $38,668 million\nQ3: $43,371 million\nQ4: $54,499 million\n\nTo calculate the total net sales for 2020, you need to add up the net product sales and net service sales for each quarter:\n\nTotal net sales in 2020 = (Q1 net product sales + Q2 net product sales + Q3 net product sales + Q4 net product sales) + (Q1 net service sales + Q2 net service sales + Q3 net service sales + Q4 net service sales)\n\nTotal net sales in 2020 = ($41,841 + $50,244 + $52,774 + $71,056) + ($33,611 + $38,668 + $43,371 + $54,499)\n\nTotal net sales in 2020 = $215,915 + $170,149\n\nTotal net sales in 2020 = $386,064 mill

In [9]:
qa.run("What is the revenue in 2021 Q3?")

'The revenue in 2021 Q3 was $110.8 billion.'

In [10]:
qa.run("What is the revenue in 2023 Q1?")

'The revenue for the first quarter of 2023 is expected to be between $121.0 billion and $126.0 billion.'