In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.llms import OpenAIChat
from langchain.document_loaders import PagedPDFSplitter

In [2]:
import os, sys
sys.path.insert(1, 'D:\Github\DeepLake-Langchain')
import credentials
os.environ["OPENAI_API_KEY"] = credentials.openai
os.environ['ACTIVELOOP_TOKEN'] = credentials.active_loop

In [3]:
import requests
import tqdm
from typing import List


In [4]:
# financial reports of amamzon, but can be replaced by any URLs of pdfs
urls = ['https://s2.q4cdn.com/299287126/files/doc_financials/Q1_2018_-_8-K_Press_Release_FILED.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/Q2_2018_Earnings_Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Q318-Amazon-Earnings-Press-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/AMAZON.COM-ANNOUNCES-FOURTH-QUARTER-SALES-UP-20-TO-$72.4-BILLION.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/Q119_Amazon_Earnings_Press_Release_FINAL.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q2-2019-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Q3-2019-Amazon-Financial-Results.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q4-2019-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/Q1/AMZN-Q1-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q2/Q2-2020-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q4/Amazon-Q4-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q1/Amazon-Q1-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q2/AMZN-Q2-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q3/Q3-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q4/business_and_financial_update.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q1/Q1-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q2/Q2-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q3/Q3-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q4/Q4-2022-Amazon-Earnings-Release.pdf'
        ]

In [5]:
def load_reports(urls:List)->List[str]:
    """Load pages from a list of urls"""
    pages = []
    
    for url in tqdm.tqdm(urls):
        r = requests.get(url)
        path = url.split("/")[-1]
        with open(path,"wb") as f:
            f.write(r.content)
        loader = PagedPDFSplitter(path)
        local_pages = loader.load_and_split()
        pages.extend(local_pages)
    return pages

pages = load_reports(urls)

100%|██████████| 19/19 [01:07<00:00,  3.55s/it]


In [8]:
text_splitted = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
texts = text_splitted.split_documents(pages)

embeddings = HuggingFaceEmbeddings()
dataset_path=f"hub://{credentials.active_loop_org_id}/amazon_finance"
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
db.add_documents(texts)

\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/megatron17/amazon_finance



/

hub://megatron17/amazon_finance loaded successfully.



-

Deep Lake Dataset in hub://megatron17/amazon_finance already exists, loading from the storage
Dataset(path='hub://megatron17/amazon_finance', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype    shape    dtype  compression
  -------   -------  -------  -------  ------- 
 embedding  generic   (0,)    float32   None   
    ids      text     (0,)      str     None   
 metadata    json     (0,)      str     None   
   text      text     (0,)      str     None   


Evaluating ingest: 100%|██████████| 1/1 [02:10<00:00
\

Dataset(path='hub://megatron17/amazon_finance', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape      dtype  compression
  -------   -------   -------    -------  ------- 
 embedding  generic  (414, 768)  float32   None   
    ids      text     (414, 1)     str     None   
 metadata    json     (414, 1)     str     None   
   text      text     (414, 1)     str     None   


 

['c97221dd-23c1-11ee-8a35-00d861dd19c7',
 'c97221de-23c1-11ee-b60a-00d861dd19c7',
 'c97221df-23c1-11ee-9385-00d861dd19c7',
 'c97221e0-23c1-11ee-8cdb-00d861dd19c7',
 'c97221e1-23c1-11ee-803e-00d861dd19c7',
 'c97221e2-23c1-11ee-b20c-00d861dd19c7',
 'c97221e3-23c1-11ee-ab2f-00d861dd19c7',
 'c97221e4-23c1-11ee-b152-00d861dd19c7',
 'c97221e5-23c1-11ee-adff-00d861dd19c7',
 'c97221e6-23c1-11ee-bac4-00d861dd19c7',
 'c97221e7-23c1-11ee-a339-00d861dd19c7',
 'c97221e8-23c1-11ee-9665-00d861dd19c7',
 'c97221e9-23c1-11ee-bf14-00d861dd19c7',
 'c97221ea-23c1-11ee-806c-00d861dd19c7',
 'c97221eb-23c1-11ee-b564-00d861dd19c7',
 'c97221ec-23c1-11ee-929f-00d861dd19c7',
 'c97221ed-23c1-11ee-913a-00d861dd19c7',
 'c97221ee-23c1-11ee-9efa-00d861dd19c7',
 'c97221ef-23c1-11ee-bd18-00d861dd19c7',
 'c97221f0-23c1-11ee-8d42-00d861dd19c7',
 'c97221f1-23c1-11ee-af47-00d861dd19c7',
 'c97221f2-23c1-11ee-a760-00d861dd19c7',
 'c97221f3-23c1-11ee-910d-00d861dd19c7',
 'c97221f4-23c1-11ee-9758-00d861dd19c7',
 'c97221f5-23c1-

In [9]:
qa = RetrievalQA.from_chain_type(llm=OpenAIChat(model='gpt-3.5-turbo'), chain_type='stuff', retriever=db.as_retriever())



In [14]:
qa.run("Combine total revenue in 2020?")

'The total revenue in 2020 for Amazon.com, Inc. was $332,410 million.'

In [13]:
qa.run("What is the revenue in 2021 Q3?")

"Sorry, I don't have access to the specific revenue amount for 2021 Q3."

In [12]:
qa.run("What is the revenue in 2023 Q1?")

'There is no information provided about the revenue in 2023 Q1.'