In [1]:
from dotenv import load_dotenv

# Load variables from ../../.env (path is relative to the working directory).
# Left as the last expression so the cell displays load_dotenv's return value.
load_dotenv(dotenv_path='../../.env')


True

In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA

# Load Documents

In [7]:
import requests
import tqdm
from typing import List

# Amazon quarterly earnings releases (PDF). Any list of PDF URLs can be
# substituted here.
# NOTE(review): the list has 2020 Q1, Q2 and Q4 entries but none for 2020 Q3 —
# confirm that the omission is intended.
urls = [
    'https://s2.q4cdn.com/299287126/files/doc_financials/Q1_2018_-_8-K_Press_Release_FILED.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/Q2_2018_Earnings_Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/Q318-Amazon-Earnings-Press-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/AMAZON.COM-ANNOUNCES-FOURTH-QUARTER-SALES-UP-20-TO-$72.4-BILLION.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/Q119_Amazon_Earnings_Press_Release_FINAL.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q2-2019-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/Q3-2019-Amazon-Financial-Results.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q4-2019-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/Q1/AMZN-Q1-2020-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q2/Q2-2020-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q4/Amazon-Q4-2020-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q1/Amazon-Q1-2021-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q2/AMZN-Q2-2021-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q3/Q3-2021-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q4/business_and_financial_update.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q1/Q1-2022-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q2/Q2-2022-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q3/Q3-2022-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q4/Q4-2022-Amazon-Earnings-Release.pdf',
]

def load_reports(urls: List[str], timeout: float = 60.0) -> List:
    """Download each PDF in ``urls`` and return all of their pages as one list.

    Every file is saved in the current working directory under the last
    ``/``-separated segment of its URL (an existing file of that name is
    overwritten) and is then parsed with ``PyPDFLoader(path).load_and_split()``.

    Args:
        urls: URLs of the PDF files to fetch.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The objects produced by ``load_and_split()`` for every file,
        concatenated in the order of ``urls``. These are loader documents,
        not plain strings. An empty ``urls`` yields an empty list.

    Raises:
        requests.HTTPError: if any download answers with an error status.
        requests.Timeout: if a request exceeds ``timeout``.
    """
    # Nothing to fetch: skip the progress bar entirely.
    if not urls:
        return []

    pages = []
    for url in tqdm.tqdm(urls):
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of saving an error page to disk and handing it
        # to the PDF parser.
        response.raise_for_status()

        path = url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(response.content)

        pages.extend(PyPDFLoader(path).load_and_split())
    return pages

# Download and parse every report listed in `urls`.
pages = load_reports(urls=urls)



100%|██████████| 19/19 [02:04<00:00,  6.56s/it]


# Split Text, Embed, and Store in Deep Lake

In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Re-chunk the loaded pages: chunk_size=1000 with no overlap between chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(pages)

# Sentence-transformers MiniLM model, run on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# NOTE(review): this targets a fixed hub:// dataset path; re-running the cell
# presumably appends the same chunks again — confirm before re-executing.
db = DeepLake(
    dataset_path="hub://thapabibek1129/amazon_earnings_6",
    embedding_function=embeddings,
)
# Last expression: the cell displays whatever add_documents returns.
db.add_documents(texts)


Your Deep Lake dataset has been successfully created!


Creating 414 embeddings in 1 batches of size 414:: 100%|██████████| 1/1 [01:05<00:00, 65.10s/it]

Dataset(path='hub://thapabibek1129/amazon_earnings_6', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (414, 1)     str     None   
 metadata     json      (414, 1)     str     None   
 embedding  embedding  (414, 384)  float32   None   
    id        text      (414, 1)     str     None   





['4f35890b-ca5a-11ee-8b48-a434d9523559',
 '4f35890c-ca5a-11ee-b755-a434d9523559',
 '4f35890d-ca5a-11ee-8d23-a434d9523559',
 '4f35890e-ca5a-11ee-b186-a434d9523559',
 '4f35890f-ca5a-11ee-b025-a434d9523559',
 '4f358910-ca5a-11ee-afb8-a434d9523559',
 '4f358911-ca5a-11ee-8475-a434d9523559',
 '4f358912-ca5a-11ee-af45-a434d9523559',
 '4f358913-ca5a-11ee-bc6b-a434d9523559',
 '4f358914-ca5a-11ee-a093-a434d9523559',
 '4f358915-ca5a-11ee-b205-a434d9523559',
 '4f358916-ca5a-11ee-953c-a434d9523559',
 '4f358917-ca5a-11ee-8d93-a434d9523559',
 '4f358918-ca5a-11ee-be8f-a434d9523559',
 '4f358919-ca5a-11ee-a09d-a434d9523559',
 '4f35891a-ca5a-11ee-b9b7-a434d9523559',
 '4f35891b-ca5a-11ee-86a3-a434d9523559',
 '4f35891c-ca5a-11ee-b651-a434d9523559',
 '4f35891d-ca5a-11ee-955b-a434d9523559',
 '4f35891e-ca5a-11ee-9720-a434d9523559',
 '4f35891f-ca5a-11ee-8b17-a434d9523559',
 '4f358920-ca5a-11ee-8ae3-a434d9523559',
 '4f358921-ca5a-11ee-8619-a434d9523559',
 '4f358922-ca5a-11ee-9ca7-a434d9523559',
 '4f358923-ca5a-

# Create a Retrieval Chain

In [10]:
from langchain import HuggingFaceHub

# Mistral-7B-Instruct, accessed through the HuggingFaceHub wrapper, answers the questions.
# NOTE(review): both "max_length" and "max_new_tokens" are passed — confirm
# which of the two the endpoint actually honours.
llm = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-Instruct-v0.2',
    model_kwargs={
        'temperature': 0.5,
        "max_length": 64,
        "max_new_tokens": 512,
    },
)



In [11]:
# Build the question-answering chain from the LLM and a retriever over the
# Deep Lake store.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=db.as_retriever(),
)

In [13]:
# Ask for the combined full-year 2020 revenue.
# NOTE(review): the recorded answer quotes three different totals
# ($221,598M, $223,604M and $214,606M) — check it against the source report
# before relying on it.
qa("Combine total revenue in 2020?")

{'query': 'Combine total revenue in 2020?',
 'result': ' The total revenue for Amazon.com Inc. in 2020 was $221,598 million. This can be calculated by adding the net sales for the North America, International, and AWS segments for the full year 2020: $55,436 million + $83,935 million + $84,227 million = $223,604 million. However, the provided financial statements show a total net sales figure of $214,606 million for the year 2020. The discrepancy might be due to rounding or the exclusion of certain items in the given figures. Therefore, the closest approximation from the provided data is $214,606 million.'}

In [15]:
# Ask for a single quarter's revenue. Left as the cell's last expression (no
# print wrapper) so the notebook displays the result dict the same way the
# other query cells do.
# NOTE(review): the recorded answer cites the "North America Segment" table —
# confirm the figure is the company-wide total and not a segment number.
qa("What is the revenue in 2021 Q3?")

{'query': 'What is the revenue in 2021 Q3?', 'result': ' The revenue in 2021 Q3 was $82,360 million.\nExplanation: The revenue for 2021 Q3 can be found in the "North America Segment" table under the "Net sales" column for Q3 2021. The number is $82,360 million.'}


In [16]:
# Ask about a quarter later than any report in `urls` (the list ends at
# Q4 2022). The recorded answer is a forward-looking "expected" range, not a
# reported figure.
qa("What is the revenue in 2023 Q1?")

{'query': 'What is the revenue in 2023 Q1?',
 'result': ' Based on the information provided in the text, the revenue for Amazon.com in 2023 Q1 is expected to be between $121.0 billion and $126.0 billion.\n\nExplanation: The text states that net sales for Amazon.com in 2023 Q1 are expected to be between $121.0 billion and $126.0 billion. This range represents the expected revenue for the company during this quarter.'}