# Workflow

In [None]:
"""
The workflow to explore Amazon's revenue growth using LangChain and Deep Lake involves:

1. Install the required libraries and set the API tokens for Google Generative AI and Activeloop.
2. Download Amazon's financial reports, and split them into smaller chunks using LangChain's Text Splitter utility.
3. Create a Deep Lake vector store with Google embeddings, add the text chunks, and use ChatGoogleGenerativeAI (Gemini) as the language model.
4. Finally, query the model with questions to obtain relevant insights.
"""

# Importing the necessary libraries

In [1]:
from langchain.embeddings import GooglePalmEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PagedPDFSplitter
from langchain_google_genai import ChatGoogleGenerativeAI



# Downloading Amazon's financial reports

In [7]:
import requests
from typing import List

def load_reports(urls: List[str], timeout: float = 30.0) -> list:
    """Download every PDF in ``urls`` and return all of their pages as one list.

    Each file is written to the current working directory under the last
    path segment of its URL and then parsed with ``PagedPDFSplitter``.

    Args:
        urls: Direct links to the PDF reports.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The ``load_and_split()`` results of every report, concatenated in the
        order of ``urls``. An empty list when ``urls`` is empty.

    Raises:
        requests.HTTPError: If a download answers with a 4xx/5xx status.
        requests.RequestException: For timeouts and other request failures.
    """
    pages = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # Fail fast: without this check an HTTP error page would be saved
        # under a .pdf name and handed to the PDF parser.
        response.raise_for_status()

        # Local file name is the last path segment of the URL.
        path = url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(response.content)

        loader = PagedPDFSplitter(file_path=path)
        pages.extend(loader.load_and_split())

    return pages

    

# Links to Amazon's quarterly earnings PDFs, grouped by the year that appears
# in each path or file name.
# NOTE(review): no Q3 2020 release is listed below - confirm whether that
# omission is intentional.
urls = [
    # 2019
    'https://s2.q4cdn.com/299287126/files/doc_financials/Q119_Amazon_Earnings_Press_Release_FINAL.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q2-2019-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/Q3-2019-Amazon-Financial-Results.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_news/archive/Amazon-Q4-2019-Earnings-Release.pdf',
    # 2020
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/Q1/AMZN-Q1-2020-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q2/Q2-2020-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q4/Amazon-Q4-2020-Earnings-Release.pdf',
    # 2021
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q1/Amazon-Q1-2021-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q2/AMZN-Q2-2021-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q3/Q3-2021-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q4/business_and_financial_update.pdf',
    # 2022
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q1/Q1-2022-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q2/Q2-2022-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q3/Q3-2022-Amazon-Earnings-Release.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q4/Q4-2022-Amazon-Earnings-Release.pdf',
]

# Download and parse every report; `pages` feeds the splitting step below.
pages = load_reports(urls)

# Splitting the documents and storing the embeddings

In [26]:
def split_docs(pages, chunk_size=1000, chunk_overlap=0):
    """Split the loaded pages into smaller chunks with ``CharacterTextSplitter``.

    Args:
        pages: Documents to split (here, the output of ``load_reports``).
        chunk_size: Value forwarded to the splitter's ``chunk_size``;
            defaults to the previously hard-coded 1000.
        chunk_overlap: Value forwarded to the splitter's ``chunk_overlap``;
            defaults to the previously hard-coded 0.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for ``pages``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)

def create_database(dataset_path, overwrite=False):
    """Open (or create) the Deep Lake vector store at ``dataset_path``.

    Args:
        dataset_path: Location of the Deep Lake dataset.
        overwrite: Forwarded to ``DeepLake``. Leave ``False`` (the default) to
            keep the previous behaviour of reusing a dataset that already
            exists; pass ``True`` to start from an empty dataset, e.g. when
            re-running the ingestion cell.
            NOTE(review): presumably re-adding the same documents to a reused
            dataset stores them again - confirm against the Deep Lake docs.

    Returns:
        A ``DeepLake`` vector store that embeds with ``GooglePalmEmbeddings``.
    """
    embedding = GooglePalmEmbeddings()
    db = DeepLake(dataset_path=dataset_path, embedding=embedding, overwrite=overwrite)
    return db

def store_docs(db, texts):
    """Add ``texts`` to ``db`` via ``add_documents`` and hand ``db`` back."""
    vector_store = db
    vector_store.add_documents(texts)
    return vector_store

# Run the ingestion pipeline: split the pages, open the vector store, then
# add the chunks to it.
DATASET_PATH = 'hub://samman/amazon_earnings_6'

texts = split_docs(pages)
db = store_docs(create_database(dataset_path=DATASET_PATH), texts)

Deep Lake Dataset in hub://samman/amazon_earnings_6 already exists, loading from the storage




# Querying the database

In [23]:
def query_database(db, query):
    """Answer ``query`` with a 'stuff' RetrievalQA chain backed by ``db``.

    The retriever comes from ``db.as_retriever()`` and the language model is
    ``ChatGoogleGenerativeAI`` ('gemini-pro', temperature 0). Returns the
    value produced by the chain's ``run`` call.
    """
    doc_retriever = db.as_retriever()
    chat_model = ChatGoogleGenerativeAI(
        model='gemini-pro',
        temperature=0,
        convert_system_message_to_human=True,
    )
    chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type='stuff',
        retriever=doc_retriever,
    )
    return chain.run(query)

# First question: combined 2020 revenue.
# NOTE(review): the recorded answer for this query is "I cannot find the
# answer..."; the `urls` list has no Q3 2020 release - check whether that
# gap contributes.
query = "Combine total revenue in 2020?"
response = query_database(db, query)

Deep Lake Dataset in hub://samman/amazon_earnings_6 already exists, loading from the storage


In [24]:
# Display the answer returned for the query above.
response

'I cannot find the answer to your question in the context provided.'

In [25]:
# Second question: number of Alexa skills from external developers.
# NOTE(review): the query string spells "Al exa" with a space - confirm
# whether that is deliberate or a typo.
query = "how much skills alexa offers in the Al exa Skills Store from external developers"
response = query_database(db, query)
print(response)

I cannot find the answer to your question in the context provided.
