In [2]:
# Install the notebook's dependencies with shell-style pip calls (no versions are pinned).
# NOTE(review): `langchain` appears in both the first and the last active install line,
# and `edgartools` is installed again in the next cell.
!pip install langchain
!pip install langchain_community
# !pip install tiktoken
# !pip install faiss-cpu
# !pip install chromadb
!pip install edgartools
!pip install langchain-text-splitters python-dotenv
!pip install langchain-google-genai
!pip install -q transformers sentence-transformers qdrant-client langchain
# !pip install pandas numpy pathlib

Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.3/337.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# NOTE(review): edgartools is already installed by the first cell; this repeat looks redundant.
!pip install edgartools



In [4]:
import os
import pandas as pd
from edgar import set_identity,Company
import numpy as np
import json
from pathlib import Path

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain.schema import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

In [27]:
# Use the key already present in the environment; prompt (without echo) only when it is
# missing, so a real key is never overwritten by a placeholder or saved in the notebook.
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

In [6]:
# Gemini chat model shared by the LLM chains and the multi-query retriever in this notebook.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

## Data Collection

In [7]:
# Register an identity string with edgartools before any filings are requested
# (presumably sent to SEC EDGAR as the requester identity — confirm against the edgartools docs).
set_identity("K sha kt@codes.finance")

# NOTE(review): despite the name, this maps each ticker to itself rather than to an SEC CIK;
# only the values are iterated by the download loop.
CIK_MAP = {
    "GOOGL": "GOOGL",
    "MSFT": "MSFT",
    "NVDA": "NVDA"
}

# Calendar years matched against each filing's `filing_date` when selecting 10-Ks.
YEARS = [2022, 2023, 2024]
# Directory that receives the saved filing text files; created here if it does not exist.
SAVE_DIR = "sec_filings"
os.makedirs(SAVE_DIR, exist_ok=True)

In [8]:
def download_10k_for_company(ticker):
    """Save the 10-K filed in each year of ``YEARS`` for ``ticker`` as a text file.

    For every year in ``YEARS`` the first row of the filings table whose
    ``filing_date`` falls in that calendar year is selected, the filing for that
    date is fetched, and the text of each of its items is written to
    ``<SAVE_DIR>/<ticker>_<year>_10K.txt``. The file is overwritten on every run,
    so re-running the cell does not duplicate the filing text.

    Args:
        ticker: Ticker symbol passed to ``Company``.

    Returns:
        None. Progress, missing years and errors are reported via ``print``.
    """
    print(f"\nProcessing: {ticker}")
    company = Company(ticker)
    filings = company.get_filings(form="10-K")

    df = filings.to_pandas()
    df['filing_date'] = pd.to_datetime(df['filing_date'])

    for year in YEARS:
        match = df[df['filing_date'].dt.year == year]
        if match.empty:
            print(f"No 10-K filing found for {ticker} in {year}")
            continue

        filing_date = match.iloc[0]['filing_date']
        print(f"Downloading {ticker} 10-K for {year} (filed on {filing_date.date()})...")

        try:
            # Fetching and parsing happen inside the try block so that a failure for one
            # year is reported and the remaining years are still processed.
            filing = filings.filter(date=filing_date.strftime('%Y-%m-%d'))
            filing_obj = filing.latest().obj()

            filename = f"{ticker}_{year}_10K.txt"
            filepath = os.path.join(SAVE_DIR, filename)

            # 'w' rather than 'a': appending would store a second copy of the filing each
            # time the cell is re-run, which later yields duplicate chunks.
            with open(filepath, 'w', encoding='utf-8') as f:
                for item in filing_obj.items:
                    item_text = filing_obj[item]
                    # Skip items with no text instead of failing the whole file on write().
                    if item_text:
                        f.write(item_text)

            print(f"Saved to {filepath}")
        except Exception as e:
            print(f"Error saving {ticker} {year}: {e}")

# Script-style entry point: download the filings for every ticker listed in CIK_MAP.
if __name__ == "__main__":
    for ticker in CIK_MAP.values():
        download_10k_for_company(ticker)


Processing: GOOGL
Downloading GOOGL 10-K for 2022 (filed on 2022-02-02)...
Saved to sec_filings/GOOGL_2022_10K.txt
Downloading GOOGL 10-K for 2023 (filed on 2023-02-03)...
Saved to sec_filings/GOOGL_2023_10K.txt
Downloading GOOGL 10-K for 2024 (filed on 2024-01-31)...
Saved to sec_filings/GOOGL_2024_10K.txt

Processing: MSFT
Downloading MSFT 10-K for 2022 (filed on 2022-07-28)...
Saved to sec_filings/MSFT_2022_10K.txt
Downloading MSFT 10-K for 2023 (filed on 2023-07-27)...
Saved to sec_filings/MSFT_2023_10K.txt
Downloading MSFT 10-K for 2024 (filed on 2024-07-30)...
Saved to sec_filings/MSFT_2024_10K.txt

Processing: NVDA
Downloading NVDA 10-K for 2022 (filed on 2022-03-18)...
Saved to sec_filings/NVDA_2022_10K.txt
Downloading NVDA 10-K for 2023 (filed on 2023-02-24)...
Saved to sec_filings/NVDA_2023_10K.txt
Downloading NVDA 10-K for 2024 (filed on 2024-02-21)...
Saved to sec_filings/NVDA_2024_10K.txt


## Chunking, Embedding, and Storing

In [9]:
from typing import List, Optional

In [10]:
class VectorDatabaseIngestion:
  """Load ``*.txt`` files, split them into chunks, embed them and store them in Qdrant.

  ``qdrant_db`` is ``None`` until ``ingest_documents`` succeeds, and ``embedding`` is
  ``None`` if the embedding model could not be loaded.
  """

  def __init__(self,
               data_directory: str = "sec_filings/",
               qdrant_url: str = ":memory:",
               collection_name: str = "sec_filings_collection",
               embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
               chunk_size: int = 1000,
               chunk_overlap: int = 200):
    """Store the configuration and try to load the embedding model.

    Args:
      data_directory: Directory searched for ``*.txt`` files.
      qdrant_url: Value passed as ``location`` when the Qdrant store is created.
      collection_name: Name of the Qdrant collection to create.
      embedding_model: Model name passed to ``SentenceTransformerEmbeddings``.
      chunk_size: ``chunk_size`` given to the text splitter.
      chunk_overlap: ``chunk_overlap`` given to the text splitter.
    """
    self.data_directory = Path(data_directory)
    self.qdrant_url = qdrant_url
    self.collection_name = collection_name
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap

    # Always define these attributes so later methods can test them instead of raising
    # AttributeError when loading failed or ingestion has not run yet.
    self.qdrant_db = None
    self.embedding = None

    try:
      self.embedding = SentenceTransformerEmbeddings(model_name=embedding_model)
    except Exception as e:
      print(f"Error loading embedding model: {e}")

  def load_documents(self) -> List[Document]:
    """Load every ``*.txt`` file in the data directory.

    Returns:
      The loaded documents, or an empty list if loading failed.
    """
    try:
      loader = DirectoryLoader(str(self.data_directory),
                              glob="*.txt",
                              loader_cls=TextLoader,
                              show_progress=True)
      return loader.load()
    except Exception as e:
      # Report the cause; a bare ``except`` would also swallow KeyboardInterrupt.
      print(f"Error loading documents from {self.data_directory}: {e}")
      return []

  def split_documents(self,documents : List[Document]) -> List[Document]:
    """Split documents on blank lines into chunks and print the chunk count.

    Args:
      documents: Documents to split.

    Returns:
      The resulting chunks.
    """
    text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
    chunks = text_splitter.split_documents(documents)
    print(len(chunks))
    return chunks

  def ingest_documents(self) -> Optional[Qdrant]:
    """Load, split, embed and store the documents.

    Returns:
      The Qdrant vector store (also kept on ``self.qdrant_db``), or ``None`` when
      there is no embedding model, nothing to ingest, or the store creation failed.
    """
    if self.embedding is None:
      print("Embedding model not loaded; cannot ingest documents.")
      return None

    documents = self.load_documents()
    if not documents:
      return None

    chunks = self.split_documents(documents)
    if not chunks:
      return None

    try:
      qdrant = Qdrant.from_documents(
        chunks,
        self.embedding,
        location=self.qdrant_url,
        collection_name=self.collection_name
      )
    except Exception as e:
      print(f"Error ingesting documents: {e}")
      return None

    self.qdrant_db = qdrant
    return qdrant

  def search_similar_chunks(self,query: str,top_k: int = 5) -> List[Document]:
    """Return the ``top_k`` chunks most similar to ``query``.

    Args:
      query: Search text.
      top_k: Number of results requested from the vector store.

    Returns:
      Matching documents, or an empty list if the store is not initialised or the
      search failed.
    """
    if self.qdrant_db is None:
      print("Qdrant database not initialized. Please call ingest_documents first.")
      return []

    try:
      return self.qdrant_db.similarity_search(query, k=top_k)
    except Exception as e:
      print(f"Error searching similar chunks: {e}")
      return []



In [11]:
# Build the in-memory vector store from the files saved under SAVE_DIR.
ingester = VectorDatabaseIngestion(
      data_directory=SAVE_DIR,
      qdrant_url=":memory:",
      collection_name="sec_filings_vector_db",
      embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)
# ingest_documents() returns None when loading, splitting or storing fails, so
# qdrant_db may be None here.
qdrant_db = ingester.ingest_documents()


  self.embedding = SentenceTransformerEmbeddings(model_name=embedding_model)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 9/9 [00:00<00:00, 1190.06it/s]


1201


## Testing Sample Query and Functions

In [165]:
# Plain similarity search for one question; show up to 1000 characters of each hit.
query = "What was NVIDIA operating margin in 2023?"
docs = qdrant_db.similarity_search(query)
print("\nSearch Results:\n")
for doc in docs:
    print(f"{doc.page_content[:1000]}...")
    print("\n" + "~" * 100 + "\n")


Search Results:

Year Ended
                                        January 29,                January 30,                January 31,
                                               2023                       2022                       2021
Revenue:                                                                                    (In millions)
United States                          $8,292                     $4,349                     $3,214      
Taiwan                                  6,986                      8,544                      4,531      
China (including Hong Kong)             5,785                      7,111                      3,886      
Other countries                         5,911                      6,910                      5,044      
Total revenue                         $26,974                    $26,914                    $16,675      
No customer represented 10% or more of total revenue for fiscal years 2023, 2022 and 2021.
NVIDIA CORPORATION AND SUBSIDIAR

In [116]:
# Concatenate the retrieved chunks, separated by blank lines, into one context string.
context = "\n\n".join(doc.page_content for doc in docs)

In [117]:
# Display the assembled context string for inspection.
context

"Year Ended\n                                        January 29,                January 30,                January 31,\n                                               2023                       2022                       2021\nRevenue:                                                                                    (In millions)\nUnited States                          $8,292                     $4,349                     $3,214      \nTaiwan                                  6,986                      8,544                      4,531      \nChina (including Hong Kong)             5,785                      7,111                      3,886      \nOther countries                         5,911                      6,910                      5,044      \nTotal revenue                         $26,974                    $26,914                    $16,675      \nNo customer represented 10% or more of total revenue for fiscal years 2023, 2022 and 2021.\nNVIDIA CORPORATION AND SUBSIDIARIES\nNO

In [113]:
# Company-name -> ticker mapping and filing years; passed to the prompts as the
# `companies` and `years` inputs.
Companies = {"GOOGLE":"GOOGL","MICROSOFT":"MSFT","NVIDIA":"NVDA"}
Years = [2022,2023,2024]

In [166]:
# Answer-synthesis prompt. The keyword is `input_variables` (plural); the misspelled
# `input_variable` was not a PromptTemplate field, so the declared list was never applied.
prompt = PromptTemplate(
    input_variables = ["context","query","companies","years"],
    template = """ You are a helpful assistant. Use ONLY the following pieces of context provided to answer the question at the end.
    The context might have some data in tabular format so parse and understand it accordingly and answer the question.
    For complex question Like comparsion between companies for revenue/total revenue/margin/operating marging/gross margin/profit earned try to using decomposed the questions provided and then answer based on the context.
    For Simple question You can directly answer the question based on the context.
    If spending/operating margin/gross margin/profit/operating profit/total revenue are not directly given you can calculate them based on the context.
    If the Question ask for revenue growth/growth also provide the percentage growth by calculating it.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Data which we have :

    {context}

    {companies}
    {years}

    Question: {query}
    """
)

In [118]:
# Fill the synthesis prompt with the retrieved context and the question, run it through
# the LLM, and print the answer.
LMC = LLMChain(llm=llm,prompt=prompt)
final_a = LMC.run({"context":context,"query":query,"companies":Companies,"years":Years})
print(final_a)

To determine NVIDIA's operating margin in 2023, I will use the provided data.

Operating Income in 2023: $4,224 million
Revenue in 2023: $26,974 million

Operating Margin = (Operating Income / Revenue) * 100
Operating Margin = ($4,224 / $26,974) * 100 = 15.66%

Answer: NVIDIA's operating margin in 2023 was 15.66%.


### Checking Multi Query Retriever

In [40]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [41]:
# Wrap the vector-store retriever (k=5 results per search) in a MultiQueryRetriever that
# uses the same LLM (presumably to generate alternative phrasings of each query —
# confirm against the LangChain docs).
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=qdrant_db.as_retriever(search_kwargs={"k": 5}),
    llm=llm
)

In [42]:
# Retrieve documents for the current `query` through the multi-query retriever.
multiquery_results= multiquery_retriever.invoke(query)

In [43]:
# Display the retrieved documents (metadata and page content) for inspection.
multiquery_results

[Document(metadata={'source': 'sec_filings/MSFT_2022_10K.txt', '_id': '482582719fcf4575b6afd087c0078ec4', '_collection_name': 'sec_filings_vector_db'}, page_content='PART I\nItem 1\nOPERATING SEGMENTS\n We operate our business and report our financial performance using three segments: Productivity and Business Processes, Intelligent Cloud, and More Personal Computing. Our segments provide management with a comprehensive financial view of our key businesses. The segments enable the alignment of strategies and objectives across the development, sales, marketing, and services organizations, and they provide a framework for timely and rational allocation of resources within businesses.\n Additional information on our operating segments and geographic and product information is contained in Note 19 – Segment Information and Geographic Data of the Notes to Financial Statements (Part II, Item 8 of this Form 10-K).\n Our reportable segments are described below.\n Productivity and Business Proc

In [44]:
# Rebuild the context string from the multi-query retrieval results.
context = "\n\n".join(doc.page_content for doc in multiquery_results)

In [45]:
# The synthesis prompt also contains {companies} and {years}; every template variable
# must be supplied or the chain fails with missing input keys.
LMC = LLMChain(llm=llm,prompt=prompt)
final_a = LMC.run({"context":context,"query":query,"companies":Companies,"years":Years})
print(final_a)

I am sorry, but the context provided does not contain sufficient information to compare the operating margins of different companies in 2023. The text discusses Microsoft's operating income, gross margin, and revenue, but it does not provide the operating margin directly or the necessary data to calculate it for comparison with other companies. There is also information about Alphabet Inc. but not in relation to operating margin.


### Checking by Decomposing Query

In [50]:
# NOTE(review): same values as the earlier Companies/Years definitions in this notebook;
# repeated here so the section can be run on its own.
Companies = {"GOOGLE":"GOOGL","MICROSOFT":"MSFT","NVIDIA":"NVDA"}
Years = [2022,2023,2024]

In [156]:
# Cross-company question used to exercise query decomposition.
query = "Compare cloud revenue growth rates across all three companies from 2022 to 2023"

In [66]:

# Prompt that asks the LLM to rewrite a question as newline-separated sub-queries, with
# examples for simple, comparative and cross-company questions.
# NOTE(review): the first template sentence refers to "context provided", but this
# template has no {context} placeholder — only {companies}, {years} and {query}.
decompose_prompt = PromptTemplate(
    input_variables=["companies","years","query"],
    template = """You are a Helpfull assistant. Use ONLY the following pieces of context provided to answer the question at the end.
    I want to decompose this Question/query into multiple simpler and logically ordered sub-queries
    where each Question/query is decomposed on the basis of the question type, company, and year provide to us like this :
    1)Simple Direct Query : "What was Microsoft’s total revenue in 2023?" - For this type of query we don't need to decompose the question as it is already in the best format,So return the question as it is.
    2)Comparative Query : “How did NVIDIA’s data center revenue grow from 2022 to 2023?”
    - it should be broken into Find NVIDIA data center revenue 2022,Find NVIDIA data center revenue 2023,Calculate growth
    3)Cross-Company Analysis : “Which company had the highest operating margin in 2023?”
    - Retrieve MSFT operating margin 2023 , Retrieve GOOGL operating margin 2023,Retrieve NVDA operating margin 2023,Compare and determine highest

    Provide ONLY sub-queries in the above format and place each sub-query into the new line and not in a single line. Not any other text
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {companies}
    {years}
    Question :{query}

    """
)


In [157]:
# Run the decomposition prompt on the current question and print the raw LLM output
# (expected to be one sub-query per line).
DLMC = LLMChain(llm=llm,prompt=decompose_prompt)
de_a = DLMC.run({"companies":Companies,"years":Years,"query":query})
print(de_a)

Retrieve MSFT cloud revenue 2022
Retrieve MSFT cloud revenue 2023
Calculate MSFT cloud revenue growth from 2022 to 2023
Retrieve GOOGL cloud revenue 2022
Retrieve GOOGL cloud revenue 2023
Calculate GOOGL cloud revenue growth from 2022 to 2023
Retrieve NVDA cloud revenue 2022
Retrieve NVDA cloud revenue 2023
Calculate NVDA cloud revenue growth from 2022 to 2023
Compare cloud revenue growth rates across MSFT, GOOGL, and NVDA


In [158]:
# One entry per non-blank line: blank lines in the LLM output would otherwise become
# empty sub-queries that are sent to the retriever.
sub_queries = [line.strip() for line in de_a.splitlines() if line.strip()]
sub_queries

['Retrieve MSFT cloud revenue 2022',
 'Retrieve MSFT cloud revenue 2023',
 'Calculate MSFT cloud revenue growth from 2022 to 2023',
 'Retrieve GOOGL cloud revenue 2022',
 'Retrieve GOOGL cloud revenue 2023',
 'Calculate GOOGL cloud revenue growth from 2022 to 2023',
 'Retrieve NVDA cloud revenue 2022',
 'Retrieve NVDA cloud revenue 2023',
 'Calculate NVDA cloud revenue growth from 2022 to 2023',
 'Compare cloud revenue growth rates across MSFT, GOOGL, and NVDA']

In [159]:
# Multi step retrieval: run the multi-query retriever once per sub-query and collect
# every returned document in order.
# NOTE(review): documents returned for more than one sub-query are kept as duplicates.
all_docs = []
for sub_query in sub_queries:
  sub_docs = multiquery_retriever.invoke(sub_query)
  all_docs.extend(sub_docs)

In [146]:
# Display every document collected across the sub-queries.
all_docs

[Document(metadata={'source': 'sec_filings/MSFT_2023_10K.txt', '_id': 'a50eb4cef82d4ef3b4c1925b210d0b43', '_collection_name': 'sec_filings_vector_db'}, page_content='(In millions)                                                                                       \nYear Ended June 30,                  2023                2022                2021             \nServer products and cloud services             $79,970             $67,350             $52,589\nOffice products and cloud services              48,728              44,862              39,872\nWindows                                         21,507              24,732              22,488\nGaming                                          15,466              16,230              15,370\nLinkedIn                                        15,145              13,816              10,289\nSearch and news advertising                     12,208              11,591               9,267\nEnterprise Services                              7,722      

In [161]:
# Merge the chunks gathered for all sub-queries into a single context string.
context = "\n\n".join(doc.page_content for doc in all_docs)

In [148]:
# Display the combined context string for inspection.
context

"(In millions)                                                                                       \nYear Ended June 30,                  2023                2022                2021             \nServer products and cloud services             $79,970             $67,350             $52,589\nOffice products and cloud services              48,728              44,862              39,872\nWindows                                         21,507              24,732              22,488\nGaming                                          15,466              16,230              15,370\nLinkedIn                                        15,145              13,816              10,289\nSearch and news advertising                     12,208              11,591               9,267\nEnterprise Services                              7,722               7,407               6,943\nDevices                                          5,521               7,306               7,143\nDynamics                         

In [131]:
# Re-assign the original (undecomposed) question for the final answer step.
query = "Compare cloud revenue growth rates across all three companies from 2022 to 2023"

In [164]:
# Answer the original question using the context gathered from all sub-queries.
LMC = LLMChain(llm=llm,prompt=prompt)
final_a = LMC.run({"context":context,"query":query,"companies":Companies,"years":Years})
print(final_a)

Okay, let's break this down to compare cloud revenue growth rates.

**1. Microsoft Cloud Revenue Growth:**

*   Microsoft Cloud revenue in 2022: Not directly available, but Microsoft Cloud revenue increased 22% to $111.6 billion in fiscal year 2023.
*   Microsoft Cloud revenue in 2023: Microsoft Cloud revenue increased 23% to $137.4 billion in fiscal year 2024.
*   So, we can calculate the cloud revenue for 2022 using the 2023 growth number: 111.6/1.22 = $91.47 billion
*   Microsoft Cloud revenue growth rate from 2022 to 2023 = (111.6-91.47)/91.47 = 21.9%

**2. Google Cloud Revenue Growth:**

*   Google Cloud revenue in 2022: $26,280 million
*   Google Cloud revenue in 2023: $33,088 million
*   Google Cloud revenue growth rate from 2022 to 2023 = (33088-26280)/26280 = 25.9%

**Comparison:**

*   Microsoft Cloud revenue growth rate (2022 to 2023): 21.9%
*   Google Cloud revenue growth rate (2022 to 2023): 25.9%

**Answer:** Google Cloud revenue grew faster (25.9%) than Microsoft Cloud r

## Agent

In [15]:
# Globals read by the Agent methods below when filling the `companies` and `years`
# prompt inputs.
Companies = {"GOOGLE":"GOOGL","MICROSOFT":"MSFT","NVIDIA":"NVDA"}
Years = [2022,2023,2024]

In [19]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [20]:
class Agent:
  """Query-decomposition RAG agent over a vector store.

  ``pipeline`` chains three steps: decompose the question into sub-queries with the
  LLM, retrieve chunks for every sub-query, and synthesise an answer from the combined
  context. The notebook-level globals ``llm``, ``Companies`` and ``Years`` supply the
  model and the ``companies``/``years`` prompt inputs.
  """

  def __init__(self,qdrant_db):
    """Keep a reference to the vector store used for retrieval.

    Args:
      qdrant_db: Vector store object exposing ``as_retriever``.
    """
    self.qdrant_db = qdrant_db

  @staticmethod
  def _parse_sub_queries(raw_output, fallback_query):
    """Split LLM output into non-blank, stripped lines; fall back to the original query."""
    sub_queries = [line.strip() for line in (raw_output or "").splitlines() if line.strip()]
    # ``str.split`` never returns an empty list, so the fallback has to be decided
    # after blank lines have been removed.
    return sub_queries or [fallback_query]

  @staticmethod
  def _join_unique_contents(docs):
    """Join the ``page_content`` of ``docs`` with blank lines, dropping exact repeats."""
    seen = set()
    unique_texts = []
    for doc in docs:
      text = doc.page_content
      if text not in seen:
        seen.add(text)
        unique_texts.append(text)
    return "\n\n".join(unique_texts)

  def decompose_query(self,query):
    """Ask the LLM to break ``query`` into ordered sub-queries.

    Args:
      query: The user's question.

    Returns:
      A list of sub-query strings; ``[query]`` if the LLM returned no usable line.
      The list is also printed.
    """
    decompose_prompt = PromptTemplate(
      input_variables=["companies","years","query"],
      template = """You are a Helpfull assistant. Use ONLY the following pieces of context provided to answer the question at the end.
    I want to decompose this Question/query into multiple simpler and logically ordered sub-queries
    where each Question/query is decomposed on the basis of the question type, company, and year provide to us like this :
    1)Simple Direct Query : "What was Microsoft’s total revenue in 2023?" - For this type of query we don't need to decompose the question as it is already in the best format,So return the question as it is.
    2)Comparative Query : “How did NVIDIA’s data center revenue grow from 2022 to 2023?”
    - it should be broken into Find NVIDIA data center revenue 2022,Find NVIDIA data center revenue 2023,Calculate growth
    3)Cross-Company Analysis : “Which company had the highest operating margin in 2023?”
    - Retrieve MSFT operating margin 2023 , Retrieve GOOGL operating margin 2023,Retrieve NVDA operating margin 2023,Compare and determine highest

    Provide ONLY sub-queries in the above format and place each sub-query into the new line and not in a single line. Not any other text
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {companies}
    {years}
    Question :{query}

    """)

    DLMC = LLMChain(llm=llm,prompt=decompose_prompt)
    de_a = DLMC.run({"companies":Companies,"years":Years,"query":query})

    sub_queries = self._parse_sub_queries(de_a, query)
    print(sub_queries)
    return sub_queries

  def multistep_retrieval(self,sub_queries):
    """Retrieve chunks for every sub-query and merge them into one context string.

    Args:
      sub_queries: Sub-query strings, searched in order.

    Returns:
      The retrieved chunk texts joined by blank lines. A chunk returned for several
      sub-queries appears once, at its first position.
    """
    # Use the store handed to this instance, not whatever global ``qdrant_db`` happens
    # to exist in the notebook.
    multiquery_retriever = MultiQueryRetriever.from_llm(
      retriever=self.qdrant_db.as_retriever(search_kwargs={"k": 5}),
      llm=llm)

    all_docs = []
    for sub_query in sub_queries:
      all_docs.extend(multiquery_retriever.invoke(sub_query))

    return self._join_unique_contents(all_docs)

  def synth_result(self,context,query):
    """Answer ``query`` with the LLM using only ``context``.

    Args:
      context: Text of the retrieved chunks.
      query: The user's original question.

    Returns:
      The LLM's answer as returned by the chain.
    """
    prompt = PromptTemplate(
      # ``input_variables`` (plural) is the PromptTemplate field name.
      input_variables = ["context","query","companies","years"],
      template = """ You are a helpful assistant. Use ONLY the following pieces of context provided to answer the question at the end.
    The context might have some data in tabular format so parse and understand it accordingly and answer the question.
    For complex question Like comparsion between companies for revenue/total revenue/margin/operating marging/gross margin/profit earned try to using decomposed the questions provided and then answer based on the context.
    For Simple question You can directly answer the question based on the context.
    If spending/operating margin/gross margin/profit/operating profit/total revenue are not directly given you can calculate them based on the context.
    If the Question ask for revenue growth/growth also provide the percentage growth by calculating it.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Data which we have :

    {context}

    {companies}
    {years}

    Question: {query}
    """)

    LMC = LLMChain(llm=llm,prompt=prompt)
    final_a = LMC.run({"context":context,"query":query,"companies":Companies,"years":Years})
    return final_a

  def pipeline(self,query):
    """Run decomposition, retrieval and synthesis for ``query`` and return the answer."""
    sub_queries = self.decompose_query(query)
    context = self.multistep_retrieval(sub_queries)
    result = self.synth_result(context,query)
    return result


In [21]:
# Agent bound to the vector store built during ingestion.
agent = Agent(qdrant_db=qdrant_db)

## Testing On Sample Queries

In [22]:
# Comparative question (one company, two years) run through the full agent pipeline.
query = "How did NVIDIA’s data center revenue grow from 2022 to 2023?"
result = agent.pipeline(query)

print(str(result))

['Find NVIDIA data center revenue 2022', 'Find NVIDIA data center revenue 2023', 'Calculate growth']
NVIDIA's data center revenue increased from $10,613 million in 2022 to $15,005 million in 2023, a growth of 41%.


In [23]:
# Simple direct question: a single figure for one company and year.
query = "What was NVIDIA's total revenue in fiscal year 2024?"
result = agent.pipeline(query)

print(str(result))

['What was NVIDIA’s total revenue in 2024?']
NVIDIA's total revenue in fiscal year 2024 was $60,922 million.


In [24]:
# Question that needs two retrieved figures and a percentage calculation.
query = "What percentage of Google's 2023 revenue came from advertising?"
result = agent.pipeline(query)

print(str(result))

['Find Google total revenue 2023', 'Find Google advertising revenue 2023', 'Calculate percentage']
Google's 2023 advertising revenue was $237,855 million, and Google's total revenue for the same year was $307,394 million.
To find the percentage of Google's 2023 revenue that came from advertising, we can use the following formula:
Percentage = (Advertising Revenue / Total Revenue) * 100
Percentage = (237,855 / 307,394) * 100 = 77.38%
So, approximately 77.38% of Google's 2023 revenue came from advertising.


In [25]:
# Year-over-year growth question for a single company.
query = "How much did Microsoft's cloud revenue grow from 2022 to 2023?"
result = agent.pipeline(query)

print(str(result))

['Find Microsoft cloud revenue 2022', 'Find Microsoft cloud revenue 2023', 'Calculate growth']
Microsoft's cloud revenue grew from $91.4 billion in fiscal year 2022 to $111.6 billion in fiscal year 2023. The growth is $20.2 billion which is 22.1% growth.


In [28]:
# Cross-company comparison of gross margin for one year.
query = "Which of the three companies had the highest gross margin in 2023?"
result = agent.pipeline(query)

print(str(result))

['Retrieve MSFT gross margin 2023', 'Retrieve GOOGL gross margin 2023', 'Retrieve NVDA gross margin 2023', 'Compare and determine highest']
I will calculate the gross margin for each of the three companies for the year 2023 and then compare them to identify the company with the highest gross margin.

*   **Microsoft:** Gross margin for 2023 is $171,008 million.
*   **Alphabet (Google):**
    To determine Google's gross margin, we need to calculate it using the provided data.
    Gross Margin = Total Revenue - Cost of Revenue
    For 2023:
    Total Revenue = $307,394 million
    Cost of Revenue = $133,332 million
    Gross Margin = $307,394 - $133,332 = $174,062 million
*   **NVIDIA:**
    To determine NVIDIA's gross margin, we need to calculate it using the provided data.
    Gross Margin = Revenue \* Gross Margin Percentage
    For 2023:
    Total Revenue = $26,974 million
    Gross Margin Percentage = 56.9%
    Gross Margin = $26,974 \* 0.569 = $15,349.21 million

Comparing the gros

In [29]:
# Cross-company comparison of operating margin for one year.
query = "Which company had the highest operating margin in 2023?"
result = agent.pipeline(query)

print(str(result))

['Retrieve MSFT operating margin 2023', 'Retrieve GOOGL operating margin 2023', 'Retrieve NVDA operating margin 2023', 'Compare and determine highest']
First. let's find the operating margin for each company in 2023.

For Microsoft (MSFT):
Revenue in 2023: $211,915 million
Operating income in 2023: $88,523 million
Operating margin in 2023: ($88,523 / $211,915) * 100 = 41.77%

For NVIDIA (NVDA):
Revenue in 2023: $26,974 million
Operating income in 2023: $4,224 million
Operating margin in 2023: ($4,224 / $26,974) * 100 = 15.66%

For Alphabet (GOOGLE):
Total revenues in 2023: $307,394 million
Total income from operations in 2023: $84,293 million
Operating margin in 2023: ($84,293 / $307,394) * 100 = 27.42%

Therefore, Microsoft had the highest operating margin in 2023.


In [30]:
# Cross-company comparison of a ratio (R&D spending as a share of revenue).
query = "Compare the R&D spending as a percentage of revenue across all three companies in 2023"
result = agent.pipeline(query)

print(str(result))

['Retrieve MSFT R&D spending as a percentage of revenue 2023', 'Retrieve GOOGL R&D spending as a percentage of revenue 2023', 'Retrieve NVDA R&D spending as a percentage of revenue 2023', 'Compare and rank R&D spending as a percentage of revenue across MSFT, GOOGL, and NVDA']
Okay, I will compare the R&D spending as a percentage of revenue for Google, Microsoft, and NVIDIA in 2023.

*   **Google:** In 2023, Research and development expenses as a percentage of revenues 14%.
*   **Microsoft:** In 2023, Research and development expenses as a percentage of revenues 13%.
*   **NVIDIA:** R&D spending as a percentage of revenue in 2023 was 27.2%.


In [184]:
# Qualitative question (no figures): compare the AI risks described in the filings.
query = "What are the main AI risks mentioned by each company and how do they differ?"
result = agent.pipeline(query)

print(str(result))

Okay, I will analyze the provided texts and extract the main AI risks mentioned by Alphabet and Microsoft, and how they differ.

**Alphabet Inc.:**

Alphabet is very concerned about AI risks and calls out the following:
*   Harmful content
*   Inaccuracies
*   Discrimination
*   Intellectual property infringement or misappropriation
*   Defamation
*   Data privacy
*   Cybersecurity
*   Ethical issues
*   Broad effects on society
*   Unintended consequences, uses, or customization of AI tools and systems
*   Negatively affecting human rights, privacy, employment, or other social concerns

**Microsoft:**

Microsoft focuses on:
*   AI systems being used in ways that are unintended or inappropriate.
*   Fraudulent or abusive activities through cloud-based services.
*   Unauthorized account access
*   Payment fraud
*   Terms of service violations including cryptocurrency mining or launching cyberattacks

**Differences:**

*   **Breadth of Concerns:** Alphabet's risk list is more extensive a

In [None]:
# Formatting and metadata can be added based on the requirement