In [2]:
import os
import requests
from pydantic import BaseModel, ValidationError
from typing import Annotated, Literal, Optional
from datetime import datetime
from autogen import ConversableAgent, register_function
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

load_dotenv()

True

In [3]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.chains import LLMChain
import numpy as np
import pandas as pd
from io import StringIO
import spacy
from bs4 import BeautifulSoup
import time
import json
import re
from typing import List, Dict, Annotated

In [4]:
FMP_API_KEY = os.getenv("FMP_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.getenv('LANGCHAIN_TRACING_V2')
os.getenv('LANGCHAIN_ENDPOINT')
os.getenv('LANGCHAIN_API_KEY')

'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'

In [5]:
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = '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'

In [6]:
def sec_insights_documents_selector(input: Annotated[str, "Qualitative data input prompt"]) -> List[Dict[str, str]]:

    llm = ChatOpenAI(
        model="gpt-4",
        temperature=0.3
    )
    
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", '''
                Your task is to identify and fill the following entities from the user's prompt:

                1. **ticker**: The abbreviation of the company's name.
                2. **document_type**: 'Form 10K' (for annual financial details), 'Form 10Q' (for quarterly financial details), etc.
                3. **year**: 
                   - If 'Form 10K' is chosen, the year should be in the format 'YYYY' (e.g., 2021, 2019).
                   - If 'Form 10Q' is chosen, the year should be in the format 'YYYY QX' (e.g., 2023 Q3, 2022 Q2).

                User prompts may be explicit or implicit and may not specify all required entities. Use context to infer missing details.

                If a user prompt is complex, you may break it into simpler, atomic prompts and provide separate outputs for each.
             
                Make a note that there is no fourth quarter Q4, companies file for Form 10K instead of filing Form 10Q. So you can only choose Q1, Q2 or Q3.

                ### Guidelines for Handling Qualitative Financial Questions:

                1. **Map to Relevant Documents**: Use the most relevant financial documents (annual or quarterly reports) that are likely to contain information on market trends, growth prospects, or other qualitative aspects.

                2. **Use Representative Entities**: When the user question is about a sector or general trends, use representative entities such as sector ETFs (e.g., XLK for technology) to provide the context.

                3. **Infer Contextual Details**: Use the context provided in the user's question to infer missing details and make reasonable assumptions about the tickers, document types and years.

                4. **Use top companies tickers**: If you are stuck which company's ticker to choose then use one or more top performing company's ticker in the respective field.
             
                5. **Use recent years**: If you are stuck with which year to choose then choose the latest year or years or latest quarter based on the context of the user prompt.

                ### Examples:
                1. **Prompt**: 'Please provide the report on recent developments at Apple.'
                   **Output**: 
                     ticker: AAPL
                     document_type: Form 10K
                     year: 2023
                     
                     ticker: AAPL
                     document_type: Form 10Q
                     year: 2023 Q3

                2. **Prompt**: 'Please provide the report on third quarter for Apple for the years 2021 and 2022.'
                   **Output**: 
                     ticker: AAPL
                     document_type: Form 10Q
                     year: 2021 Q3
                     
                     ticker: AAPL
                     document_type: Form 10Q
                     year: 2022 Q3

                3. **Prompt**: 'What are the prevailing market trends in the technology sector?'
                   **Output**: 
                     ticker: XLK
                     document_type: Form 10K
                     year: 2023
                     
                     ticker: XLK
                     document_type: Form 10Q
                     year: 2023 Q3

                4. **Prompt**: 'What are the growth prospects of Tesla in the next five years?'
                   **Output**: 
                     ticker: TSLA
                     document_type: Form 10K
                     year: 2023
                     
                     ticker: TSLA
                     document_type: Form 10Q
                     year: 2023 Q3

                Return 'TERMINATE' when the task is completed.
            '''),
            ("user", "{input}")
        ]
    )
    
    chain = prompt | llm
    
    output = chain.invoke({"input": input})

    try:
        documents = output.content.split('\n')
        result = []
        current_doc = {}

        for line in documents:
            if 'ticker:' in line:
                if current_doc:
                    result.append(current_doc)
                    current_doc = {}
                current_doc['ticker'] = line.split('ticker:')[1].strip()
            elif 'document_type:' in line:
                current_doc['document_type'] = line.split('document_type:')[1].strip()
            elif 'year:' in line:
                current_doc['year'] = line.split('year:')[1].strip()
        
        if current_doc:  # Add the last document
            result.append(current_doc)

        return result

    except Exception as e:
        raise ValueError(f"Failed to parse output: {output.content}. Error: {e}")

# MultiQuery and RagFusion

In [17]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)




generate_queries = (
prompt_perspectives 
| ChatOpenAI(temperature=0) 
| StrOutputParser() 
| (lambda x: x.split("\n"))
)

# def generate_queries(prompt):
#     # Step 1: Generate responses using ChatOpenAI with the given prompt
#     response = ChatOpenAI(temperature=0)(prompt)
    
#     # Step 2: Parse the output to a string
#     parsed_output = StrOutputParser()(response)
    
#     # Step 3: Split the output by newline characters
#     queries = parsed_output.split("\n")
    
#     return queries




In [None]:
# # Define the pipeline using LangChain expression language
# pipeline = generate_queries | (lambda queries: [sec_insights_documents_selector(query) for query in queries])

# # Function to run the entire pipeline
# def run_pipeline(user_input: str) -> List[Dict[str, str]]:
#     result_documents = pipeline.invoke({"question": user_input})
#     # Flatten the list of lists into a single list
#     return [doc for sublist in result_documents for doc in sublist]

# # Example usage
# user_input = "What are the biggest discussed risks for Amazon.com Inc. (AMZN)?"
# result_documents = run_pipeline(user_input)

# # Output the results
# for doc in result_documents:
#     print(doc)

In [20]:
# Define the pipeline using LangChain expression language
pipeline = generate_queries | (lambda queries: [{"query": query, "documents": sec_insights_documents_selector(query)} for query in queries])

# Function to run the entire pipeline
def run_pipeline(user_input: str) -> List[Dict[str, List[Dict[str, str]]]]:
    result_documents = pipeline.invoke({"question": user_input})
    return result_documents

# Example usage
user_input = "What are the biggest discussed risks for Amazon.com Inc. (AMZN)?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. What are the most prominent risks associated with Amazon.com Inc. (AMZN) according to discussions?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
---
Query: 2. Can you list the major risks that have been widely debated in relation to Amazon.com Inc. (AMZN)?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
---
Query: 3. What are the primary concerns and risks that have garnered significant attention regarding Amazon.com Inc. (AMZN)?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
---
Query: 4. Which risks have been extensively discussed in the context of Amazon.com Inc. (AMZN) and are considered significant?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 

In [27]:
user_input = "Provide detailed information on the corporate governance of Amazon, including the roles and responsibilities of key executive officers and the board of directors\
    for Amazon.com Inc. (AMZN)?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. What are the specific details regarding the corporate governance structure of Amazon, focusing on the key executive officers and board of directors?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 2. Can you elaborate on the roles and responsibilities of the key executive officers and board of directors within Amazon's corporate governance framework?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 3. How is the corporate governance of Amazon structured, and what are the specific duties assigned to the key executive officers and board members?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZ

In [28]:
user_input = "Generate a detailed analysis of Amazon's revenue streams over the past three fiscal years. Break down the revenue by segments such as North America, International, and AWS. Also, provide a year-over-year growth comparison for each segment."
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. Can you provide a comprehensive breakdown of Amazon's revenue streams for the last three fiscal years, including segments like North America, International, and AWS? Additionally, could you compare the year-over-year growth for each segment?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2020'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2021'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2022'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 2. How has Amazon's revenue evolved across different segments like North America, International, and AWS in the past three fiscal years? Can you also analyze the year-over-year growth rates for each segment?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2021'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2020'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year':

In [29]:
user_input = "Summarize the operating expenses for Amazon over the last two quarters, highlighting key components such as cost of sales, fulfillment, technology and content, marketing, general and administrative, and other operating expenses. Include any significant changes or trends."
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. Can you provide a summary of Amazon's operating expenses in the past two quarters, focusing on key categories like cost of sales, fulfillment, technology and content, marketing, general and administrative, and other operating expenses? Please highlight any notable changes or trends.
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q2'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 2. What are the operating expenses of Amazon for the last two quarters, specifically looking at cost of sales, fulfillment, technology and content, marketing, general and administrative, and other operating expenses? Are there any significant changes or trends worth mentioning?
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q2'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
-------------------------------------------------------------

In [30]:
user_input = "Provide an overview of Amazon's cash flow statements for the recent fiscal year and the most recent quarter. Focus on cash flows from operating activities, investing activities, and financing activities, and explain any major shifts or patterns observed."
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. Can you summarize Amazon's cash flow statements for the latest fiscal year and quarter, specifically looking at operating, investing, and financing activities?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 2. What are the key highlights of Amazon's cash flow statements in the recent fiscal year and quarter, particularly in terms of operating, investing, and financing activities?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 3. How have Amazon's cash flows from operating, investing, and financing activities evolved in the most recent fiscal year and quarter?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '202

In [31]:
user_input = "Identify and explain the top five risk factors mentioned in Amazon's most recent Form 10-K. Discuss how these risks could potentially impact the company's financial performance and operations."
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. What are the key risk factors highlighted in Amazon's latest Form 10-K, and how might they influence the company's financial performance and operations?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
------------------------------------------------------------------------------------------
Query: 2. Can you list and elaborate on the top five risk factors outlined in Amazon's most recent Form 10-K, and analyze their potential effects on the company's financial performance and operations?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
------------------------------------------------------------------------------------------
Query: 3. What are the significant risk factors identified in Amazon's most recent Form 10-K, and how could they affect the company's financial performance and operations?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
------------------------------------------------------------------------------------------

In [32]:
user_input = "What are the strengths and weakness of NVDA and AMD over the past two years?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. Can you provide an analysis of the strengths and weaknesses of NVDA and AMD from the last two years?
{'ticker': 'NVDA', 'document_type': 'Form 10K', 'year': '2022'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2022 Q3'}
{'ticker': 'AMD', 'document_type': 'Form 10K', 'year': '2022'}
{'ticker': 'AMD', 'document_type': 'Form 10Q', 'year': '2022 Q3'}
{'ticker': 'NVDA', 'document_type': 'Form 10K', 'year': '2021'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2021 Q3'}
{'ticker': 'AMD', 'document_type': 'Form 10K', 'year': '2021'}
{'ticker': 'AMD', 'document_type': 'Form 10Q', 'year': '2021 Q3'}
------------------------------------------------------------------------------------------
Query: 2. How have NVDA and AMD performed in terms of strengths and weaknesses in the past two years?
{'ticker': 'NVDA', 'document_type': 'Form 10K', 'year': '2021'}
{'ticker': 'NVDA', 'document_type': 'Form 10K', 'year': '2022'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 

In [33]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: Related
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

generate_queries = (
prompt_rag_fusion 
| ChatOpenAI(temperature=0) 
| StrOutputParser() 
| (lambda x: x.split("\n"))
)

In [34]:
# Define the pipeline using LangChain expression language
pipeline = generate_queries | (lambda queries: [{"query": query, "documents": sec_insights_documents_selector(query)} for query in queries])

# Function to run the entire pipeline
def run_pipeline(user_input: str) -> List[Dict[str, List[Dict[str, str]]]]:
    result_documents = pipeline.invoke({"question": user_input})
    return result_documents

# Example usage
user_input = "What are the biggest discussed risks for Amazon.com Inc. (AMZN)?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. What are the biggest risks to Amazon's e-commerce dominance?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 2. How does Amazon's reliance on third-party sellers pose a risk to its business?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 3. What are the potential risks of Amazon's expansion into new markets, such as healthcare and grocery?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 4. How does regulatory scrutiny and antitrust concerns p

# Decomposition

In [35]:
from langchain.prompts import ChatPromptTemplate

# Decomposition
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [36]:
generate_queries_decomposition = (
    prompt_decomposition 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

# Define the pipeline using LangChain expression language
pipeline = generate_queries | (lambda queries: [{"query": query, "documents": sec_insights_documents_selector(query)} for query in queries])

# Function to run the entire pipeline
def run_pipeline(user_input: str) -> List[Dict[str, List[Dict[str, str]]]]:
    result_documents = pipeline.invoke({"question": user_input})
    return result_documents

# Example usage
user_input = "What are the biggest discussed risks for Amazon.com Inc. (AMZN)?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. What are the biggest cybersecurity risks for Amazon.com Inc. (AMZN)?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 2. What are the biggest regulatory risks for Amazon.com Inc. (AMZN)?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 3. What are the biggest competition risks for Amazon.com Inc. (AMZN)?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 4. What are the biggest supply chain risks for Amazon.com Inc. (AMZN)?
{'ticker': 'AMZN', 'docu

In [37]:
user_input = "Provide detailed information on the corporate governance of Amazon, including the roles and responsibilities of key executive officers and the board of directors\
    for Amazon.com Inc. (AMZN)?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. "Amazon corporate governance structure"
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 2. "Amazon executive officers roles and responsibilities"
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------
Query: 3. "Amazon board of directors members"
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
------------------------------------------------------------------------------------------
Query: 4. "Amazon corporate governance policies and practices"
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
----------------------------------------------

In [38]:
user_input = "Generate a detailed analysis of Amazon's revenue streams over the past three fiscal years. Break down the revenue by segments such as North America, International, and AWS. Also, provide a year-over-year growth comparison for each segment."
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. Amazon revenue breakdown by segment (North America, International, AWS) for fiscal year 2020
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2020'}
------------------------------------------------------------------------------------------
Query: 2. Year-over-year growth comparison of Amazon's revenue segments (North America, International, AWS) for fiscal years 2018, 2019, and 2020
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2018'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2019'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2020'}
------------------------------------------------------------------------------------------
Query: 3. Detailed analysis of Amazon's revenue streams in North America for the past three fiscal years
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2021'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2020'}
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2019'}
----------------

In [39]:
user_input = "What are the strengths and weakness of NVDA and AMD over the past two years?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: 1. NVDA stock performance analysis over the past two years
{'ticker': 'NVDA', 'document_type': 'Form 10K', 'year': '2022'}
{'ticker': 'NVDA', 'document_type': 'Form 10K', 'year': '2021'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2022 Q3'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2022 Q2'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2022 Q1'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2021 Q3'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2021 Q2'}
{'ticker': 'NVDA', 'document_type': 'Form 10Q', 'year': '2021 Q1'}
------------------------------------------------------------------------------------------
Query: 2. AMD financial performance comparison with NVDA in the last two years
{'ticker': 'AMD', 'document_type': 'Form 10K', 'year': '2022'}
{'ticker': 'AMD', 'document_type': 'Form 10Q', 'year': '2022 Q3'}
{'ticker': 'NVDA', 'document_type': 'Form 10K', 'year': '2022'}
{'ticker': 'NVDA', 'document_type': 'Form

# StepBack 

In [18]:
prompt_step_back = ChatPromptTemplate.from_messages(
    [
        ("system", 
         """You are an expert in world knowledge. Your task is to paraphrase a specific question into a more generic and broader question, making it easier to answer. Here are a few examples:"""
        ),
        # Few-shot examples
        ("user", 
            "Could the members of The Police perform lawful arrests?"
        ),
        ("assistant", 
            "What can the members of The Police do?"
        ),
        ("user", 
            "Jan Sindel was born in what country?"
        ),
        ("assistant", 
            "What is Jan Sindel’s personal history?"
        ),
        # New question
        ("user", "{question}"),
        ("assistant", "")
    ]
)


In [19]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
prompt_step_back 
| ChatOpenAI(temperature=0) 
| StrOutputParser() 
| (lambda x: x.split("\n"))
)


In [20]:
# Define the pipeline using LangChain expression language
pipeline = generate_queries | (lambda queries: [{"query": query, "documents": sec_insights_documents_selector(query)} for query in queries])

# Function to run the entire pipeline
def run_pipeline(user_input: str) -> List[Dict[str, List[Dict[str, str]]]]:
    result_documents = pipeline.invoke({"question": user_input})
    return result_documents

# Example usage
user_input = "What are the biggest discussed risks for Amazon.com Inc. (AMZN)?"
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: What are the major risks associated with investing in Amazon.com Inc. (AMZN)?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------


In [21]:
user_input = "Summarize the operating expenses for Amazon over the last two quarters, highlighting key components such as cost of sales, fulfillment, technology and content, marketing, general and administrative, and other operating expenses. Include any significant changes or trends."
result_documents = run_pipeline(user_input)

# Output the results
for result in result_documents:
    print(f"Query: {result['query']}")
    for doc in result['documents']:
        print(doc)
    print("---"*30)

Query: What are the recent trends in Amazon's operating expenses and key components?
{'ticker': 'AMZN', 'document_type': 'Form 10K', 'year': '2023'}
{'ticker': 'AMZN', 'document_type': 'Form 10Q', 'year': '2023 Q3'}
------------------------------------------------------------------------------------------


# HyDE

In [25]:
from langchain.prompts import ChatPromptTemplate

# HyDE document genration
template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_hyde | ChatOpenAI(temperature=0) | StrOutputParser() 
)

# Run
question = "What are the biggest discussed risks for Amazon.com Inc. (AMZN)?"
generate_queries.invoke({"question":question})

#need retriever to proceed further..

"One of the biggest discussed risks for Amazon.com Inc. (AMZN) is regulatory scrutiny and antitrust concerns. As one of the largest e-commerce companies in the world, Amazon has faced increasing scrutiny from regulators and lawmakers regarding its market dominance and potential anti-competitive practices. This includes concerns over Amazon's control over online retail, cloud computing services, and its impact on small businesses. Additionally, Amazon's data practices and privacy policies have also come under scrutiny, with concerns over the company's collection and use of customer data. These regulatory risks could potentially lead to fines, legal battles, and restrictions on Amazon's business practices, which could impact the company's financial performance and reputation."