In [1]:
import os
import requests
from pydantic import BaseModel, ValidationError
from typing import Annotated, Literal, Optional, List, Dict, Tuple
from datetime import datetime
from autogen import ConversableAgent, register_function
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
# , StrOutputParser
from langchain.chains import LLMChain
import numpy as np
import pandas as pd
from io import StringIO
import spacy
from bs4 import BeautifulSoup
import time
import json
import re
import pdfkit
import logging
from urllib.request import urlopen
import certifi
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from sec_parser import Edgar10QParser, TreeBuilder, TitleElement, TextElement, TopSectionTitle
from langchain_community.document_transformers import LongContextReorder
from pathlib import Path
from typing_extensions import Annotated
from langchain_openai import OpenAIEmbeddings
# logging.basicConfig(level=logging.INFO)
from langchain_core.output_parsers import StrOutputParser



In [5]:
# Load environment variables from a .env file so that the API keys below
# do not have to be written into the notebook itself.
load_dotenv()

# Financial Modeling Prep (FMP) API key; None when FMP_API_KEY is not set.
FMP_API_KEY = os.getenv("FMP_API_KEY")

# OpenAI API key; None when OPENAI_API_KEY is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [2]:
import psycopg2
import os

# Use this function to empty the vector-store tables before re-ingesting documents.
# The database currently allows duplicate rows, so truncate first if you do not
# want duplicated chunks to interfere with the retriever.
def truncate_tables(tables, host, database, user, password, cascade=False, restart_identity=False):
    """
    Truncate one or more tables in a PostgreSQL database.

    Errors raised while building the statement, connecting, or executing are
    caught and reported by printing ``Error: ...``; nothing is returned either way.

    Args:
        tables (list of str or str): Table name(s) to truncate. A single name may be
            given as a plain string. Names are interpolated into the SQL statement
            verbatim, so they must come from trusted code, never from user input.
        host (str): The hostname of the PostgreSQL server.
        database (str): The name of the database.
        user (str): The username to connect to the database.
        password (str): The password to connect to the database.
        cascade (bool): If True, apply the CASCADE option to truncate dependent tables.
        restart_identity (bool): If True, reset any auto-increment counters in the tables.

    Returns:
        None
    """
    # A bare string would otherwise be joined character by character
    # ("foo" -> "f, o, o"), so treat it as a one-element list.
    tables = [tables] if isinstance(tables, str) else list(tables or [])

    # "TRUNCATE TABLE ;" is invalid SQL; report it without opening a connection.
    if not tables:
        print("Error: no tables specified; nothing to truncate.")
        return

    conn = None
    cursor = None
    try:
        # Build the statement first so malformed input never opens a connection.
        truncate_query = f"TRUNCATE TABLE {', '.join(tables)}"

        if restart_identity:
            truncate_query += " RESTART IDENTITY"

        if cascade:
            truncate_query += " CASCADE"

        truncate_query += ";"

        # Establish a connection to the database
        conn = psycopg2.connect(
            host=host,
            database=database,
            user=user,
            password=password
        )
        conn.autocommit = True  # Enable autocommit so that changes are committed immediately
        cursor = conn.cursor()

        # Execute the command
        cursor.execute(truncate_query)
        print("Tables truncated successfully!")

    except Exception as e:
        # Deliberately best-effort: report the problem instead of aborting the notebook.
        print(f"Error: {e}")

    finally:
        # Close the cursor and connection if they were created
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

# Example usage
# WARNING: this cell is destructive - it empties both PGVector tables every time it runs.
# Connection settings are read from the standard PostgreSQL environment variables
# (for example via the .env file) so that no password is stored in the notebook.
# When PGPASSWORD is unset, None is passed and the driver's own defaults apply.
tables_to_truncate = ['langchain_pg_embedding', 'langchain_pg_collection']
truncate_tables(
    tables=tables_to_truncate,
    host=os.getenv("PGHOST", "localhost"),
    database=os.getenv("PGDATABASE", "vector_db"),
    user=os.getenv("PGUSER", "postgres"),
    password=os.getenv("PGPASSWORD"),
    cascade=True,
    restart_identity=True
)


Tables truncated successfully!


In [3]:
# Define a Pydantic model for prompt input
# (a single required string field; validation fails if it is missing or not a string).
class PromptInput(BaseModel):
    # The user's query text. The second Annotated argument is a human-readable
    # description attached to the type hint.
    prompt: Annotated[str, "Input query or prompt"]

# Define a Pydantic model for document selector output
# Every field has a default, so the model can be built with no arguments.
# The field names and defaults mirror the parameters of document_selector_downloader.
class DocumentSelectorOutput(BaseModel):
    # Ticker symbol the query refers to; defaults to "AAPL".
    ticker: Annotated[str, "The ticker symbol related to user query (e.g., AAPL, MSFT, BTC-USD)."] = "AAPL"
    # Filing type in its long form (e.g. "Form 10-K"); defaults to "Form 10-K".
    document_type: Annotated[str, "The document type related to user query (e.g., Form 10-K, Form 10-Q, Form 8-K)."] = "Form 10-K"
    # Either a year ("YYYY") or a year plus quarter ("YYYY QX"); defaults to "2023".
    year: Annotated[str, "The year (format: YYYY) or quarter of the year (format: YYYY QX) related to user query (e.g., 2015, 2001, 2017 Q3)."] = "2023"


In [4]:
def get_jsonparsed_data(url: str) -> dict:
    """
    Fetches and parses JSON data from a given URL.

    TLS certificates are verified against the certifi CA bundle.

    Args:
        url (str): The URL to fetch data from.

    Returns:
        dict: The parsed JSON payload. Despite the annotation this is whatever
        ``json.loads`` produces, so a list-shaped response comes back as a list.

    Raises:
        urllib.error.URLError: If the request fails.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Local import: ssl is not among the notebook's top-level imports.
    import ssl

    # urlopen's `cafile` argument was deprecated in Python 3.6 and removed in 3.13;
    # an explicit SSLContext is the supported way to select the CA bundle.
    ssl_context = ssl.create_default_context(cafile=certifi.where())

    # The context manager closes the response even if reading or decoding fails.
    with urlopen(url, context=ssl_context) as response:
        data = response.read().decode("utf-8")

    # Parse the JSON text and return the resulting Python object
    return json.loads(data)

def is_within_quarter(date_str: str, year: str, start_month: int, end_month: int) -> bool:
    """
    Tell whether a date lies in a given year and inclusive month range.

    Only the first two '-'-separated fields of ``date_str`` are inspected, so
    anything that starts with 'YYYY-MM' (for example 'YYYY-MM-DD hh:mm:ss')
    is accepted.

    Args:
        date_str (str): Date text beginning with 'YYYY-MM'.
        year (str): The year the date must belong to, as a string.
        start_month (int): First month of the range (inclusive).
        end_month (int): Last month of the range (inclusive).

    Returns:
        bool: True when the year matches and the month is inside the range.
    """
    pieces = date_str.split('-')
    year_part, month_part = pieces[0], int(pieces[1])

    # A different year can never fall inside the requested quarter.
    if year_part != year:
        return False
    return start_month <= month_part <= end_month

def which_fiscal_year(date_str: str, fiscal_year: str) -> str:
    """
    Shift a year forward by one when the given filing date is in January-March.

    Args:
        date_str (str): Filing date starting with 'YYYY-MM-DD'; anything after
            the first whitespace (such as a time of day) is ignored.
        fiscal_year (str): The year to adjust, as an integer-like string.

    Returns:
        str: ``fiscal_year + 1`` if the filing month is 3 or lower, otherwise
        ``fiscal_year`` unchanged (both returned as strings).
    """
    date_only = date_str.split()[0]
    month_filed = datetime.strptime(date_only, "%Y-%m-%d").month
    base_year = int(fiscal_year)

    # Filings made in the first calendar quarter are attributed to the next year.
    return str(base_year + 1 if month_filed <= 3 else base_year)

In [5]:
def check_ticker_exists(ticker):
    """
    Check if a particular ticker exists using the Financial Modeling Prep API.

    The API key is taken from the module-level ``FMP_API_KEY``.

    Parameters:
    - ticker (str): The ticker symbol to check.

    Returns:
    - bool: True if the ticker search returned at least one result, False otherwise
      (including when ``ticker`` is empty, blank, or None).

    Raises:
    - requests.exceptions.RequestException: On network failure, timeout, or an
      HTTP error status.
    """
    # An empty query says nothing about a specific ticker, so answer without a request.
    query = str(ticker).strip() if ticker is not None else ""
    if not query:
        return False

    # NOTE(review): search-ticker looks like a fuzzy search, so a hit may not be an
    # exact symbol match - confirm against the FMP documentation.
    url = "https://financialmodelingprep.com/api/v3/search-ticker"
    params = {
        "query": query,
        "limit": 1,
        "exchange": "NASDAQ,NYSE,AMEX",
        "apikey": FMP_API_KEY,
    }
    # `params` URL-encodes the values; the timeout keeps a stalled connection
    # from hanging the notebook.
    response = requests.get(url, params=params, timeout=30)

    # Raises for 4xx/5xx responses.
    response.raise_for_status()

    # Any other non-200 success code carries no usable result list.
    if response.status_code != 200:
        return False

    return len(response.json()) > 0

# Example usage
# NOTE: running this cell performs a live request against the FMP API.
# `api_key` is assigned here but is not passed to check_ticker_exists, which
# reads the FMP_API_KEY global directly.
api_key = FMP_API_KEY
ticker = 'EVAP'
exists = check_ticker_exists(ticker)
print(f"Ticker {ticker} exists: {exists}")


Ticker EVAP exists: False


In [6]:
def document_selector_downloader(
    ticker: Annotated[str, "The ticker symbol related to user query (e.g., AAPL, MSFT, BTC-USD)."] = "AAPL",
    document_type: Annotated[Optional[str], "The document type related to user query (e.g., Form 10-K, Form 10-Q, Form 8-K)."] = "Form 10-K",
    year: Annotated[Optional[str], "The year (format: YYYY) or quarter of the year (format: YYYY QX) related to user query (e.g., 2015, 2001, 2017 Q3)."] = "2023"
) -> Tuple[str, str]:
    """
    Downloads a specific SEC filing and saves it as both PDF and HTML.

    All arguments are validated locally before any network request is made. If
    the HTML copy already exists in the output directory, nothing is downloaded
    and the expected paths are returned immediately.

    Args:
        ticker (str): The ticker symbol of the company.
        document_type (str): "Form 10-K" or "Form 10-Q"; other types are rejected.
        year (str): "YYYY" for a 10-K, or "YYYY QX" (X from 1 to 4) for a 10-Q.

    Returns:
        Tuple[str, str]: ``(pdf_path, html_path)`` - the PDF path comes first.

    Raises:
        ValueError: If the API key is missing, the document type is unsupported,
            the ticker or year is missing or malformed, the ticker is unknown,
            or no matching filing is found.
        RuntimeError: If the filing index cannot be fetched from FMP.
        FileNotFoundError: If the wkhtmltopdf executable cannot be found.
    """
    # Fetch the FMP API key from environment variables
    api_key = os.getenv('FMP_API_KEY')

    if api_key is None:
        raise ValueError("No API key found. Please set the FMP_API_KEY environment variable.")

    # Map document types to the respective filing type codes
    document_type_mapping = {
        "Form 10-K": "10-k",
        "Form 10-Q": "10-q"
    }

    # Map quarters to their respective start and end months
    quarter_map = {
        "Q1": (1, 3),
        "Q2": (4, 6),
        "Q3": (7, 9),
        "Q4": (10, 12)
    }

    # Sent with the direct download of the filing HTML
    headers = {
        'User-Agent': 'Traderware/j.kagathi@traderverse.io'
    }

    # ---- Validate every argument before touching the network ----
    filing_type = document_type_mapping.get(document_type)
    if not filing_type:
        raise ValueError(f"Unsupported document type: {document_type}")

    if not ticker:
        raise ValueError("Ticker is required.")

    if not year:
        raise ValueError("Year/Quarter is required.")

    # Tolerate callers (e.g. LLM tool calls) that pass the year as a number.
    year_text = str(year).strip()

    if filing_type == "10-k":
        if not re.fullmatch(r"\d{4}", year_text):
            raise ValueError(f"Form 10-K requires a year in 'YYYY' format, got {year!r}.")
        fiscal_year = year_text
        quarter = None
        output_dir = os.path.join("data", "sec-edgar-filings", ticker, filing_type, fiscal_year)
    else:
        parts = year_text.split()
        if len(parts) != 2 or not re.fullmatch(r"\d{4}", parts[0]) or parts[1].upper() not in quarter_map:
            raise ValueError(f"Form 10-Q requires a quarter in 'YYYY QX' format, got {year!r}.")
        # Upper-case the quarter so that "q3" and "Q3" share one directory.
        fiscal_year, quarter = parts[0], parts[1].upper()
        output_dir = os.path.join("data", "sec-edgar-filings", ticker, filing_type, fiscal_year, quarter)

    pdf_path = os.path.join(output_dir, 'primary_document.pdf')
    html_path = os.path.join(output_dir, 'primary_document.html')

    # Skip the download when the HTML copy is already on disk. Only the HTML file
    # is checked; the PDF path is returned without verifying that it exists.
    if os.path.exists(html_path):
        logging.info(f"HTML already exists at {html_path}. Skipping download.")
        return pdf_path, html_path

    # Only now spend a network round-trip on verifying the ticker.
    if not check_ticker_exists(ticker):
        raise ValueError("Ticker doesn't exists.")

    # Construct the URL for the API request
    url = f"https://financialmodelingprep.com/api/v3/sec_filings/{ticker}?type={filing_type}&page=0&apikey={api_key}"

    try:
        # Fetch data from the API
        fmp_results = get_jsonparsed_data(url)
    except Exception as e:
        raise RuntimeError(f"Failed to fetch data from FMP: {e}") from e

    # Anything other than a non-empty list (for example an error object) leaves
    # nothing to filter; fail clearly instead of with an IndexError/KeyError below.
    if not isinstance(fmp_results, list) or not fmp_results:
        raise ValueError("No matching SEC filings found.")

    # Filter results based on the filing date and type
    if filing_type == "10-k":
        # The first listed filing's month decides whether the requested year is
        # shifted forward by one (see which_fiscal_year).
        filing_year = which_fiscal_year(fmp_results[0]['fillingDate'], fiscal_year)
        filtered_fmp_results = [
            filing for filing in fmp_results
            if str(filing.get('fillingDate', '')).startswith(filing_year)
        ]
    else:
        start_month, end_month = quarter_map[quarter]
        filtered_fmp_results = [
            filing for filing in fmp_results
            if is_within_quarter(filing['fillingDate'], fiscal_year, start_month, end_month)
        ]

    if not filtered_fmp_results:
        raise ValueError("No matching SEC filings found.")

    # Path to the wkhtmltopdf executable (checked before creating any directory)
    path_to_wkhtmltopdf = os.getenv('WKHTMLTOPDF_PATH', r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')
    if not os.path.exists(path_to_wkhtmltopdf):
        raise FileNotFoundError(f"wkhtmltopdf not found at {path_to_wkhtmltopdf}")

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    url = filtered_fmp_results[0]['finalLink']
    config = pdfkit.configuration(wkhtmltopdf=path_to_wkhtmltopdf)

    try:
        # Save PDF
        pdfkit.from_url(url, pdf_path, configuration=config)
        logging.info(f"PDF generated and saved at {pdf_path}")

        # Save HTML (the timeout keeps a stalled download from hanging the pipeline)
        response = requests.get(url, headers=headers, timeout=60)
        response.raise_for_status()
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        logging.info(f"HTML generated and saved at {html_path}")
    except Exception as e:
        logging.error(f"Generation failed: {e}")
        raise

    return pdf_path, html_path



In [7]:
# Define the RAG-Fusion prompt template
# The template has a single placeholder, {question}. It asks the model to write
# out its analysis and per-aspect reasoning before listing four queries, so the
# model's reply is expected to contain more than just the queries themselves.
template = """
You are a helpful assistant that generates multiple search queries based on a single input query.

Step 1: Analyze the input query and understand its context.
Step 2: Break down the query into different aspects or subtopics that are relevant, and explain the reasoning for each aspect.
Step 3: Formulate multiple specific and relevant search queries based on the different aspects identified.

Input Query: {question}

Step 1: Analyze and understand the query.
Step 2: Break down and reasoning:
- Aspect 1: [Reason for aspect 1]
- Aspect 2: [Reason for aspect 2]
- Aspect 3: [Reason for aspect 3]
- Aspect 4: [Reason for aspect 4]

Step 3: Generate specific search queries.

Output (4 queries):
- Query 1
- Query 2
- Query 3
- Query 4
"""

# Wrap the template text in a chat prompt; it is the first stage of the query pipeline.
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

# Define the pipeline for generating queries:
#   prompt -> chat model (temperature 0) -> plain string -> list of lines.
# Each line is stripped and blank lines are dropped, so callers never receive
# empty strings as "queries".
# NOTE(review): the prompt also asks for analysis and reasoning text, so the list
# presumably still contains non-query lines - confirm whether they should be filtered.
generate_queries = (
    prompt_rag_fusion
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

def multi_query_generator_rag_fusion(
    input: Annotated[str, "User input prompt or query related to qualitative data to undergo query translation"]
) -> List[str]:
    """
    Translate one user query into several related search queries (RAG-Fusion).

    The query is handed to the module-level ``generate_queries`` pipeline as the
    ``question`` variable and the pipeline's result is returned unchanged.

    Args:
        input (str): The user's prompt or query to expand.

    Returns:
        List[str]: Whatever list of strings the pipeline produces.
    """
    payload = {"question": input}
    return generate_queries.invoke(payload)


In [31]:
from langchain.vectorstores.pgvector import PGVector

# Connection string for the PGVector store. It is read from the environment
# (for example from the .env file) so that database credentials are not stored
# in the notebook. The fallback deliberately carries no password.
# NOTE(review): with the fallback, the password is presumably picked up from the
# driver's own defaults (PGPASSWORD / ~/.pgpass) - confirm for your setup.
CONNECTION_STRING = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://postgres@localhost:5432/vector_db",
)
# Name of the PGVector collection that holds the SEC document chunks
COLLECTION_NAME = 'SEC_doc_chunks'



In [32]:
# Module-level cache for the vector store; filled on first use.
GLOBAL_VECTOR_STORE = None

def initialize_vector_store():
    """
    Return the shared PGVector store, creating it on the first call.

    The store is created with an empty document list against
    ``CONNECTION_STRING`` / ``COLLECTION_NAME`` using OpenAI embeddings, then
    cached in ``GLOBAL_VECTOR_STORE`` so later calls return the same object.

    Returns:
        The cached vector store instance.
    """
    global GLOBAL_VECTOR_STORE

    # Fast path: the store already exists.
    if GLOBAL_VECTOR_STORE is not None:
        return GLOBAL_VECTOR_STORE

    engine = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
    GLOBAL_VECTOR_STORE = PGVector.from_documents(
        documents=[],
        embedding=engine,
        connection_string=CONNECTION_STRING,
        collection_name=COLLECTION_NAME,
    )
    return GLOBAL_VECTOR_STORE


In [33]:
import re

def extract_entities_from_path(file_path):
    """
    Extracts ticker, document_type, and year/quarter (if applicable) from the file path.

    The path is expected to contain
    ``sec-edgar-filings/<ticker>/<10-k|10-q>/<YYYY>[/Q<n>]`` with either ``/`` or
    ``\\`` as the separator, so paths built on any operating system are recognised.

    Args:
        file_path (str): Single file path.

    Returns:
        list: A one-element list. On success the element is a dict with the keys
        ``ticker``, ``document_type`` and ``year`` (``"YYYY"`` or ``"YYYY QX"``);
        otherwise it is ``{"error": ..., "path": file_path}``.
    """
    # [\\/] accepts both separators; the ticker is whatever the directory is
    # called, so symbols containing digits, dots or hyphens (e.g. BTC-USD) match too.
    pattern = (
        r"sec-edgar-filings[\\/](?P<ticker>[^\\/]+)"
        r"[\\/](?P<document_type>10-[kq])"
        r"[\\/](?P<year>\d{4})"
        r"(?:[\\/]Q(?P<quarter>\d))?"
    )

    match = re.search(pattern, file_path)
    if not match:
        return [{"error": "Pattern not matched for path", "path": file_path}]

    entities = match.groupdict()
    # Fold the optional quarter into the year field ("2024" -> "2024 Q2").
    quarter = entities.pop("quarter")
    if quarter:
        entities["year"] = f"{entities['year']} Q{quarter}"

    return [entities]

# Example usage
# Three comma-separated, Windows-style relative paths (one 10-K and two 10-Qs).
file_paths = r"data\sec-edgar-filings\GOOGL\10-k\2023\primary_document.html,data\sec-edgar-filings\GOOGL\10-q\2024\Q2\primary_document.html,data\sec-edgar-filings\GOOGL\10-q\2023\Q3\primary_document.html"
for path in file_paths.split(','):
    print(path)
    # The result is a one-element list, hence the [0] below.
    entity = extract_entities_from_path(path)
    print(entity)
    print(entity[0].get("ticker"))


data\sec-edgar-filings\GOOGL\10-k\2023\primary_document.html
[{'ticker': 'GOOGL', 'document_type': '10-k', 'year': '2023'}]
GOOGL
data\sec-edgar-filings\GOOGL\10-q\2024\Q2\primary_document.html
[{'ticker': 'GOOGL', 'document_type': '10-q', 'year': '2024 Q2'}]
GOOGL
data\sec-edgar-filings\GOOGL\10-q\2023\Q3\primary_document.html
[{'ticker': 'GOOGL', 'document_type': '10-q', 'year': '2023 Q3'}]
GOOGL


In [34]:
# Function to convert sections to markdown format
def convert_to_markdown(sections, level_to_markdown):
    """
    Render parsed sections, and all of their descendants, as markdown text.

    Title elements become header lines whose prefix is looked up by level in
    ``level_to_markdown`` (falling back to '#'); text elements become plain
    lines; any other element type produces no output.

    Args:
        sections (list): Section nodes parsed from the document.
        level_to_markdown (dict): Maps a title level to its markdown header prefix.

    Returns:
        str: The rendered markdown, one line per emitted element.
    """
    def render(node):
        # One markdown line for a title or text node, nothing for other types.
        element = node.semantic_element
        if isinstance(element, (TopSectionTitle, TitleElement)):
            prefix = level_to_markdown.get(element.level, '#')
            return f"{prefix} {element.text}\n"
        if isinstance(element, TextElement):
            return f"{element.text}\n"
        return ""

    pieces = []
    for section in sections:
        # The section itself first, then everything nested beneath it.
        pieces.append(render(section))
        pieces.extend(render(child) for child in section.get_descendants())
    return "".join(pieces)

# Function to combine sentences with a buffer
def combine_sentences(sentences, buffer_size=1):
    """
    Attach to every sentence a space-joined window of itself and its neighbours.

    Each entry gains a ``"combined_sentence"`` key holding up to ``buffer_size``
    preceding sentences, the sentence itself, and up to ``buffer_size`` following
    sentences. The input list is modified in place and also returned.

    Args:
        sentences (list): Dicts that each carry a ``"sentence"`` string.
        buffer_size (int): How many neighbours to include on each side.

    Returns:
        list: The same list object, with ``"combined_sentence"`` filled in.
    """
    total = len(sentences)
    for position, entry in enumerate(sentences):
        # A non-positive buffer means "no neighbours", never an empty window.
        reach = max(buffer_size, 0)
        window_start = max(position - reach, 0)
        window_end = min(position + 1 + reach, total)
        entry["combined_sentence"] = " ".join(
            neighbour["sentence"] for neighbour in sentences[window_start:window_end]
        )
    return sentences

# Function to split sentences into chunks based on distances and a percentile threshold
def calculate_chunk_sizes(sentences, distances, threshold):
    """
    Groups sentences into text chunks using the distances between neighbours.

    A chunk boundary is placed after every sentence whose distance to the next
    sentence is strictly greater than the ``threshold``-th percentile of all
    distances. Despite the name, the function returns the chunks themselves.

    Args:
        sentences (list): Dicts that each carry a ``"sentence"`` string.
        distances (list): ``distances[i]`` is the distance between sentence ``i``
            and sentence ``i + 1``.
        threshold (float): Percentile (0-100) of ``distances`` used as the cut-off.

    Returns:
        list: The text chunks, each a space-joined run of sentences. With no
        distances (zero or one sentence) everything is returned as one chunk,
        or an empty list when there are no sentences.
    """
    # A percentile of an empty sequence is undefined, so handle 0/1 sentences here.
    if len(distances) == 0:
        if not sentences:
            return []
        return [" ".join(entry["sentence"] for entry in sentences)]

    # Determine the breakpoint distance threshold
    breakpoint_distance_threshold = np.percentile(distances, threshold)
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

    start_index = 0
    chunks = []
    # Close a chunk after each sentence whose distance to its successor is above the cut-off
    for index in indices_above_thresh:
        group = sentences[start_index : index + 1]
        chunks.append(" ".join(entry["sentence"] for entry in group))
        start_index = index + 1

    # Add remaining sentences as a final chunk
    if start_index < len(sentences):
        chunks.append(" ".join(entry["sentence"] for entry in sentences[start_index:]))
    return chunks

# Function to find the appropriate threshold for chunk sizes
def find_appropriate_threshold(sentences, distances, initial_threshold, ceiling):
    """
    Finds the appropriate distance threshold for creating chunks within a size ceiling.

    Starting at ``initial_threshold`` the percentile is lowered by 1 at a time
    until no chunk exceeds ``ceiling`` words or the threshold reaches 0.

    Args:
        sentences (list): A list of sentences.
        distances (list): A list of cosine distances between sentence embeddings.
        initial_threshold (float): The initial percentile threshold; must be positive.
        ceiling (int): The maximum chunk size in words.

    Returns:
        tuple: ``(threshold, chunks, chunk_sizes)``. If no threshold satisfied the
        ceiling, ``threshold`` is 0 or below and ``chunks`` are those from the
        last threshold that was tried, so they may still exceed the ceiling.

    Raises:
        ValueError: If ``initial_threshold`` is not positive (no threshold could
            be tried, so there would be nothing to return).
    """
    if initial_threshold <= 0:
        raise ValueError(f"initial_threshold must be positive, got {initial_threshold!r}")

    threshold = initial_threshold
    while threshold > 0:
        chunks = calculate_chunk_sizes(sentences, distances, threshold)
        chunk_sizes = [len(chunk.split()) for chunk in chunks]
        # default=0 keeps an empty document (no chunks at all) from raising in max().
        if max(chunk_sizes, default=0) <= ceiling:
            break
        threshold -= 1
    return threshold, chunks, chunk_sizes

# Function to calculate cosine distances between sentence embeddings
def calculate_cosine_distances(sentences):
    """
    Measure how far each sentence's embedding is from the next sentence's.

    For every adjacent pair the distance is ``1 - cosine_similarity`` of their
    ``"combined_sentence_embedding"`` values. The distance is also stored on the
    first sentence of the pair under ``"distance_to_next"``; the last sentence
    gets no such key.

    Args:
        sentences (list): Dicts that each carry a ``"combined_sentence_embedding"``.

    Returns:
        tuple: ``(distances, sentences)`` - the list of distances (one fewer than
        the number of sentences) and the same, now annotated, sentence list.
    """
    gaps = []
    for current, following in zip(sentences, sentences[1:]):
        score = cosine_similarity(
            [current["combined_sentence_embedding"]],
            [following["combined_sentence_embedding"]],
        )[0][0]
        gap = 1 - score
        gaps.append(gap)
        current["distance_to_next"] = gap
    return gaps, sentences

# Main function to process financial documents and retrieve relevant sections
def process_financial_documents(file_paths: Annotated[str, "Comma-separated list of paths to the financial document files."], input_query: Annotated[str, "User input prompt or query related to qualitative data to undergo query translation"]) -> List[str]:
    """
    Processes multiple financial documents, chunks them, stores the chunks in the
    global PG Vector store, and retrieves the chunks most similar to the query.

    Args:
        file_paths (str): Comma-separated list of paths to the financial document
            files. Whitespace around each path and empty entries are ignored.
        input_query (str): User input prompt or query related to qualitative data to undergo query translation.

    Returns:
        List[str]: One string per retrieved chunk, formatted as
        ``"Document <n> <chunk text>\\n"``.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    # Chunking constants
    breakpoint_percentile_threshold = 95  # starting percentile of sentence distances used as a boundary
    chunk_size_ceiling = 2000  # maximum chunk size in words
    all_chunks = []
    retriever_result = []
    embeddings_engine = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

    # Get or initialize the global vector store
    vectorstore = initialize_vector_store()

    # Tolerate "a.html, b.html" (space after the comma) and stray or trailing
    # commas; unstripped paths would fail in open().
    paths = [candidate.strip() for candidate in file_paths.split(',') if candidate.strip()]

    # Process each file path
    for num, file_path in enumerate(paths):
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # One-element list; on a non-matching path the lookups below yield None.
        entity = extract_entities_from_path(file_path)

        before_chunking = len(all_chunks)

        # Parse the document content
        parser = Edgar10QParser()
        elements = parser.parse(html_content)

        # Build the document structure
        tree_builder = TreeBuilder()
        top_level_sections = [item for part in tree_builder.build(elements) for item in part.children]

        # Determine the markdown levels for the sections
        levels = sorted(
            {k.semantic_element.level for k in top_level_sections if isinstance(k.semantic_element, (TopSectionTitle, TitleElement))}
        )
        level_to_markdown = {level: "#" * (i + 2) for i, level in enumerate(levels)}

        # Convert the document to markdown
        raw_essay = convert_to_markdown(top_level_sections, level_to_markdown)

        # Split the markdown content into sentences (after '.', '#' or ':' followed by whitespace)
        single_sentences_list = re.split(r"(?<=[.#:])\s+", raw_essay)
        sentences = [{"sentence": x, "index": i} for i, x in enumerate(single_sentences_list)]

        # Combine sentences with context
        sentences = combine_sentences(sentences)

        # Embed the combined sentences
        embeddings = embeddings_engine.embed_documents([x["combined_sentence"] for x in sentences])

        for i, sentence in enumerate(sentences):
            sentence["combined_sentence_embedding"] = embeddings[i]

        # Calculate cosine distances between sentence embeddings
        distances, sentences = calculate_cosine_distances(sentences)

        # Find the appropriate threshold for chunk sizes (only the chunks are needed here)
        _, chunks, _ = find_appropriate_threshold(
            sentences, distances, breakpoint_percentile_threshold, chunk_size_ceiling
        )

        # Assign document ID and store the chunks.
        # NOTE: document_id is only the file's base name, so it is identical for
        # files that share a name; ticker/document_type/year tell them apart.
        document_id = os.path.basename(file_path)
        for chunk in chunks:
            all_chunks.append(Document(page_content=chunk, metadata={"document_id": document_id, "ticker": entity[0].get("ticker"), "document_type": entity[0].get("document_type"), "year": entity[0].get("year")}))

        after_chunking = len(all_chunks)

        print(f"Document{num+1}: Number of documents added to vector store are {after_chunking - before_chunking}")

    # After processing all documents, add them to the global vector store.
    # Skipped when nothing was produced, so the store is never asked to embed an
    # empty batch. NOTE: every call adds its chunks again; nothing here removes
    # chunks that an earlier call already stored.
    if all_chunks:
        vectorstore.add_documents(all_chunks)

    # Retrieve the top k relevant documents based on the input query
    docs = vectorstore.similarity_search_with_score(input_query, k=5)

    # Reorder the (document, score) pairs for better relevance
    reranking = LongContextReorder()
    reranked_docs = reranking.transform_documents(docs)

    # Collect and print the retrieved documents
    for i, (doc, score) in enumerate(reranked_docs):
        print(f"Document {i+1} (Score: {score}):\n{doc.page_content}\n")
        retriever_result.append(f"Document {i+1} {doc.page_content}\n")

    return retriever_result



In [35]:
# Ingest three locally stored GOOGL filings (one 10-K and two 10-Qs) and query them.
# NOTE: every run of this cell adds the files' chunks to the vector store again.
process_financial_documents(r"data\sec-edgar-filings\GOOGL\10-k\2023\primary_document.html,data\sec-edgar-filings\GOOGL\10-q\2024\Q2\primary_document.html,data\sec-edgar-filings\GOOGL\10-q\2023\Q3\primary_document.html","What are the biggest recently discussed risks for Google?")

  warn_deprecated(
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document1: Number of documents added to vector store are 405
Document2: Number of documents added to vector store are 44
Document3: Number of documents added to vector store are 57
Document 1 (Score: 0.16804567004066762):
Also, protecting our intellectual property rights is costly and time consuming. Any increase in the unauthorized use of our intellectual property could make it more expensive to do business and harm our financial condition and operating results. Our strong brands have significantly contributed to the success of our business. Maintaining and enhancing the brands within Google Services, Google Cloud, and Other Bets increases our ability to enter new categories and launch new and innovative products and services that better serve the needs of our users, advertisers, customers, content providers, and other partners. Our brands have been, and may in the future be, negatively affected by a number of factors, including, among others, reputational issues, third-party content 

['Document 1 Also, protecting our intellectual property rights is costly and time consuming. Any increase in the unauthorized use of our intellectual property could make it more expensive to do business and harm our financial condition and operating results. Our strong brands have significantly contributed to the success of our business. Maintaining and enhancing the brands within Google Services, Google Cloud, and Other Bets increases our ability to enter new categories and launch new and innovative products and services that better serve the needs of our users, advertisers, customers, content providers, and other partners. Our brands have been, and may in the future be, negatively affected by a number of factors, including, among others, reputational issues, third-party content shared on our platforms, data privacy and security issues and developments, and product or technical performance failures. For example, if we fail to respond appropriately to the sharing of misinformation or o

In [12]:
# Define the multi-query generator agent
# Note: the backslash at the end of the first system_message line continues the
# string literal, so the next line's leading spaces are part of the message.
multi_query_generator_agent = ConversableAgent(
    name="Multi_Query_Generator_Agent",
    system_message="You return me the list of multiple queries from a single input query.\
        Make a note that you should call the multi_query_generator_rag_fusion tool.",
    llm_config={"config_list": [{"model": "gpt-4o-mini", "api_key": os.environ["OPENAI_API_KEY"]}]},
    human_input_mode="NEVER",
)

# This agent turns a single input query into multiple queries.
# It is configured with the gpt-4o-mini model and never asks for human input.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.

# Define the document downloader agent
document_downloader_agent = ConversableAgent(
    name="Document_Downloader_Agent",
    system_message="You select the required ticker/s, financial document/s and year/s for each of the queries generated by multi_query_generator_agent and then download the required financial documents only if the document does not already exist in the folder.",
    llm_config={"config_list": [{"model": "gpt-4o-mini", "api_key": os.environ["OPENAI_API_KEY"]}]},
    human_input_mode="NEVER",
)

# This agent is instructed to pick the ticker(s), document type(s) and year(s) for
# the queries produced by multi_query_generator_agent, and to download a document
# only when it is not already present in the folder.
# It is configured with the gpt-4o-mini model and never asks for human input.

# Define the document processor agent
# The instructions match the tool's signature: process_financial_documents takes
# the paths as ONE comma-separated string (file_paths), not as a list, plus the
# user's query (input_query).
document_processor_agent = ConversableAgent(
    name="Document_Processor_Agent",
    system_message="You gather all the required html financial documents' paths, join them into a single comma-separated string and pass it as the file_paths argument to process_financial_documents tool, together with the user's query as the input_query argument.",
    llm_config={"config_list": [{"model": "gpt-4o-mini", "api_key": os.environ["OPENAI_API_KEY"]}]},
    human_input_mode="NEVER",
)

# This agent processes the downloaded financial documents.
# It collects the paths of all the required HTML financial documents and passes
# them, comma-separated, to the process_financial_documents tool.
# It uses the GPT-4o-mini model and does not require human input.

# Define the user proxy agent
user_proxy = ConversableAgent(
    name="User",
    llm_config=False,
    # A message ends the conversation when it has content containing "TERMINATE".
    is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg["content"],
    human_input_mode="NEVER",
)

# This agent acts as a proxy for the user.
# It has no language model attached (llm_config=False).
# It treats any message whose content contains the keyword "TERMINATE" as the end of the conversation.
# It never asks for human input.




In [13]:
# Tool registration: multi_query_generator_rag_fusion.
# multi_query_generator_agent is the caller of the tool and user_proxy is the
# executor. The description presents the tool as step 1 of the RAG pipeline
# (query translation), to be run before 'document_selector_downloader'.
register_function(
    multi_query_generator_rag_fusion,
    name="multi_query_generator_rag_fusion",
    description=(
        "This function helps in query translation of a user prompt "
        "related to qualitative financial data. "
        "Run this tool before running 'document_selector_downloader' tool "
        "and this is the step 1 of rag pipeline for answering questions "
        "related to qualitative financial data."
    ),
    caller=multi_query_generator_agent,
    executor=user_proxy,
)

# Tool registration: document_selector_downloader.
# document_downloader_agent is the caller of the tool and user_proxy is the
# executor. The description presents the tool as selecting the financial
# documents for each generated query and downloading those not already in the
# folder.
register_function(
    document_selector_downloader,
    name="document_selector_downloader",
    description=(
        "This function helps in selecting financial documents for each of the "
        "multiquery generated for the input query and then downloading it, "
        "if it does not exists already in the folder."
    ),
    caller=document_downloader_agent,
    executor=user_proxy,
)

# Tool registration: process_financial_documents.
# document_processor_agent is the caller of the tool and user_proxy is the
# executor. The description presents the tool as chunking several financial
# documents into a single FAISS vector store, then retrieving and displaying
# the results.
register_function(
    process_financial_documents,
    name="process_financial_documents",
    description=(
        "This function helps in processesing multiple financial documents, "
        "chunks them, stores the chunks in a single FAISS vector store, "
        "retrieves it and displays it."
    ),
    caller=document_processor_agent,
    executor=user_proxy,
)




In [22]:
# Question that is pushed through the three-step pipeline below
input_query = "What are the biggest recently discussed risks for Google?"

# user_proxy opens one chat per pipeline step, in list order. Each chat is
# limited to two turns and summarised by its last message.
chat_results = user_proxy.initiate_chats(
    [
        # Step 1 - query generation: the raw question plus today's date
        dict(
            recipient=multi_query_generator_agent,
            message=f"""
            {input_query}

            Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')} and follow the instructions keeping in mind the input query as well as the time period.
            """,
            max_turns=2,
            summary_method="last_msg",
        ),
        # Step 2 - document selection/download: includes the Form 10-Q format rules
        dict(
            recipient=document_downloader_agent,
            message=f"""
            You select the required ticker/s, financial document/s and year/s for each of the queries generated by multi_query_generator_agent and then download the required financial documents only if the document does not already exists in the folder.
            Make a note that if Form 10-Q is selected as the argument, then use the correct format for Form 10-Q (YYYY QX).
            Make a note that there is no fourth quarter Q4, companies file for Form 10K instead of filing Form 10Q. So you can only choose Q1, Q2 or Q3.
            Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')}.
            Make a note that you have to send the file paths generated to the next tool.
            Make a note that if you don't recieve any kind of inputs then use the default values. 
            """,
            max_turns=2,
            summary_method="last_msg",
        ),
        # Step 3 - document processing: relays the downloaded HTML paths and the original query
        dict(
            recipient=document_processor_agent,
            message=f"""
            Strictly gather all the document's HTML file paths generated by the document_downloader_agent, do not make your own file path but relay the file path as generated by the document_downloader_agent, store it in a list and pass it as an argument along with the input query: {input_query}, to the process_financial_documents tool
            in order to process multiple financial documents, chunk them, store the chunks in a single FAISS vector store, retrieve documents and display it.
            Make a note that the input query to be used is the same argument which was passed to the multi_query_generator_agent that is {input_query}.
            """,
            max_turns=2,
            summary_method="last_msg",
        ),
    ]
)

# Explanation of each step:
# 1. The user_proxy initiates a chat with multi_query_generator_agent, providing the input query and current date.
#    - The agent is instructed to follow the input query and the specified time period while generating multiple queries.
# 2. The user_proxy initiates a chat with document_downloader_agent, passing the queries generated by the multi_query_generator_agent.
#    - The agent is instructed to select and download the required financial documents based on the generated queries.
#    - It is provided with guidelines on how to handle Form 10-Q formats and the correct quarters to choose.
#    - The current date is also provided for context.
# 3. The user_proxy initiates a chat with document_processor_agent, passing the file paths generated by the document_downloader_agent.
#    - The agent is instructed to gather all the document paths, store them in a list, and pass them as an argument to the process_financial_documents tool along with the input query.
#    - It is reminded to use the same input query that was passed to the multi_query_generator_agent.


[34m
********************************************************************************[0m
[34mStarting a new chat....[0m
[34m
********************************************************************************[0m
[33mUser[0m (to Multi_Query_Generator_Agent):


            What are the biggest recently discussed risks for Google?

            Make a note that today's date is 2024-08-14 and follow the instructions keeping in mind the input query as well as the time period.
            

--------------------------------------------------------------------------------
[33mMulti_Query_Generator_Agent[0m (to User):

[32m***** Suggested tool call (call_2YV8Fl0oK7RnmdbX7O7AgHfj): multi_query_generator_rag_fusion *****[0m
Arguments: 
{"input":"biggest recently discussed risks for Google as of August 2024"}
[32m*************************************************************************************************[0m

---------------------------------------------------------------------------

  response = urlopen(url, cafile=certifi.where())


[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[33mUser[0m (to Document_Downloader_Agent):

[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_Lv1iNeXp3e7Gm9u9ryAZgmiT) *****[0m
["data\\sec-edgar-filings\\GOOGL\\10-k\\2023\\primary_document.pdf", "data\\sec-edgar-filings\\GOOGL\\10-k\\2023\\primary_document.html"]
[32m**********************************************************************[0m

--------------------------------------------------------------------------------
[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_fZ78lQ5TVk6YLQqA97BiDPCx) *****[0m
["data\\sec-edgar-filings\\GOOGL\\10-q\\2024\\Q2\\primary_document.pdf", "data\\sec-edgar-filings\\GOOGL\\10-q\\2024\\Q2\\primary_document.html"]
[32m**********************************************************************[0m

--------------------------------------------------------------------------------
[33mUser[0m (to Doc

  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document1: Number of documents added to vector store are 405
Document2: Number of documents added to vector store are 44
Document3: Number of documents added to vector store are 39
Document 1 (Score: 0.16804567004066762):
Also, protecting our intellectual property rights is costly and time consuming. Any increase in the unauthorized use of our intellectual property could make it more expensive to do business and harm our financial condition and operating results. Our strong brands have significantly contributed to the success of our business. Maintaining and enhancing the brands within Google Services, Google Cloud, and Other Bets increases our ability to enter new categories and launch new and innovative products and services that better serve the needs of our users, advertisers, customers, content providers, and other partners. Our brands have been, and may in the future be, negatively affected by a number of factors, including, among others, reputational issues, third-party content 

In [14]:

# Multi-company question run through the same three-step pipeline:
# query generation -> document selection/download -> document processing.
input_query = "What are the strengths and weakness of NVDA and AMD over the past two years?"

# user_proxy opens one chat per step, in list order; each chat is limited to
# two turns and summarised by its last message.
chat_results = user_proxy.initiate_chats(
    [
        # Step 1: pass the raw question and today's date to the query generator.
        {
            "recipient": multi_query_generator_agent,
            "message": f"""
{input_query}

Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')} and follow the instructions keeping in mind the input query as well as the time period.
""",
            "max_turns": 2,
            "summary_method": "last_msg",
        },
        # Step 2: choose and download the filings.
        # Prompt fixes: removed a stray '"' that trailed the first sentence and
        # corrected "exits" to "exist", matching the agent's system message.
        {
            "recipient": document_downloader_agent,
            "message": f"""You select the required ticker/s, financial document/s and year/s for each of the queries generated by multi_query_generator_agent and then download the required financial documents only if the document does not already exist in the folder.
            Make a note that if Form 10-Q is selected as the argument, then use the correct format for Form 10-Q (YYYY QX).
            Make a note that there is no fourth quarter Q4, companies file for Form 10K instead of filing Form 10Q. So you can only choose Q1, Q2 or Q3.
            Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')}.
            Make a note that you have to send the file paths generated to the next tool.
            Make a note that if you don't recieve any kind of inputs then use the default values. 
                        """,
            "max_turns": 2,
            "summary_method": "last_msg",
        },
        # Step 3: relay the downloaded HTML paths and the original question to
        # the processing tool. Prompt fix: "donot" -> "do not".
        {
            "recipient": document_processor_agent,
            "message": f"""Strictly gather all the document's html file path generated by the document_downloader_agent, do not make your own file path but relay the file path as generated by the document_downloader_agent, store it in a list and pass it as an argument along with the input query: {input_query}, to the process_financial_documents tool\
            in order to process multiple financial documents, chunk them, store the chunks in a single FAISS vector store, retrieve documents and display it.
            Make a note that the input query to be used is the same argument which was passed to the multi_query_generator_agent that is {input_query}.
            """,
            "max_turns": 2,
            "summary_method": "last_msg",
        },
    ]
)

[34m
********************************************************************************[0m
[34mStarting a new chat....[0m
[34m
********************************************************************************[0m
[33mUser[0m (to Multi_Query_Generator_Agent):


What are the strengths and weakness of NVDA and AMD over the past two years?

Make a note that today's date is 2024-08-14 and follow the instructions keeping in mind the input query as well as the time period.


--------------------------------------------------------------------------------
[33mMulti_Query_Generator_Agent[0m (to User):

[32m***** Suggested tool call (call_HUwSUI6x99h4zqriTLFinoAE): multi_query_generator_rag_fusion *****[0m
Arguments: 
{"input":"strengths and weaknesses of NVDA and AMD over the past two years"}
[32m*************************************************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>

  response = urlopen(url, cafile=certifi.where())


[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[33mUser[0m (to Document_Downloader_Agent):

[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_oX3AyfWbpiXqakCtSweBGNr5) *****[0m
["data\\sec-edgar-filings\\NVDA\\10-k\\2022\\primary_document.pdf", "data\\sec-edgar-filings\\NVDA\\10-k\\2022\\primary_document.html"]
[32m**********************************************************************[0m

--------------------------------------------------------------------------------
[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_1nj5S7HM9aoN7NFlULK7ClwN) *****[0m
["data\\sec-edgar-filings\\NVDA\\10-k\\2021\\primary_document.pdf", "data\\sec-edgar-filings\\NVDA\\10-k\\2021\\primary_d

  warn_deprecated(
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document1: Number of documents added to vector store are 171


  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document2: Number of documents added to vector store are 181
Document3: Number of documents added to vector store are 82


  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document4: Number of documents added to vector store are 337


  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document5: Number of documents added to vector store are 253
Document6: Number of documents added to vector store are 145
Document 1 (Score: 0.20315702027377025):
Comparison of Five-Year Cumulative Total Returns
# Advanced Micro Devices, S&P 500 Index and S&P 500 Semiconductor Index
The following graph shows a five-year comparison of cumulative total return on our common stock, the S&P 500 Index and the S&P 500 Semiconductor Index from December 31, 2016 through December 25, 2021. The past performance of our common stock is no indication of future performance.

Document 2 (Score: 0.20445790343972614):
Our software solutions carry certification for a number of professional software vendor applications as well as being optimized for modern gaming titles. In November 2021, AMD introduced the Radeon PRO V620, a data center GPU using the RDNA 2 architecture and incorporating new capabilities including ray tracing acceleration and Infinity Cache. ##

Document 3 (Score: 0.20519671533047756):
O

In [40]:
# Non-financial question run through the same three-step pipeline.
# NOTE(review): presumably an off-topic probe to see how the pipeline reacts
# to a query with no obvious ticker - confirm intent.
input_query = "What is life?"

# user_proxy opens one chat per step, in list order; each chat is limited to
# two turns and summarised by its last message.
chat_results = user_proxy.initiate_chats(
    [
        # Step 1: pass the raw question and today's date to the query generator.
        {
            "recipient": multi_query_generator_agent,
            "message": f"""
{input_query}

Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')} and follow the instructions keeping in mind the input query as well as the time period.
""",
            "max_turns": 2,
            "summary_method": "last_msg",
        },
        # Step 2: choose and download the filings.
        # Prompt fixes: removed a stray '"' that trailed the first sentence and
        # corrected "exits" to "exist", matching the agent's system message.
        {
            "recipient": document_downloader_agent,
            "message": f"""You select the required ticker/s, financial document/s and year/s for each of the queries generated by multi_query_generator_agent and then download the required financial documents only if the document does not already exist in the folder.
            Make a note that if Form 10-Q is selected as the argument, then use the correct format for Form 10-Q (YYYY QX).
            Make a note that there is no fourth quarter Q4, companies file for Form 10K instead of filing Form 10Q. So you can only choose Q1, Q2 or Q3.
            Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')}.
            Make a note that you have to send the file paths generated to the next tool.
            Make a note that if you don't recieve any kind of inputs then use the default values. 
                        """,
            "max_turns": 2,
            "summary_method": "last_msg",
        },
        # Step 3: relay the downloaded HTML paths and the original question to
        # the processing tool. Prompt fix: "donot" -> "do not".
        {
            "recipient": document_processor_agent,
            "message": f"""Strictly gather all the document's html file path generated by the document_downloader_agent, do not make your own file path but relay the file path as generated by the document_downloader_agent, store it in a list and pass it as an argument along with the input query: {input_query}, to the process_financial_documents tool\
            in order to process multiple financial documents, chunk them, store the chunks in a single FAISS vector store, retrieve documents and display it.
            Make a note that the input query to be used is the same argument which was passed to the multi_query_generator_agent that is {input_query}.
            """,
            "max_turns": 2,
            "summary_method": "last_msg",
        },
    ]
)

[34m
********************************************************************************[0m
[34mStarting a new chat....[0m
[34m
********************************************************************************[0m
[33mUser[0m (to Multi_Query_Generator_Agent):


What is life?

Make a note that today's date is 2024-07-31 and follow the instructions keeping in mind the input query as well as the time period.


--------------------------------------------------------------------------------
[33mMulti_Query_Generator_Agent[0m (to User):

[32m***** Suggested tool call (call_WWywBIcjaJHvT0HiHKIqjo4S): multi_query_generator_rag_fusion *****[0m
Arguments: 
{"input":"What is life?"}
[32m*************************************************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION multi_query_generator_rag_fusion...[0m
[33mUser[0m (to Multi_Query_Generator_Agent):


  response = urlopen(url, cafile=certifi.where())


[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[33mUser[0m (to Document_Downloader_Agent):

[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_HI7jpAcSpNx8gxoZnhGkNtRq) *****[0m
["data\\sec-edgar-filings\\LIFE\\10-k\\2023\\primary_document.pdf", "data\\sec-edgar-filings\\LIFE\\10-k\\2023\\primary_document.html"]
[32m**********************************************************************[0m

--------------------------------------------------------------------------------
[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_ahXpSwTWqKogT6fTWE6CQzg1) *****[0m
Error: No matching SEC filings found.
[32m**********************************************************************[0m

--------------------------------------------------------------------------------
[33mUser[0m (to Document_Downloader_Agent):

[32m***** Res

  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  return self.parse_from_tags(
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document1: Number of documents added to vector store are 350
Document2: Number of documents added to vector store are 182
Number of vectors in the store: 532
Document 1:
Item 2.

Document 2:
Item 1A.

Document 3:
5.

Document 4:
None.

Document 5:
Item 1A.

[33mUser[0m (to Document_Processor_Agent):

[33mUser[0m (to Document_Processor_Agent):

[32m***** Response from calling tool (call_6HzhvVcWZxL0Jhk5u1MYQmPe) *****[0m
["Document 1:\nItem 2.\n", "Document 2:\nItem 1A.\n", "Document 3:\n5.\n", "Document 4:\nNone.\n", "Document 5:\nItem 1A.\n"]
[32m**********************************************************************[0m

--------------------------------------------------------------------------------
[33mDocument_Processor_Agent[0m (to User):

The financial documents have been processed successfully. Here are the extracted contents:

1. **Document 1:** 
   - Item 2.
   
2. **Document 2:** 
   - Item 1A.
   
3. **Document 3:** 
   - 5.
   
4. **Document 4:** 
   - None.
   
5.

In [15]:
# Non-financial question run through the same three-step pipeline.
# NOTE(review): presumably an off-topic probe to see how the pipeline reacts
# to a query with no obvious ticker - confirm intent.
input_query = "What is evaporation?"

# user_proxy opens one chat per step, in list order; each chat is limited to
# two turns and summarised by its last message.
chat_results = user_proxy.initiate_chats(
    [
        # Step 1: pass the raw question and today's date to the query generator.
        {
            "recipient": multi_query_generator_agent,
            "message": f"""
{input_query}

Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')} and follow the instructions keeping in mind the input query as well as the time period.
""",
            "max_turns": 2,
            "summary_method": "last_msg",
        },
        # Step 2: choose and download the filings.
        # Prompt fixes: removed a stray '"' that trailed the first sentence and
        # corrected "exits" to "exist", matching the agent's system message.
        {
            "recipient": document_downloader_agent,
            "message": f"""You select the required ticker/s, financial document/s and year/s for each of the queries generated by multi_query_generator_agent and then download the required financial documents only if the document does not already exist in the folder.
            Make a note that if Form 10-Q is selected as the argument, then use the correct format for Form 10-Q (YYYY QX).
            Make a note that there is no fourth quarter Q4, companies file for Form 10K instead of filing Form 10Q. So you can only choose Q1, Q2 or Q3.
            Make a note that today's date is {datetime.now().strftime('%Y-%m-%d')}.
            Make a note that you have to send the file paths generated to the next tool.
            Make a note that if you don't recieve any kind of inputs then use the default values. 
                        """,
            "max_turns": 2,
            "summary_method": "last_msg",
        },
        # Step 3: relay the downloaded HTML paths and the original question to
        # the processing tool. Prompt fix: "donot" -> "do not".
        {
            "recipient": document_processor_agent,
            "message": f"""Strictly gather all the document's html file path generated by the document_downloader_agent, do not make your own file path but relay the file path as generated by the document_downloader_agent, store it in a list and pass it as an argument along with the input query: {input_query}, to the process_financial_documents tool\
            in order to process multiple financial documents, chunk them, store the chunks in a single FAISS vector store, retrieve documents and display it.
            Make a note that the input query to be used is the same argument which was passed to the multi_query_generator_agent that is {input_query}.
            """,
            "max_turns": 2,
            "summary_method": "last_msg",
        },
    ]
)

[34m
********************************************************************************[0m
[34mStarting a new chat....[0m
[34m
********************************************************************************[0m
[33mUser[0m (to Multi_Query_Generator_Agent):


What is evaporation?

Make a note that today's date is 2024-08-14 and follow the instructions keeping in mind the input query as well as the time period.


--------------------------------------------------------------------------------
[33mMulti_Query_Generator_Agent[0m (to User):

[32m***** Suggested tool call (call_3aFhGr3Jvb0XhswUW6BhYNX4): multi_query_generator_rag_fusion *****[0m
Arguments: 
{"input":"What is evaporation?"}
[32m*************************************************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION multi_query_generator_rag_fusion...[0m
[33mUser[0m (to Multi_Query_Gene

  response = urlopen(url, cafile=certifi.where())


[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[35m
>>>>>>>> EXECUTING FUNCTION document_selector_downloader...[0m
[33mUser[0m (to Document_Downloader_Agent):

[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_6scfof6nlwPwF7cYNrOiKNKX) *****[0m
["data\\sec-edgar-filings\\AAPL\\10-k\\2023\\primary_document.pdf", "data\\sec-edgar-filings\\AAPL\\10-k\\2023\\primary_document.html"]
[32m**********************************************************************[0m

--------------------------------------------------------------------------------
[33mUser[0m (to Document_Downloader_Agent):

[32m***** Response from calling tool (call_F6iY03gdPV1iJQ75DB5sxdam) *****[0m
["data\\sec-edgar-filings\\MSFT\\10-q\\2024\\Q2\\primary_document.pdf", "data\\sec-edgar-filings\\MSFT\\10-q\\2024\\Q2\\primary_document.html"]
[32m**********************************************************************[0m

------------------------------------

  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document1: Number of documents added to vector store are 158
Document2: Number of documents added to vector store are 131


  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)
  elements = step.process(elements)


Document3: Number of documents added to vector store are 405
Document4: Number of documents added to vector store are 43
Document 1 (Score: 0.22598577892694305):
3.

Document 2 (Score: 0.22598577892694305):
3.

Document 3 (Score: 0.22797453928600253):
Some of our plans may take years to deliver results, particularly where they involve building new large-scale infrastructure with long lead times. So as our business continues to evolve, we expect our emissions to rise before dropping towards our absolute emissions reduction target.To benefit the people and places where we operate, we have set goals to replenish 120% of the freshwater volume we consume, on average, across our offices and data centers by 2030 and to help restore and improve the quality of water and health of ecosystems in the communities where we operate. We also aim to maximize the reuse of finite resources across our operations, products, and supply chains. Our circularity principles focus on designing out waste from the