In [None]:
# !pip install sec-parser sec-downloader langchain-text-splitters langchain-community langchain-core langchain-chroma
# !pip install langchain-huggingface chromadb sentence-transformers transformers


In [None]:
# Financial Document Retrieval and Analysis using SEC-Parser with LLM-Based Multi-Query Approach
# First, install the required packages
# !pip install sec-parser sec-downloader langchain-text-splitters langchain-community langchain-core langchain-chroma
# !pip install langchain-huggingface chromadb sentence-transformers transformers

# Importing all the necessary libraries
import sec_parser as sp
from sec_downloader import Downloader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_core.runnables import RunnablePassthrough
from transformers import pipeline
import pandas as pd
import numpy as np
import logging
import json
import hashlib

# Configure logging with the default settings and set the "langchain" logger's
# level to INFO so INFO-level records from that logger are not filtered out.
# NOTE(review): the query variations produced later in this script are shown
# with print(), not through logging — confirm which LangChain messages this
# setup is meant to surface.
logging.basicConfig()
logging.getLogger("langchain").setLevel(logging.INFO)

# Show only the head of a long text (for display purposes)
def print_first_n_lines(text, n=10):
    """Print the first ``n`` lines of ``text`` followed by a "..." line.

    The "..." marker line is always printed, whether or not ``text``
    actually contains more than ``n`` lines.
    """
    head = text.split("\n")[:n]
    print("\n".join(head))
    print("...")

# Initialize the SEC downloader with a company name and contact email
# (required by SEC EDGAR). The values below are placeholders — replace them
# with your actual information.
dl = Downloader("MyCompanyName", "email@example.com")

# Download the latest 10-Q filing for Apple (quarterly financial report) as HTML
print("Downloading the latest Apple 10-Q filing...")
apple_html = dl.get_filing_html(ticker="AAPL", form="10-Q")

# Parse the filing HTML into semantic elements
print("Parsing the SEC filing into semantic elements...")
elements = sp.Edgar10QParser().parse(apple_html)

# Build a semantic tree from the parsed elements for better navigation
tree = sp.TreeBuilder().build(elements)

# Render the tree as text and display only its first 7 lines as a sample
print("\nSample of the parsed document structure:")
demo_output = sp.render(tree)
print_first_n_lines(demo_output, n=7)

# Collect the non-empty text of every TextElement for vector search
all_texts = []
for element in elements:
    # Only elements whose class is named 'TextElement' are indexed
    if element.__class__.__name__ != 'TextElement':
        continue
    # Skip elements that have no text attribute or only whitespace text
    if not hasattr(element, 'text') or not element.text.strip():
        continue
    record = {
        'content': element.text,
        'type': 'TextElement',
        # section_id is optional on the element; None when it is absent
        'section_id': getattr(element, 'section_id', None),
    }
    all_texts.append(record)

print(f"\nExtracted {len(all_texts)} text elements from the document")

# Wrap each extracted text record in a Document for the vector database
from langchain_core.documents import Document
documents = []
for item in all_texts:
    # Metadata values must not be None, so a missing section id becomes ""
    section_id = "" if item['section_id'] is None else item['section_id']
    metadata = {'type': item['type'], 'section_id': section_id}
    documents.append(Document(page_content=item['content'], metadata=metadata))

# Split text into smaller chunks for better retrieval: split on newline
# characters (treated as a literal string, not a regex), with a chunk_size of
# 1000 and a chunk_overlap of 200, both measured with len().
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# Split the documents into chunks and report how many chunks resulted
split_docs = text_splitter.split_documents(documents)
print(f"Created {len(split_docs)} document chunks after splitting")

# Initialize the embedding model used to vectorize the chunks
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Normalize a metadata dictionary so every value is a simple scalar
def filter_metadata_values(metadata_dict):
    """Return a new dict with the same keys whose values are all simple types.

    Values that are ``str``, ``int``, ``float`` or ``bool`` are kept
    unchanged; any other value (including ``None``) is replaced by an empty
    string. Keys are never dropped and the input dict is not modified.
    """
    simple_types = (str, int, float, bool)
    return {
        key: value if isinstance(value, simple_types) else ""
        for key, value in metadata_dict.items()
    }

# Make sure all metadata is valid: filter_metadata_values replaces every value
# that is not a str/int/float/bool (e.g. None) with an empty string
for doc in split_docs:
    doc.metadata = filter_metadata_values(doc.metadata)

# Create a Chroma vector database from the document chunks, embedding them
# with embedding_function
vectordb = Chroma.from_documents(
    documents=split_docs,
    embedding=embedding_function
)

print("\nVector database created successfully!")

# Single-query similarity search over the vector database
def financial_query(query, k=3):
    """Run one similarity search against ``vectordb`` and print the hits.

    Args:
        query: Question text to search for.
        k: Number of results requested from the vector store.

    Returns:
        The list of documents returned by the search, or an empty list if
        the search or the printing raised an exception (the error message
        is printed instead of being propagated).
    """
    try:
        hits = vectordb.similarity_search(query, k=k)

        print(f"\nQuery: {query}")
        print(f"Found {len(hits)} relevant sections:\n")

        divider = "-" * 50
        for position, hit in enumerate(hits, start=1):
            print(f"Result {position} (TextElement):")
            print(divider)
            # Show at most the first 300 characters of each hit
            preview = hit.page_content
            if len(preview) > 300:
                preview = preview[:300] + "..."
            print(preview)
            print(divider)
            print()

        return hits
    except Exception as e:
        print(f"Error during search: {str(e)}")
        return []

# Content fingerprint used to compare documents
def get_doc_hash(doc):
    """Return the MD5 hex digest of ``doc.page_content``.

    The text is encoded with ``str.encode()``'s default encoding before
    hashing, so documents with identical text yield identical digests.
    """
    hasher = hashlib.md5()
    hasher.update(doc.page_content.encode())
    return hasher.hexdigest()

# Merge several result lists, keeping one document per distinct content
def get_unique_docs(docs_lists):
    """Flatten ``docs_lists`` and drop documents whose content was already seen.

    Documents are compared by ``get_doc_hash``. For each distinct hash the
    first document encountered is kept, and the returned list preserves the
    order in which those first occurrences appeared.
    """
    seen = {}
    for batch in docs_lists:
        for candidate in batch:
            # setdefault only stores the document if its hash is new
            seen.setdefault(get_doc_hash(candidate), candidate)
    return list(seen.values())

# Set up the local language model for generating query variations
# Initialize the generation pipeline used for query generation
print("Initializing language model for query generation...")
try:
    # FLAN-T5 is an encoder-decoder (seq2seq) model, so it has to be loaded
    # with the "text2text-generation" task. The "text-generation" task is for
    # causal language models and reports T5ForConditionalGeneration as
    # unsupported.
    text_generator = pipeline(
        "text2text-generation",
        model="google/flan-t5-small",  # Using a small model for speed
        max_length=200
    )
    llm = HuggingFacePipeline(pipeline=text_generator)
    print("Language model initialized successfully.")
except Exception as e:
    # Deliberately best-effort: any failure while loading or wrapping the
    # model disables the LLM (llm = None) instead of aborting the script.
    print(f"Error initializing LLM: {str(e)}")
    print("Falling back to rule-based query generation.")
    llm = None

# Prompt template for generating query variations. Its only placeholder is
# {question}; the model is asked to answer with one query per line and no
# numbering or prefixes.
query_generation_template = """You are an AI assistant helping with financial document retrieval.
Your task is to generate five different versions of the given financial question to retrieve
relevant documents from Apple's SEC filings. By generating multiple perspectives on the user
question, your goal is to help retrieve more comprehensive financial information.

Generate five different ways to ask about: {question}

Format your response with one query per line, no numbering or prefixes."""

# Build a PromptTemplate from the template string above
query_prompt = PromptTemplate.from_template(query_generation_template)

# Function to generate query variations using language model
def generate_queries_with_llm(question):
    """Generate up to five distinct search queries for ``question``.

    When the module-level ``llm`` is ``None``, or when invoking it or parsing
    its output raises, the rule-based
    ``fallback_generate_query_variations(question)`` result is returned
    unchanged.

    Otherwise the LLM output is split into non-empty, stripped lines. The
    original question is always placed first, exact duplicates are removed
    (first occurrence wins), and the list is capped at five entries. If the
    LLM contributed no usable line beyond the original question, up to three
    rule-based variations that are not already present are appended.

    Args:
        question: The user's original question.

    Returns:
        A list of query strings.
    """
    if llm is None:
        # Fallback to rule-based generation if LLM initialization failed
        return fallback_generate_query_variations(question)

    try:
        # Generate variations using LLM
        query_variations_text = llm.invoke(query_prompt.format(question=question))

        # One candidate per non-empty output line
        llm_lines = [
            line.strip()
            for line in query_variations_text.split("\n")
            if line.strip()
        ]

        # Put the original question first and drop exact duplicates while
        # keeping order, so the original can never be cut off by the cap below
        # and no query is searched twice.
        variations = list(dict.fromkeys([question, *llm_lines]))

        # Ensure we have at least the original query plus some variations
        if len(variations) < 2:
            # The fallback list starts with the original question; skip
            # anything already present so it is not added a second time.
            extras = [
                candidate
                for candidate in fallback_generate_query_variations(question)
                if candidate not in variations
            ]
            variations.extend(extras[:3])

        return variations[:5]  # Limit to 5 variations
    except Exception as e:
        print(f"Error generating query variations: {str(e)}")
        # Fallback to rule-based generation
        return fallback_generate_query_variations(question)

# Fallback function for rule-based query generation
def fallback_generate_query_variations(question):
    """Build query variations from fixed rules, without using the LLM.

    The lower-cased question is checked against keyword groups in a fixed
    order (revenue, then expenses, then cash, then risk). The first group
    with any keyword contained in the question supplies four canned
    variations. If no group matches, four generic variations that embed the
    question text are used instead.

    Returns:
        A new list whose first item is the original ``question`` followed by
        the four variations.
    """
    # (keywords, canned variations) pairs; order matters because the first
    # matching rule wins.
    topic_rules = (
        (
            ("revenue", "sales", "growth"),
            [
                "What is Apple's sales performance in the most recent quarter?",
                "How has Apple's income increased or decreased?",
                "Apple's quarterly revenue trends and financial performance",
                "Year-over-year growth in Apple's product revenues",
            ],
        ),
        (
            ("expenses", "r&d", "cost"),
            [
                "What is Apple investing in research and development?",
                "Apple's operating costs and expense breakdown",
                "Research and development expenditure at Apple",
                "Apple's cost structure and major expenses",
            ],
        ),
        (
            ("cash", "liquidity", "balance"),
            [
                "Apple's liquid assets and cash position",
                "How much money does Apple have in its treasury?",
                "Apple's balance sheet cash and short-term investments",
                "Apple's financial liquidity and cash equivalents",
            ],
        ),
        (
            ("risk", "factor", "challenge"),
            [
                "What challenges and threats is Apple facing?",
                "Apple's business uncertainties and risk disclosures",
                "Potential threats to Apple's financial performance",
                "Risk factors mentioned in Apple's SEC filings",
            ],
        ),
    )

    lowered = question.lower()
    for keywords, canned in topic_rules:
        if any(keyword in lowered for keyword in keywords):
            return [question, *canned]

    # Generic variations for other financial queries
    return [
        question,
        f"Information about {question} in Apple's financial reports",
        f"Apple's 10-Q filing details regarding {question}",
        f"Apple's financial statements related to {question}",
        f"Financial analysis of {question} for Apple Inc.",
    ]

# LLM-based multi-query retrieval function
def llm_multi_query_financial_search(original_query, k=3):
    """Search the vector store with several phrasings of one question.

    Query variations come from ``generate_queries_with_llm``. Each variation
    is run through ``vectordb.similarity_search`` one after another; a
    variation whose search raises is reported and skipped. The per-variation
    results are merged with ``get_unique_docs`` and printed.

    Args:
        original_query: The user's question.
        k: Number of results requested per variation.

    Returns:
        The merged, de-duplicated list of documents.
    """
    variants = generate_queries_with_llm(original_query)

    print(f"\nLLM Multi-Query Approach: '{original_query}'")
    print(f"Generated {len(variants)} query variations:")
    for number, variant in enumerate(variants, start=1):
        print(f"  {number}. {variant}")

    # One result list per variation that was searched successfully
    per_query_hits = []
    for variant in variants:
        try:
            per_query_hits.append(vectordb.similarity_search(variant, k=k))
        except Exception as e:
            print(f"Error during search for variation '{variant}': {str(e)}")

    # Combine results and remove duplicates
    merged = get_unique_docs(per_query_hits)

    print(f"\nFound {len(merged)} unique relevant sections across all queries:\n")

    divider = "-" * 50
    for position, hit in enumerate(merged, start=1):
        print(f"Result {position} (TextElement):")
        print(divider)
        # Show at most the first 300 characters of each hit
        preview = hit.page_content
        if len(preview) > 300:
            preview = preview[:300] + "..."
        print(preview)
        print(divider)
        print()

    return merged

# Demo: compare the single-query search with the LLM multi-query search
print("\n=== FINANCIAL DOCUMENT ANALYSIS: STANDARD VS LLM MULTI-QUERY ===\n")

# Sample questions; only the first one is run through the comparison below
financial_queries = [
    "What was Apple's revenue growth in the latest quarter?",
    "What are Apple's R&D expenses?",
    "How much cash and cash equivalents does Apple have?",
    "What are the main risk factors for Apple?"
]

# Banner for the question being compared
query = financial_queries[0]
print("\n" + "="*80, f"COMPARING APPROACHES FOR: '{query}'", "="*80, sep="\n")

print("\nSTANDARD APPROACH:")
standard_results = financial_query(query)

print("\nLLM MULTI-QUERY APPROACH:")
multi_results = llm_multi_query_financial_search(query)

# Content hashes of the documents returned by each approach
standard_content_hashes = set(map(get_doc_hash, standard_results))
multi_content_hashes = set(map(get_doc_hash, multi_results))

# Documents the multi-query search returned that the standard search did not
unique_to_multi = list(
    filter(
        lambda candidate: get_doc_hash(candidate) not in standard_content_hashes,
        multi_results,
    )
)

print(
    "\nCOMPARISON SUMMARY:",
    f"- Standard approach found: {len(standard_results)} documents",
    f"- LLM Multi-query approach found: {len(multi_results)} documents",
    f"- Additional unique documents from multi-query: {len(unique_to_multi)}",
    sep="\n",
)

if unique_to_multi:
    print("\nEXAMPLES OF UNIQUE DOCUMENTS FOUND ONLY BY LLM MULTI-QUERY:")
    # Show only the first 2 unique documents, each cut to 200 characters
    for i, doc in enumerate(unique_to_multi[:2], start=1):
        print(f"Unique Doc {i} (TextElement):")
        print("-" * 50)
        snippet = doc.page_content
        if len(snippet) > 200:
            snippet = snippet[:200] + "..."
        print(snippet)
        print("-" * 50)
        print()

print("\n=== FINANCIAL ANALYSIS COMPLETE ===")

Downloading the latest Apple 10-Q filing...
Parsing the SEC filing into semantic elements...

Sample of the parsed document structure:
[1;34mTopSectionTitle[0m: PART I  —  FINANCIAL INFORMATION
├── [1;34mTopSectionTitle[0m: Item 1.    Financial Statements
│   ├── [1;34mTitleElement[0m: CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited)
│   │   ├── [1;34mSupplementaryText[0m: (In millions, except number of ...ousands, and per-share amounts)
│   │   ├── [1;34mTableElement[0m: Table with ~24 rows, ~40 numbers, and 742 characters.
│   │   └── [1;34mSupplementaryText[0m: See accompanying Notes to Conde...solidated Financial Statements.
│   ├── [1;34mTitleElement[0m: CONDENSED CONSOLIDATED STATEMEN...OMPREHENSIVE INCOME (Unaudited)
...

Extracted 76 text elements from the document
Created 76 document chunks after splitting

Vector database created successfully!
Initializing language model for query generation...


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPT

Language model initialized successfully.

=== FINANCIAL DOCUMENT ANALYSIS: STANDARD VS LLM MULTI-QUERY ===


COMPARING APPROACHES FOR: 'What was Apple's revenue growth in the latest quarter?'

STANDARD APPROACH:

Query: What was Apple's revenue growth in the latest quarter?
Found 3 relevant sections:

Result 1 (TextElement):
--------------------------------------------------
Mac net sales increased during the first quarter of 2025 compared to the same quarter in 2024 due primarily to higher net sales of laptops.
--------------------------------------------------

Result 2 (TextElement):
--------------------------------------------------
The Company has historically experienced higher net sales in its first quarter compared to other quarters in its fiscal year due in part to seasonal holiday demand. Additionally, new product and service introductions can significantly impact net sales, cost of sales and operating expenses. The timin...
--------------------------------------------------
