# **Financial** QA using TF-IDF + Cosine Similarity

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ----------------------------
# 1. Dummy Financial Data
# ----------------------------
# Compact table of filings: (company, document, paragraph texts in order).
# Paragraph ids are not stored here; they are assigned from the position of
# each text (starting at 1) when the records are expanded below.
_FILINGS = (
    (
        "Apple",
        "10-Q Q1 2024",
        (
            "Revenue for Q1 2024 increased to $45B due to strong iPhone sales.",
            "Operating income was $15B.",
            "Net income increased to $12B.",
            "Cash flow from operations was $18B.",
            "Long-term debt is $10B.",
        ),
    ),
    (
        "Apple",
        "10-Q Q2 2025",
        (
            "Revenue for Q2 2025 increased to $50B due to strong iPhone sales.",
            "Operating income was $18B, showing a 10% increase from last quarter.",
            "Net income increased to $14B.",
            "Cash flow from operations was $20B.",
            "Long-term debt is $10B.",
        ),
    ),
    (
        "Microsoft",
        "10-Q Q1 2024",
        (
            "Revenue for Q1 2024 was $40B, mainly from cloud services.",
            "Operating income increased to $12B.",
            "Net income was $10B.",
            "Cash flow from operations was $15B.",
            "Long-term debt is $8B.",
        ),
    ),
    (
        "Microsoft",
        "10-Q Q2 2025",
        (
            "Revenue for Q2 2025 was $45B, primarily from cloud services.",
            "Operating income increased to $15B.",
            "Net income was $12B.",
            "Cash flow from operations was $18B.",
            "Long-term debt is $8B.",
        ),
    ),
    (
        "Amazon",
        "10-Q Q1 2024",
        (
            "Revenue for Q1 2024 reached $75B, driven by e-commerce and AWS.",
            "Operating income was $9B.",
            "Net income increased to $7B.",
            "Cash flow from operations was $12B.",
            "Long-term debt is $20B.",
        ),
    ),
    (
        "Amazon",
        "10-Q Q2 2025",
        (
            "Revenue for Q2 2025 was $80B, mostly from e-commerce and AWS.",
            "Operating income was $10B.",
            "Net income was $7B.",
            "Cash flow from operations was $15B.",
            "Long-term debt is $25B.",
        ),
    ),
    (
        "Google",
        "10-Q Q2 2025",
        (
            "Revenue for Q2 2025 increased to $65B, driven by ads and cloud services.",
            "Operating income was $20B.",
            "Net income reported as $16B.",
            "Cash flow from operations was $25B.",
            "Long-term debt is $5B.",
        ),
    ),
)

# Expand the table into the nested record layout used by the rest of the
# notebook: one dict per filing, each holding a list of {"id", "text"} dicts.
data = [
    {
        "company": company_name,
        "document": document_name,
        "paragraphs": [
            {"id": position, "text": sentence}
            for position, sentence in enumerate(sentences, start=1)
        ],
    }
    for company_name, document_name, sentences in _FILINGS
]








In [61]:
# ----------------------------
# Preparing paragraphs & metadata
# ----------------------------
# Flatten the nested filings into (filing, paragraph) pairs, keeping the
# original order: filings in `data` order, paragraphs in their listed order.
_flat_rows = [
    (filing, entry)
    for filing in data
    for entry in filing["paragraphs"]
]

# paragraphs[i] is the raw text; metadata[i] says where that text came from.
# The two lists are index-aligned.
paragraphs = [entry["text"] for _, entry in _flat_rows]
metadata = [
    {
        "company": filing["company"],
        "document": filing["document"],
        "paragraph_id": entry["id"],
    }
    for filing, entry in _flat_rows
]

In [62]:

# ----------------------------
# TF-IDF Vectorization
# ----------------------------
# Step 1: learn the vocabulary and IDF weights from every paragraph.
vectorizer = TfidfVectorizer()
vectorizer.fit(paragraphs)

# Step 2: encode the same paragraphs; row i corresponds to paragraphs[i].
tfidf_matrix = vectorizer.transform(paragraphs)

In [63]:
# ----------------------------
# Search function (single best answer)
# ----------------------------
def search(query, company=None):
    """Return the stored paragraph whose TF-IDF vector is closest to ``query``.

    The query is encoded with the module-level ``vectorizer`` and compared,
    by cosine similarity, against the rows of ``tfidf_matrix`` that belong to
    the requested company.

    Args:
        query: Free-text question.
        company: Optional company name; matched case-insensitively against
            ``metadata[i]["company"]``. ``None`` searches every paragraph.

    Returns:
        A dict with keys ``company``, ``document``, ``paragraph_id``, ``text``
        and ``similarity`` describing the best match, or a plain message
        string when no paragraph can be returned (unknown company, or a query
        that shares no term with any candidate paragraph).
    """
    query_vec = vectorizer.transform([query])

    # Normalise the filter once instead of lower-casing it for every row.
    wanted = None if company is None else company.lower()
    filtered_indices = [
        i for i, m in enumerate(metadata)
        if wanted is None or m["company"].lower() == wanted
    ]

    if not filtered_indices:
        return "No data available for this company."

    filtered_tfidf = tfidf_matrix[filtered_indices]
    similarities = cosine_similarity(query_vec, filtered_tfidf)[0]

    # Position within the filtered subset, then mapped back to the global index.
    best_idx_local = int(similarities.argmax())
    best_score = float(similarities[best_idx_local])

    # A best score of 0 means the query has no term in common with any
    # candidate (e.g. an empty query). argmax would then just pick the first
    # candidate, which is not an answer, so report that nothing matched.
    if best_score <= 0.0:
        return "No relevant paragraph found for this query."

    best_idx = filtered_indices[best_idx_local]
    source = metadata[best_idx]

    return {
        "company": source["company"],
        "document": source["document"],
        "paragraph_id": source["paragraph_id"],
        "text": paragraphs[best_idx],
        "similarity": best_score,
    }


In [64]:
# ----------------------------
# Example Queries
# ----------------------------
# Each entry is (question text, company used to filter the candidates).
queries = [
    ("What was Apple’s revenue in Q2 2025?", "Apple"),
    ("Tell me Microsoft’s net income.", "Microsoft"),
    ("Cash flow of Apple?", "Apple"),
    ("Long-term debt of Microsoft?", "Microsoft")
]

for q, c in queries:
    print(f"\nQuestion: {q}")
    ans = search(q, company=c)
    # search() returns a plain message string instead of a dict when it has
    # nothing to return; indexing that string by key would raise TypeError.
    if isinstance(ans, str):
        print(f"Answer: {ans}")
        continue
    print(f"Company: {ans['company']}, Document: {ans['document']}, Paragraph ID: {ans['paragraph_id']}")
    print(f"Answer: {ans['text']}, Similarity: {ans['similarity']:.2f}")


Question: What was Apple’s revenue in Q2 2025?
Company: Apple, Document: 10-Q Q2 2025, Paragraph ID: 1
Answer: Revenue for Q2 2025 increased to $50B due to strong iPhone sales., Similarity: 0.42

Question: Tell me Microsoft’s net income.
Company: Microsoft, Document: 10-Q Q1 2024, Paragraph ID: 3
Answer: Net income was $10B., Similarity: 0.68

Question: Cash flow of Apple?
Company: Apple, Document: 10-Q Q1 2024, Paragraph ID: 4
Answer: Cash flow from operations was $18B., Similarity: 0.59

Question: Long-term debt of Microsoft?
Company: Microsoft, Document: 10-Q Q1 2024, Paragraph ID: 5
Answer: Long-term debt is $8B., Similarity: 0.71


# Model Summary

Input: User query (e.g., “What was Apple’s revenue in Q2 2025?”) and optionally a company name.

Data: Structured financial paragraphs from multiple companies and years. Each paragraph is a metric statement (Revenue, Net Income, Cash Flow, Debt, etc.).

Processing Steps:

Filter by company: Only consider paragraphs belonging to the requested company.

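Vectorize text: Use TF-IDF to convert all paragraphs and the query into numeric vectors. The vectorizer is fitted once on the paragraphs of every company, so the term weights reflect the whole corpus; the company filter only restricts which paragraph vectors the query is compared against.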

Compute similarity: Use cosine similarity to compare the query with all filtered paragraphs.

Select answer: Return the paragraph with the highest similarity score as the single answer. The score is a relative ranking among the candidate paragraphs, not a calibrated confidence.

Output: Most relevant paragraph with similarity score.

# ADVANTAGES
1. Simple and lightweight: No heavy embeddings or deep learning needed.

2. Easy implementation: Can be built with Python and scikit-learn.

3. Fast for small datasets: Works well for hundreds to a few thousand paragraphs.

4. Company-aware: When a company is supplied, filtering prevents mixing answers across companies.

5. Interpretable: You can see why a paragraph was chosen (highest similarity).

6. Deterministic: Same query always returns the same answer.

# DRAWBACKS
- Poor semantic understanding: Only matches exact words, not meaning.

- Scales poorly: Slow for very large datasets.

- No context across paragraphs: Cannot reason over multiple statements.

- Sensitive to wording: Small changes in query can affect results.

- Static answers: New companies or unseen metrics cannot be answered until their paragraphs are added to the data and the TF-IDF matrix is refitted.