In [1]:
# Import necessary libraries
import spacy
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Load Spacy model for NER with fallback
try:
    nlp = spacy.load("en_core_web_trf")
except (OSError, ValueError):
    print("Transformer model not found; loading smaller model.")
    nlp = spacy.load("en_core_web_sm")

Transformer model not found; loading smaller model.


In [3]:
# Expanded dataset covering a range of intents
data = [
    # Metric Inquiry
    {"query": "What is the current EPS of Reliance Industries?", "intent": ["Metric Inquiry"]},
    {"query": "Show the P/B ratio of Infosys.", "intent": ["Metric Inquiry"]},
    {"query": "What was the dividend yield for Amazon last year?", "intent": ["Metric Inquiry"]},
    {"query": "Can you provide the debt-to-equity ratio of Adani Enterprises?", "intent": ["Metric Inquiry"]},
    {"query": "What is the latest ROE of Apple?", "intent": ["Metric Inquiry"]},
    {"query": "What is Tesla's revenue growth rate this fiscal year?", "intent": ["Metric Inquiry"]},
    {"query": "Display the quick ratio for Alphabet.", "intent": ["Metric Inquiry"]},
    {"query": "What is the P/E ratio of Netflix?", "intent": ["Metric Inquiry"]},

    # Historical Performance
    {"query": "How did Google's stock price trend over the past 5 years?", "intent": ["Historical Performance"]},
    {"query": "Show Tesla's performance since its IPO.", "intent": ["Historical Performance"]},
    {"query": "Provide Tata Motors' annual growth from 2018 to 2022.", "intent": ["Historical Performance"]},
    {"query": "How much revenue did Amazon generate in Q3 2022?", "intent": ["Historical Performance"]},
    {"query": "What was Meta's net profit in 2019?", "intent": ["Historical Performance"]},
    {"query": "Show Microsoft's quarterly revenue trends for the past 2 years.", "intent": ["Historical Performance"]},
    {"query": "Display Apple's year-over-year revenue growth.", "intent": ["Historical Performance"]},
    {"query": "How did Berkshire Hathaway perform in Q1 of 2021?", "intent": ["Historical Performance"]},

    # Comparative Analysis
    {"query": "Compare HDFC Bank and ICICI Bank's ROE for 2021.", "intent": ["Comparative Analysis"]},
    {"query": "Who performed better in 2020: Apple or Microsoft?", "intent": ["Comparative Analysis"]},
    {"query": "Analyze profitability between Tesla and Ford.", "intent": ["Comparative Analysis"]},
    {"query": "Compare Tesla and GM's market capitalization for the past 3 years.", "intent": ["Comparative Analysis"]},
    {"query": "Compare Alphabet and Meta on their P/E ratios.", "intent": ["Comparative Analysis"]},
    {"query": "Compare Amazon and Walmart's Q4 earnings for the past three years.", "intent": ["Comparative Analysis"]},
    {"query": "Compare revenue growth between Facebook and Twitter in 2022.", "intent": ["Comparative Analysis"]},
    {"query": "Which has higher dividends: Coca-Cola or PepsiCo?", "intent": ["Comparative Analysis"]},

    # Fundamental Data Request
    {"query": "Show the latest income statement for Apple.", "intent": ["Fundamental Data Request"]},
    {"query": "Provide Amazon's balance sheet.", "intent": ["Fundamental Data Request"]},
    {"query": "Fetch Reliance's quarterly financials.", "intent": ["Fundamental Data Request"]},
    {"query": "Retrieve the cash flow statement for Meta Platforms.", "intent": ["Fundamental Data Request"]},
    {"query": "Display Walmart's most recent financial ratios.", "intent": ["Fundamental Data Request"]},
    {"query": "Get Tata Motors' annual balance sheet.", "intent": ["Fundamental Data Request"]},
    {"query": "Provide Tesla's last quarterly income statement.", "intent": ["Fundamental Data Request"]},
    {"query": "Show consolidated financials for Procter & Gamble.", "intent": ["Fundamental Data Request"]},

    # News Impact
    {"query": "What recent events impacted Facebook's stock?", "intent": ["News Impact"]},
    {"query": "Has there been significant news for Tata Steel recently?", "intent": ["News Impact"]},
    {"query": "List events affecting Netflix's Q2 performance.", "intent": ["News Impact"]},
    {"query": "Did any news impact Berkshire Hathaway's stock price?", "intent": ["News Impact"]},
    {"query": "What market events impacted Apple’s stock last year?", "intent": ["News Impact"]},
    {"query": "What recent announcements affected Microsoft's financials?", "intent": ["News Impact"]},
    {"query": "What announcements have impacted Tesla's share price?", "intent": ["News Impact"]},
    {"query": "What corporate events influenced Amazon's stock price this quarter?", "intent": ["News Impact"]},

    # Forecasting & Predictions
    {"query": "What are Alphabet's growth forecasts for the next quarter?", "intent": ["Forecasting & Predictions"]},
    {"query": "What is Microsoft's predicted EPS for 2023?", "intent": ["Forecasting & Predictions"]},
    {"query": "What is Apple's expected revenue for the next fiscal year?", "intent": ["Forecasting & Predictions"]},
    {"query": "What are the projected earnings for Netflix during the holiday season?", "intent": ["Forecasting & Predictions"]},
    {"query": "What is Amazon’s revenue growth forecast for the next year?", "intent": ["Forecasting & Predictions"]},
    {"query": "What is Tesla’s outlook for profitability next quarter?", "intent": ["Forecasting & Predictions"]},
    {"query": "What is Meta's projected EBITDA for Q1 2024?", "intent": ["Forecasting & Predictions"]},
    {"query": "What is Google's predicted market cap by year-end?", "intent": ["Forecasting & Predictions"]},

    # Combined Intent Examples
    {"query": "Show Tesla's revenue and net income growth over the last 3 quarters and compare it with Ford.", "intent": ["Historical Performance", "Comparative Analysis"]},
    {"query": "How did Apple's revenue growth and EPS trend in 2020?", "intent": ["Historical Performance", "Metric Inquiry"]},
    {"query": "Compare Microsoft and Google’s quarterly income for 2021 and predict Q1 2022.", "intent": ["Comparative Analysis", "Forecasting & Predictions"]},
    {"query": "Show Facebook’s revenue, compare it with Twitter, and predict trends for the next year.", "intent": ["Fundamental Data Request", "Comparative Analysis", "Forecasting & Predictions"]},
]


In [4]:
# Prepare dataset for multi-label classification
df = pd.DataFrame(data)
df = pd.concat([df, pd.get_dummies(df['intent'].apply(pd.Series).stack()).groupby(level=0).sum()], axis=1)
X = df["query"]
y = df.iloc[:, 2:]  # Multi-label target columns

In [5]:
# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Step 2: Convert text to BERT embeddings for intent classification
def get_bert_embeddings(text_list):
    """Convert text queries to BERT embeddings."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    embeddings = []
    for text in text_list:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze().numpy())  # CLS token embedding
    return embeddings

In [7]:
# Generate BERT embeddings for training and validation sets
X_train_embedded = get_bert_embeddings(X_train)
X_val_embedded = get_bert_embeddings(X_val)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [8]:
# Step 3: Train multi-label classifier using BERT embeddings
classifier = MultiOutputClassifier(LogisticRegression())
classifier.fit(X_train_embedded, y_train)

In [14]:
# Step 4: Enhanced Financial Entity Extraction with Custom Rules
def extract_entities(query):
    """Extract named entities with regex rules for financial terms."""
    doc = nlp(query)
    entities = {"companies": [], "metrics": [], "time_periods": [], "events": []}

    # Extract entities with SpaCy
    for ent in doc.ents:
        if ent.label_ == "ORG":
            entities["companies"].append(ent.text)
        elif ent.label_ in ["DATE", "TIME"]:
            entities["time_periods"].append(ent.text)
        elif ent.label_ in ["MONEY", "PERCENT"]:
            entities["metrics"].append(ent.text)

    # Add regex-based extraction for specific financial metrics
    custom_metrics = [
        r"\bP/E ratio\b", r"\bEPS\b", r"\brevenue growth\b", r"\bnet income growth\b", r"\bdividend yield\b"
    ]
    for metric in custom_metrics:
        if re.search(metric, query, re.IGNORECASE):  # Uses the 'query' parameter passed to the function
            entities["metrics"].append(metric.strip(r"\b"))

    return entities


In [15]:
query = "Compare Apple and Microsoft on EPS growth and P/E ratio over the last 5 years."
extracted_entities = extract_entities(query)


In [16]:
# Step 5: Query-to-Action Mapping and Data Retrieval Placeholder
def map_query_to_action(intents, entities):
    """Map intents and entities to actions with data retrieval simulation."""
    actions = []
    for intent in intents:
        if intent == "Metric Inquiry" and entities['metrics']:
            actions.append(f"Fetch metric data for {', '.join(entities['companies'])} with metrics: {', '.join(entities['metrics'])}")
        elif intent == "Historical Performance" and entities['time_periods']:
            actions.append(f"Retrieve historical data for {', '.join(entities['companies'])} over {', '.join(entities['time_periods'])}")
        elif intent == "Comparative Analysis" and len(entities['companies']) > 1:
            actions.append(f"Compare metrics between {entities['companies'][0]} and {entities['companies'][1]}")
        elif intent == "Fundamental Data Request":
            actions.append(f"Retrieve financial statement for {', '.join(entities['companies'])}")
        elif intent == "News Impact":
            actions.append(f"Retrieve recent news for {', '.join(entities['companies'])}")
        elif intent == "Forecasting & Predictions" and entities['metrics']:
            actions.append(f"Retrieve growth forecast for {', '.join(entities['companies'])} on metrics: {', '.join(entities['metrics'])}")
    return actions

In [17]:
# Placeholder function for connecting to financial data APIs
def retrieve_financial_data(action):
    """Simulate data retrieval from an API or database."""
    print(f"Executing action: {action}")
    # Actual API calls go here

In [18]:

# Step 6: Query Decomposition for Complex Queries
def decompose_query(query):
    """Decompose complex queries into subqueries if multiple actions are needed."""
    # Placeholder decomposition logic (could be expanded with more complex logic)
    if "compare" in query.lower() and "and" in query.lower():
        subqueries = query.split(" and ")
    else:
        subqueries = [query]
    return subqueries

In [19]:
# Step 7: Test with New Example Query
query = "Compare Apple and Microsoft on EPS growth and P/E ratio over the last 5 years."
subqueries = decompose_query(query)

In [20]:
print("\nProcessing Subqueries:")
for subquery in subqueries:
    predicted_intents = classifier.predict([get_bert_embeddings([subquery])[0]])[0]
    predicted_intents = y.columns[predicted_intents == 1].tolist()
    print("\nPredicted Intents for Subquery:", subquery)
    print("Intents:", predicted_intents)

    extracted_entities = extract_entities(subquery)
    print("Extracted Entities:", extracted_entities)

    mapped_actions = map_query_to_action(predicted_intents, extracted_entities)
    print("\nMapped Actions for Subquery:")
    for action in mapped_actions:
        retrieve_financial_data(action)


Processing Subqueries:





Predicted Intents for Subquery: Compare Apple
Intents: []
Extracted Entities: {'companies': [], 'metrics': [], 'time_periods': [], 'events': []}

Mapped Actions for Subquery:





Predicted Intents for Subquery: Microsoft on EPS growth
Intents: []
Extracted Entities: {'companies': ['Microsoft', 'EPS'], 'metrics': ['EPS'], 'time_periods': [], 'events': []}

Mapped Actions for Subquery:





Predicted Intents for Subquery: P/E ratio over the last 5 years.
Intents: []
Extracted Entities: {'companies': [], 'metrics': ['P/E ratio'], 'time_periods': ['the last 5 years'], 'events': []}

Mapped Actions for Subquery:


In [21]:
# Step 8: Evaluation of Intent Classification
def evaluate_intent_classifier(X_val_embedded, y_val, classifier):
    """Evaluate the intent classifier with BERT embeddings."""
    y_pred = classifier.predict(X_val_embedded)
    print("\nIntent Classification Report:")
    print(classification_report(y_val, y_pred, target_names=y.columns))

In [22]:
# Evaluate the model on the validation set
evaluate_intent_classifier(X_val_embedded, y_val, classifier)


Intent Classification Report:
                           precision    recall  f1-score   support

     Comparative Analysis       1.00      0.67      0.80         3
Forecasting & Predictions       0.67      0.50      0.57         4
 Fundamental Data Request       0.00      0.00      0.00         0
   Historical Performance       0.00      0.00      0.00         2
           Metric Inquiry       0.00      0.00      0.00         2
              News Impact       1.00      1.00      1.00         1

                micro avg       0.83      0.42      0.56        12
                macro avg       0.44      0.36      0.40        12
             weighted avg       0.56      0.42      0.47        12
              samples avg       0.45      0.41      0.42        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
