# Sentiment Analysis Pipeline: FinBERT-FOMC

**Objective**: Apply `ZiweiChen/FinBERT-FOMC` to the Fed Speeches dataset using a standardized pipeline.
**Methodology**:
1. **Load Data**: Sentence-level corpus (`data/master/fed_master_corpus.csv`, falling back to `data/processed/fed_speeches_sentences.csv` if the master file is missing).
2. **Inference**: Classify each sentence as Hawkish, Dovish, or Neutral.
3. **Index Calculation**: Compute Net Sentiment Index using the "Hard Count" formula: $Index = \frac{Hawkish - Dovish}{Total}$.
4. **Aggregation**: Aggregate the sentence-level results by calendar month.

In [7]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm.auto import tqdm
import os
import sys

# Put the parent of the current working directory on sys.path so that the
# project-local `utils` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Project helpers used further down: the monthly index aggregator and the
# label mapper. An import failure is reported here but not re-raised, so the
# cells that use these names will fail later if the import did not succeed.
try:
    from utils.utilities import calculate_net_sentiment_scores, get_sentiment_label_FinBERT_FOMC
    print("Successfully imported utilities.")
except ImportError as import_err:
    print(f"Import Error: {import_err}. Please ensure 'utils' package is available.")

# Configuration: model identifier plus input/output locations
# (all paths are relative to the notebook's working directory).
MODEL_NAME = "ZiweiChen/FinBERT-FOMC"
INPUT_FILE = "../data/master/fed_master_corpus.csv"
OUTPUT_FILE = "../data/result/FinBERT-FOMC/finbert_inference_results.csv"
INDEX_OUTPUT_FILE = "../data/result/FinBERT-FOMC/monthly_index_FinBERT_FOMC.csv"

# Device selection: 0 (first CUDA device) when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
    device_name = torch.cuda.get_device_name(0)
else:
    device = -1
    device_name = 'CPU'
print(f"Using device: {device} ({device_name}) ")

Successfully imported utilities.
Using device: -1 (CPU) 


In [8]:
# 1. Load Data
# Read the configured corpus; if it is missing, fall back to the processed
# sentence file. The fallback is addressed relative to the notebook (same
# convention as INPUT_FILE / OUTPUT_FILE) instead of a machine-specific
# absolute path, so the notebook also runs on other checkouts.
# NOTE(review): assumes the former absolute fallback pointed into this same
# project tree — confirm.
FALLBACK_INPUT_FILE = "../data/processed/fed_speeches_sentences.csv"

if not os.path.exists(INPUT_FILE):
    print(f"Input file not found: {INPUT_FILE}. Falling back to {FALLBACK_INPUT_FILE}")
    INPUT_FILE = FALLBACK_INPUT_FILE

# Fail early with an actionable message rather than a bare read_csv error.
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(
        f"No input data found at '{INPUT_FILE}' (cwd: {os.getcwd()}). "
        "Run the notebook from its own folder or update INPUT_FILE."
    )

df = pd.read_csv(INPUT_FILE)

# Later cells read df['text'] (inference) and df['date'] (monthly aggregation);
# check for them now, before the expensive inference step.
missing_columns = {'text', 'date'} - set(df.columns)
if missing_columns:
    raise ValueError(f"Input file '{INPUT_FILE}' is missing required columns: {sorted(missing_columns)}")

print(f"Loaded {len(df)} sentences.")
df.head()

Loaded 7471 sentences.


Unnamed: 0,date,text,section,source,speaker,word_count,month_year
0,2018-01-31,The manager of the System Open Market Account ...,Developments in Financial Markets,Minutes,,24,2018-01
1,2018-01-31,Domestic financial market conditions eased con...,Staff Review of Financial Situation,Minutes,,11,2018-01
2,2018-01-31,A strengthening outlook for economic growth in...,Staff Review of Financial Situation,Minutes,,23,2018-01
3,2018-01-31,"U.S. equity prices, Treasury yields, and marke...",Staff Review of Financial Situation,Minutes,,31,2018-01
4,2018-01-31,"In addition, the dollar depreciated broadly am...",Staff Review of Financial Situation,Minutes,,29,2018-01


In [9]:
# 2. Initialize Model Pipeline
# Load the tokenizer and classification model named by MODEL_NAME and wrap
# them in a text-classification pipeline on the selected device. Truncation
# to at most 512 tokens is requested via the pipeline arguments.
print(f"Loading model: {MODEL_NAME}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    pipeline_kwargs = dict(
        model=model,
        tokenizer=tokenizer,
        device=device,
        truncation=True,
        max_length=512,
    )
    nlp = pipeline("text-classification", **pipeline_kwargs)
    print("Model loaded successfully.")
except Exception as load_error:
    print(f"ERROR Loading Model: {load_error}")
    # A None pipeline is the signal the inference cell checks to decide
    # whether to run the model or fill in placeholder results.
    nlp = None

Loading model: ZiweiChen/FinBERT-FOMC...


Device set to use cpu


Model loaded successfully.


In [10]:
# 3. Run Inference
# Classify every sentence in slices of BATCH_SIZE. A slice that raises is
# reported and filled with Neutral / 0.0 placeholders so that `results`
# keeps one entry per input sentence.
BATCH_SIZE = 32
sentences = df['text'].astype(str).tolist()
results = []

if not nlp:
    print("Model not loaded. Filling with Defaults (Neutral) for testing structure.")
    results = [{'label': 'Neutral', 'score': 0.0} for _ in sentences]
else:
    print("Starting Inference...")
    for start in tqdm(range(0, len(sentences), BATCH_SIZE)):
        chunk = sentences[start:start + BATCH_SIZE]
        try:
            chunk_preds = nlp(chunk)
            results.extend(chunk_preds)
        except Exception as batch_error:
            print(f"Error at batch {start}: {batch_error}")
            placeholder = {'label': 'Neutral', 'score': 0.0}
            results.extend([placeholder] * len(chunk))

# Attach results: one label and one score per sentence, in input order.
df['raw_sentiment'] = [prediction['label'] for prediction in results]
df['sentiment_score'] = [prediction['score'] for prediction in results]

Starting Inference...


  0%|          | 0/234 [00:00<?, ?it/s]

In [13]:
# 4. Map Labels
# Translate the model's raw labels into policy-stance labels with the
# imported helper get_sentiment_label_FinBERT_FOMC.

# Distribution of the labels exactly as the model emitted them.
raw_counts = df['raw_sentiment'].value_counts()
print(raw_counts)

df['sentiment'] = df['raw_sentiment'].apply(get_sentiment_label_FinBERT_FOMC)

# Distribution after mapping.
print("Mapped sentiment distribution:")
mapped_counts = df['sentiment'].value_counts()
print(mapped_counts)

# Show each distinct raw -> mapped pair (at most ten) as a sanity check.
print("\nLabel mapping examples:")
mapping_examples = df[['raw_sentiment', 'sentiment']].drop_duplicates().head(10)
for raw_label, mapped_label in zip(mapping_examples['raw_sentiment'], mapping_examples['sentiment']):
    print(f"{raw_label} → {mapped_label}")

raw_sentiment
Neutral     3442
Negative    2268
Positive    1761
Name: count, dtype: int64
Mapped sentiment distribution:
sentiment
Neutral    3442
Hawkish    2268
Dovish     1761
Name: count, dtype: int64

Label mapping examples:
Neutral → Neutral
Positive → Dovish
Negative → Hawkish


In [14]:
# 5. Calculate Sentiment Index
# Group the sentence-level results by calendar month and let the imported
# helper calculate_net_sentiment_scores reduce each month to a single value.
# NOTE(review): the notebook intro describes a hard-count index
# (Hawkish - Dovish) / Total, while the earlier comment on this cell described
# a score-based (Dovish_score - Hawkish_score) / count. The actual formula and
# sign convention live in utils.utilities — confirm which one applies.

df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.to_period('M')

# NOTE(review): the recorded output shows a warning on this line; on recent
# pandas that is typically the deprecation of groupby.apply operating on the
# grouping column. Confirm the helper does not read 'month' before switching
# to include_groups=False.
monthly_index = df.groupby('month').apply(calculate_net_sentiment_scores).reset_index(name='sentiment_score')
# Convert the monthly Period values to timestamps before writing the CSV.
monthly_index['month'] = monthly_index['month'].dt.to_timestamp()

# Save
# Create the output folders first: to_csv does not create missing parent
# directories, and failing here would discard the whole inference run.
for out_path in (OUTPUT_FILE, INDEX_OUTPUT_FILE):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

df.to_csv(OUTPUT_FILE, index=False)
monthly_index.to_csv(INDEX_OUTPUT_FILE, index=False)
print(f"Saved index to {INDEX_OUTPUT_FILE}")

Saved index to ../data/result/FinBERT-FOMC/monthly_index_FinBERT_FOMC.csv


  monthly_index = df.groupby('month').apply(calculate_net_sentiment_scores).reset_index(name='sentiment_score')
