# Sentiment Analysis Pipeline: FinBERT

**Objective**: Apply `ProsusAI/finbert` (the model configured in `MODEL_NAME` below) to the Fed Speeches dataset using a standardized pipeline.
**Methodology**:
1. **Load Data**: Processed sentences (`data/master/fed_master_corpus_focused.csv`).
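2. **Inference**: Classify each sentence with the model (`positive` / `negative` / `neutral`), then map the labels to the Fed context: positive → Hawkish, negative → Dovish, neutral → Neutral.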
3. **Index Calculation**: Compute Net Sentiment Index using the "Score-based" formula: $Index = \frac{Hawkish_{score} - Dovish_{score}}{Total_{count}}$.
4. **Aggregation**: Aggregate by month (derived from each sentence's `date`).

In [3]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
from tqdm.auto import tqdm
import os
import sys

# Make the parent of the current working directory importable, so that the
# project-level `utils` package resolves when the notebook runs from its folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Project helpers: the two index calculators and the label mapper.
# A failed import is reported but does not stop the cell.
try:
    from utils.utilities import (
        calculate_net_sentiment_scores,
        calculate_net_sentiment_counts,
        get_sentiment_label_FinBERT_FOMC,
    )
except ImportError as e:
    print(f"Import Error: {e}. Please ensure 'utils' package is available.")
else:
    print("Successfully imported utilities.")

# ---- Configuration ----
# Model identifier passed to `from_pretrained` in the model-loading cell.
MODEL_NAME = "ProsusAI/finbert"
# Input: the FOCUSED dataset (Data Clean 2).
INPUT_FILE = "../data/master/fed_master_corpus_focused.csv"
# Outputs: per-sentence inference results and the monthly index.
OUTPUT_FILE = "../data/result/FinBERT/finbertori_inference_results.csv"
INDEX_OUTPUT_FILE = "../data/result/FinBERT/monthly_index_FinBERTori_FOMC.csv"

# ---- Device ----
# `device` is later handed to the transformers pipeline: 0 selects the first
# CUDA device, -1 selects the CPU.
if torch.cuda.is_available():
    device = 0
    device_name = torch.cuda.get_device_name(0)
else:
    device = -1
    device_name = 'CPU'
print(f"Using device: {device} ({device_name}) ")

Successfully imported utilities.
Using device: -1 (CPU) 


In [4]:
# 1. Load Data
# Candidate locations in priority order. INPUT_FILE (the focused corpus) is
# preferred. The second entry is a legacy, machine-specific fallback whose file
# name lacks the `_focused` suffix, so it may not be the same corpus.
# The config constant INPUT_FILE is deliberately NOT overwritten, so the
# notebook always reports which file was actually read.
FALLBACK_INPUT_FILE = r"e:\Textming\data\master\fed_master_corpus.csv"
input_path = next(
    (path for path in (INPUT_FILE, FALLBACK_INPUT_FILE) if os.path.exists(path)),
    None,
)
if input_path is None:
    # Fail early with both attempted locations instead of a bare read error
    # on the fallback path.
    raise FileNotFoundError(
        f"Input data not found. Tried: {INPUT_FILE!r} and {FALLBACK_INPUT_FILE!r}."
    )

df = pd.read_csv(input_path)
if input_path == INPUT_FILE:
    print(f"Loaded {len(df)} sentences from FOCUSED dataset.")
else:
    # Do not claim the focused corpus was loaded when the fallback was used.
    print(
        f"WARNING: {INPUT_FILE} not found; "
        f"loaded {len(df)} sentences from fallback file {input_path}."
    )

df.head()

Loaded 8297 sentences from FOCUSED dataset.


Unnamed: 0,date,text,section,source,speaker,word_count,month_year
0,2018-01-31,The manager of the System Open Market Account ...,Developments in Financial Markets,Minutes,,24,2018-01
1,2018-01-31,Domestic financial market conditions eased con...,Staff Review of Financial Situation,Minutes,,11,2018-01
2,2018-01-31,A strengthening outlook for economic growth in...,Staff Review of Financial Situation,Minutes,,23,2018-01
3,2018-01-31,"U.S. equity prices, Treasury yields, and marke...",Staff Review of Financial Situation,Minutes,,31,2018-01
4,2018-01-31,"In addition, the dollar depreciated broadly am...",Staff Review of Financial Situation,Minutes,,29,2018-01


In [5]:
# 2. Initialize Model Pipeline (Official Method)
print(f"Loading model: {MODEL_NAME}...")

# Options forwarded to the text-classification pipeline: run on the device
# chosen in the setup cell and truncate inputs to at most 512 tokens.
pipeline_options = {
    "device": device,
    "truncation": True,
    "max_length": 512,
}

try:
    # Tokenizer and a 3-label sequence classifier are loaded explicitly and
    # then wrapped in a pipeline.
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    nlp = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        **pipeline_options,
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"ERROR Loading Model: {e}")
    # Leave a sentinel so the inference cell can detect the missing model.
    nlp = None

Loading model: ProsusAI/finbert...


Device set to use cpu


Model loaded successfully.


In [6]:
# 3. Run Inference
# Sentences are sent to the pipeline in slices of BATCH_SIZE. A slice whose
# inference raises is reported, replaced by placeholder predictions so the
# results stay aligned with `df`, and counted in a summary warning at the end.
BATCH_SIZE = 32
sentences = df['text'].astype(str).tolist()
results = []
failed_batch_starts = []  # start offsets of slices whose inference raised

# NOTE(review): the placeholder label is capitalised 'Neutral', while the
# recorded model output uses lowercase labels ('neutral', ...). Confirm that
# the label-mapping helper treats both spellings the same way.
if nlp is not None:
    print("Starting Inference...")
    for start in tqdm(range(0, len(sentences), BATCH_SIZE)):
        batch = sentences[start:start + BATCH_SIZE]
        try:
            preds = nlp(batch)
            results.extend(preds)
        except Exception as e:
            print(f"Error at batch {start}: {e}")
            failed_batch_starts.append(start)
            # One independent dict per sentence (no shared object).
            results.extend({'label': 'Neutral', 'score': 0.0} for _ in batch)
    if failed_batch_starts:
        # Make it obvious that part of the output is placeholder data.
        print(
            f"WARNING: {len(failed_batch_starts)} batch(es) failed and were filled "
            f"with Neutral placeholders (score 0.0); start offsets: {failed_batch_starts}"
        )
else:
    print("Model not loaded. Filling with Defaults (Neutral) for testing structure.")
    results = [{'label': 'Neutral', 'score': 0.0} for _ in sentences]

# Guard against misaligned predictions before writing them into the frame.
if len(results) != len(sentences):
    raise ValueError(
        f"Got {len(results)} predictions for {len(sentences)} sentences; "
        "refusing to attach misaligned results."
    )

# Attach results
df['raw_sentiment'] = [pred['label'] for pred in results]
df['sentiment_score'] = [pred['score'] for pred in results]

Starting Inference...


  0%|          | 0/260 [00:00<?, ?it/s]

In [7]:
# 4. Map Labels
# The mapper `get_sentiment_label_FinBERT_FOMC` is imported from utils.utilities.

# Distribution of the labels exactly as the model produced them.
raw_counts = df['raw_sentiment'].value_counts()
print(raw_counts)

# Translate every raw label into its Fed-context label and store it.
fed_labels = df['raw_sentiment'].apply(get_sentiment_label_FinBERT_FOMC)
df['sentiment'] = fed_labels

print("Mapped sentiment distribution:")
mapped_counts = df['sentiment'].value_counts()
print(mapped_counts)

# List each distinct (raw, mapped) pair once, capped at ten pairs.
print("\nLabel mapping examples:")
label_pairs = df[['raw_sentiment', 'sentiment']]
mapping_examples = label_pairs.drop_duplicates().head(10)
for raw_label, mapped_label in zip(
    mapping_examples['raw_sentiment'], mapping_examples['sentiment']
):
    print(f"{raw_label} -> {mapped_label}")

raw_sentiment
neutral     3389
negative    2566
positive    2342
Name: count, dtype: int64
Mapped sentiment distribution:
sentiment
Neutral    3389
Dovish     2566
Hawkish    2342
Name: count, dtype: int64

Label mapping examples:
neutral -> Neutral
positive -> Hawkish
negative -> Dovish


In [8]:
# 5. Calculate Sentiment Index (Score-based)
# NOTE(review): the notebook header defines
#   Index = (Hawkish_score - Dovish_score) / Total_count.
# The actual sign and normalisation are decided by
# utils.utilities.calculate_net_sentiment_scores, which is not visible in this
# notebook. Confirm that the helper matches the header formula.

# Derive a monthly period from each row's date; this is the grouping key.
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.to_period('M')

# Use calculate_net_sentiment_scores for Score-based method: one value per month.
monthly_index = (
    df.groupby('month')
    .apply(calculate_net_sentiment_scores, include_groups=False)
    .reset_index(name='sentiment_index')
)
# Convert the period back to a timestamp for the saved index file.
monthly_index['month'] = monthly_index['month'].dt.to_timestamp()

# Save
# Create the output directories first so a fresh checkout does not fail on
# the write after the expensive inference step has already run.
for out_path in (OUTPUT_FILE, INDEX_OUTPUT_FILE):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

df.to_csv(OUTPUT_FILE, index=False)
monthly_index.to_csv(INDEX_OUTPUT_FILE, index=False)
print(f"Saved index to {INDEX_OUTPUT_FILE}")

Saved index to ../data/result/FinBERT/monthly_index_FinBERTori_FOMC.csv
