In [1]:
!pip install transformers torch pandas matplotlib seaborn nltk huggingface_hub pandas_datareader tqdm



In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, pipeline
import torch
from huggingface_hub import login
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas_datareader.data as web
import datetime
import os

# Apply seaborn's "whitegrid" look to the plots drawn later in the notebook
sns.set_style(style="whitegrid")

In [3]:
# Login to Hugging Face (required for gated models like FOMC-RoBERTa).
# If the HF_TOKEN environment variable is set, it is used so the notebook can run
# end-to-end without manual input; otherwise login() falls back to its interactive
# prompt, where you paste your HF access token.
# Note: logging in is not sufficient by itself -- your account must also have been
# granted access to the gated repo, or model loading fails with a 403 error.
login(token=os.environ.get("HF_TOKEN") or None)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Load Data
# The data directory defaults to the original local path; set the FED_DATA_DIR
# environment variable to point somewhere else without editing the notebook.
DATA_DIR = os.environ.get("FED_DATA_DIR", r"e:\Textming\data")
file_path = os.path.join(DATA_DIR, "fed_minutes_sentences_structured.csv")
df_original = pd.read_csv(file_path)

# Fail early with a clear message if the columns read by run_sentiment_pipeline
# ('sentence_text' for inference, 'date' for aggregation) are absent.
missing_cols = {"sentence_text", "date"} - set(df_original.columns)
assert not missing_cols, f"Input CSV is missing required columns: {sorted(missing_cols)}"

print(f"Loaded {len(df_original)} sentences.")
df_original.head()

FileNotFoundError: [Errno 2] No such file or directory: 'e:\\Textming\\data\\fed_minutes_sentences_structured.csv'

In [None]:
def get_sentiment_label(result, model_key):
    """Normalize a classifier prediction to 'Hawkish', 'Dovish' or 'Neutral'.

    Args:
        result: A prediction dict containing at least a 'label' entry
            (the label is lower-cased before matching).
        model_key: Model name; the substrings "finbert" / "roberta"
            (case-insensitive) select the mapping scheme.

    Returns:
        For FinBERT models: 'negative' -> 'Hawkish', 'positive' -> 'Dovish',
        anything else -> 'Neutral'.
        For RoBERTa models: labels containing "hawkish"/"dovish" map to the
        matching class; generic ids map as label_0 -> 'Dovish',
        label_1 -> 'Hawkish', label_2 -> 'Neutral'; anything else -> 'Neutral'.
        For any other model: the lower-cased label in title case.
    """
    label = str(result['label']).lower()
    key = model_key.lower()

    if "finbert" in key:
        # This notebook treats negative financial tone as hawkish and positive as dovish.
        finbert_map = {"negative": "Hawkish", "positive": "Dovish"}
        return finbert_map.get(label, "Neutral")

    if "roberta" in key:
        # Descriptive label names, if the model config provides them.
        if "hawkish" in label:
            return "Hawkish"
        if "dovish" in label:
            return "Dovish"
        # FOMC-RoBERTa ships generic ids (LABEL_0/1/2). Without this mapping every
        # prediction would silently collapse to "Neutral".
        generic_id_map = {"label_0": "Dovish", "label_1": "Hawkish", "label_2": "Neutral"}
        return generic_id_map.get(label, "Neutral")

    return label.title()

def calculate_index(group):
    """Net hawkishness of a group: (Hawkish - Dovish) / (Hawkish + Dovish + Neutral).

    Only the three labels above are counted from the group's 'sentiment' column.
    Returns 0 when none of them occur.
    """
    tally = group['sentiment'].value_counts()
    n_hawkish, n_dovish, n_neutral = (
        tally.get(name, 0) for name in ('Hawkish', 'Dovish', 'Neutral')
    )
    denominator = n_hawkish + n_dovish + n_neutral

    # Guard against division by zero for groups with no recognised labels.
    return (n_hawkish - n_dovish) / denominator if denominator != 0 else 0

def run_sentiment_pipeline(model_name, output_prefix, df_input, output_dir=None):
    """Classify every sentence with a Hugging Face model and build a per-date sentiment index.

    Args:
        model_name: Hugging Face model id. If it contains "roberta"
            (case-insensitive) the RoBERTa-specific loading options are used.
        output_prefix: Prefix for the two CSV file names and the progress-bar label.
        df_input: DataFrame with a 'sentence_text' column (text to classify) and
            a 'date' column (parseable by ``pd.to_datetime``). It is not modified.
        output_dir: Directory for the output CSVs. It is created if missing.
            When None (default) the original hard-coded location
            ``e:\\Textming\\data`` is used unchanged.

    Returns:
        DataFrame with one row per distinct 'date': the 'sentiment_index'
        computed by ``calculate_index`` plus one count column per sentiment
        label observed.

    Side effects:
        Prints progress messages and writes ``<prefix>_inference_results.csv``
        (sentence-level predictions) and ``<prefix>_monthly_index.csv``
        (per-date aggregate).
    """
    print(f"\n{'='*50}\nProcessing with model: {model_name}\n{'='*50}")

    # Load Model
    # RoBERTa gets the user-specified tokenizer/model options; anything else
    # (FinBERT) is loaded with library defaults.
    if "roberta" in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, do_basic_tokenize=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)

    # device=0 selects the first CUDA GPU, -1 selects CPU.
    device = 0 if torch.cuda.is_available() else -1
    nlp = pipeline('text-classification', model=model, tokenizer=tokenizer, config=config, device=device, framework="pt")

    # Run Inference
    batch_size = 32
    sentences = df_input['sentence_text'].tolist()
    predictions = []

    for i in tqdm(range(0, len(sentences), batch_size), desc=f"Inference ({output_prefix})"):
        batch = sentences[i:i + batch_size]
        try:
            batch_preds = nlp(batch, truncation=True)
            predictions.extend(batch_preds)
        except Exception as e:
            # Best effort: report the failure and mark the whole batch as
            # Neutral with score 0.0 so row alignment is preserved.
            print(f"Error in batch {i}: {e}")
            predictions.extend([{'label': 'Neutral', 'score': 0.0}] * len(batch))

    # Align Lengths
    # Reset to a 0..n-1 index: pd.concat(axis=1) aligns on index labels, so a
    # filtered/sliced df_input would otherwise be paired with the wrong
    # predictions (or padded with NaN rows).
    df_run = df_input.iloc[:len(predictions)].reset_index(drop=True)

    # Map Labels
    df_preds = pd.DataFrame(
        [
            {"sentiment": get_sentiment_label(pred, model_name), "score": pred['score']}
            for pred in predictions
        ],
        columns=["sentiment", "score"],
    )
    df_results = pd.concat([df_run, df_preds], axis=1)

    # Calculate the index per distinct date
    df_results['date'] = pd.to_datetime(df_results['date'])
    # Select the 'sentiment' column explicitly so the grouping column is not
    # passed to calculate_index (deprecated behaviour in recent pandas).
    monthly_index = (
        df_results.groupby('date')[['sentiment']]
        .apply(calculate_index)
        .reset_index(name='sentiment_index')
    )
    monthly_counts = df_results.groupby(['date', 'sentiment']).size().unstack(fill_value=0).reset_index()
    monthly_final = pd.merge(monthly_index, monthly_counts, on='date')

    # Save Files
    if output_dir is None:
        # Legacy default location, kept byte-for-byte for backward compatibility.
        res_path = fr"e:\Textming\data\{output_prefix}_inference_results.csv"
        idx_path = fr"e:\Textming\data\{output_prefix}_monthly_index.csv"
    else:
        os.makedirs(output_dir, exist_ok=True)
        res_path = os.path.join(output_dir, f"{output_prefix}_inference_results.csv")
        idx_path = os.path.join(output_dir, f"{output_prefix}_monthly_index.csv")

    df_results.to_csv(res_path, index=False)
    monthly_final.to_csv(idx_path, index=False)
    print(f"Saved results to {res_path} and {idx_path}")

    return monthly_final

In [None]:
# Score the sentences with FOMC-RoBERTa.
# The model repo is gated: the Hugging Face login (Cell 3) must have succeeded first.
df_roberta = run_sentiment_pipeline("gtfintechlab/FOMC-RoBERTa", "fomc_roberta", df_original)


Processing with model: gtfintechlab/FOMC-RoBERTa


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/gtfintechlab/FOMC-RoBERTa.
403 Client Error. (Request ID: Root=1-694b25da-0e0eda5b1d637fd554042bc1;868c570d-7fee-4bd9-94e0-3c56c20b8c69)

Cannot access gated repo for url https://huggingface.co/gtfintechlab/FOMC-RoBERTa/resolve/main/config.json.
Your request to access model gtfintechlab/FOMC-RoBERTa is awaiting a review from the repo authors.

In [None]:
# Score the same sentences with FinBERT.
df_finbert = run_sentiment_pipeline("ProsusAI/finbert", "finbert", df_original)

In [None]:
# Compare Results
fig, ax = plt.subplots(figsize=(15, 8))

# (variable name, legend label, marker) for each model run. A series is drawn
# only when its variable exists, i.e. the corresponding run cell completed.
series_specs = [
    ('df_roberta', 'FOMC-RoBERTa', 'o'),
    ('df_finbert', 'FinBERT', 'x'),
]
for var_name, legend_label, marker_shape in series_specs:
    if var_name in locals():
        sns.lineplot(
            data=locals()[var_name],
            x='date',
            y='sentiment_index',
            label=legend_label,
            marker=marker_shape,
            alpha=0.7,
            ax=ax,
        )

ax.set_title('Fed Sentiment Index Comparison: RoBERTa vs FinBERT')
# Zero line separates net-hawkish (above) from net-dovish (below) dates.
ax.axhline(0, color='black', linestyle='--', alpha=0.3)
ax.set_ylabel('Sentiment Index (Hawkish - Dovish) / Total')
ax.legend()
plt.show()