In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm

# === CONFIG ===
# Parquet file that holds the news items to score.
PARQUET_INPUT = "financial_news.parquet"

# Column of the input file whose text is classified; change this if needed.
TEXT_COLUMN = "headline"

# Parquet file the scored rows are written to.
OUTPUT_FILE = "financial_news_with_sentiment.parquet"

# Number of texts sent through the model per batch.
BATCH_SIZE = 32

# === Load FinBERT ===
print("Loading FinBERT model...")
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# .to() and .eval() each return the module itself, so the calls can be chained:
# move the weights to the chosen device, then switch to inference mode.
model = (
    AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
    .to(device)
    .eval()
)

# === Load dataset ===
print("Loading data...")
df = pd.read_parquet(PARQUET_INPUT)

# Coerce the text column to str and hand the tokenizer a plain Python list.
# NOTE(review): astype(str) also stringifies missing values (presumably
# NaN -> "nan") instead of dropping them -- confirm that is intended.
texts = (
    df[TEXT_COLUMN]
    .astype(str)
    .tolist()
)

# === Define batching function ===
def batch_sentiment_analysis(texts, batch_size=32):
    """Classify the sentiment of each text with the loaded FinBERT model.

    Texts are tokenized and scored in consecutive batches using the
    module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: List of strings to classify.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A ``(sentiments, confidences)`` tuple of two lists in the same order
        as ``texts``: the predicted label of each text (the lower-cased name
        from ``model.config.id2label``) and the softmax probability of that
        label.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Take the class names from the checkpoint's own config instead of
    # hard-coding an index order: ProsusAI/finbert publishes
    # {0: positive, 1: negative, 2: neutral}, so a hand-written
    # negative/neutral/positive mapping mislabels every prediction.
    # Built once here because it does not change between batches.
    label_map = {
        int(idx): str(name).lower()
        for idx, name in model.config.id2label.items()
    }

    sentiments = []
    confidences = []

    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)

        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = softmax(outputs.logits, dim=1)

        # A single reduction yields both the winning probability and its class index.
        top_probs, top_classes = torch.max(probs, dim=1)

        sentiments.extend(label_map[cls] for cls in top_classes.tolist())
        confidences.extend(top_probs.tolist())

    return sentiments, confidences

# === Run analysis ===
# Score every text; the two returned lists follow the order of `texts`.
print("Analyzing sentiment...")
sentiments, confidences = batch_sentiment_analysis(texts, batch_size=BATCH_SIZE)

# === Save output ===
# Store the predictions in the "sentiment" and "confidence" columns of the
# loaded DataFrame, then write the whole frame to OUTPUT_FILE.
df["sentiment"] = sentiments
df["confidence"] = confidences
# index=False keeps the DataFrame index out of the written file.
df.to_parquet(OUTPUT_FILE, index=False)

print(f"✅ Sentiment analysis complete! Output saved to: {OUTPUT_FILE}")
