In [12]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from tqdm.auto import tqdm
import kagglehub
import os

In [17]:
# Download the Kaggle AAPL news dataset and load the article text, indexed by
# each article's 'date' value.
path = kagglehub.dataset_download("frankossai/apple-stock-aapl-historical-financial-news-data")

# os.listdir returns entries in arbitrary order, so sort to make the choice of
# "first CSV" reproducible across machines and re-runs.
csvs = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csvs:
    # Fail with a clear message instead of an IndexError on csvs[0].
    raise FileNotFoundError(f"No CSV files found in downloaded dataset at {path!r}")

df = pd.read_csv(os.path.join(path, csvs[0])).set_index('date')['content'].to_frame()

In [18]:
# Load the FinBERT-tone tokenizer first, then the classification model, both
# from the same Hugging Face checkpoint.
tokenizer, model = (
    loader.from_pretrained("yiyanghkust/finbert-tone")
    for loader in (AutoTokenizer, AutoModelForSequenceClassification)
)

def _positive_negative_neutral_indices(id2label):
    """Return the logit column indices for [positive, negative, neutral].

    The mapping is read from the model config's ``id2label`` so the result
    does not depend on a hard-coded class order. If the config does not carry
    recognisable names, fall back to the order documented on the
    yiyanghkust/finbert-tone model card (0 = neutral, 1 = positive,
    2 = negative).
    """
    by_name = {
        str(name).strip().lower(): int(idx)
        for idx, name in (id2label or {}).items()
    }
    try:
        return [by_name["positive"], by_name["negative"], by_name["neutral"]]
    except KeyError:
        return [1, 2, 0]


def analyze_sentiment(text, chunk_size=400, batch_size=16):
    """Classify the tone of ``text`` with the module-level FinBERT model.

    The text is cut into ``chunk_size``-character pieces, each piece is
    scored, and the per-piece probabilities are averaged.

    Args:
        text: Article body. Anything that is not a non-blank string is
            treated as having no signal.
        chunk_size: Number of characters per chunk (default 400).
        batch_size: Number of chunks sent through the model per forward pass.

    Returns:
        A ``(label, probs)`` tuple where ``label`` is one of ``"positive"``,
        ``"negative"`` or ``"neutral"`` and ``probs`` is a list ordered
        ``[positive, negative, neutral]``. Blank or non-string input yields
        ``("neutral", [0.0, 0.0, 1.0])``.

    Raises:
        RuntimeError: If the model does not emit exactly three classes.
    """
    if not isinstance(text, str):
        return "neutral", [0.0, 0.0, 1.0]

    text = text.strip()
    if not text:
        return "neutral", [0.0, 0.0, 1.0]

    # Character-based chunking: a chunk may start or end in the middle of a word.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Reorder the model's columns into the [positive, negative, neutral]
    # layout that callers rely on (e.g. score = probs[0] - probs[1]).
    column_order = _positive_negative_neutral_indices(
        getattr(model.config, "id2label", None)
    )

    chunk_probs = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            tokenized = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            logits = model(**tokenized).logits

            if logits.shape[-1] != 3:
                raise RuntimeError("Unexpected output shape")

            chunk_probs.append(softmax(logits, dim=1)[:, column_order])

    averaged_probs = torch.cat(chunk_probs).mean(dim=0).tolist()
    labels = ["positive", "negative", "neutral"]
    sentiment = labels[averaged_probs.index(max(averaged_probs))]

    return sentiment, averaged_probs


all_sentiments = []
all_probabilities = []

print(df.head())
print("Total rows pulled:", len(df))

# Score every article body. A failure on one article is reported and recorded
# as neutral so a single bad row does not abort the whole run.
for article_body in tqdm(df['content'], desc="Scoring articles with FinBERT"):
    try:
        sentiment, scores = analyze_sentiment(article_body)
    except Exception as ex:
        print("Error scoring text:", ex)
        sentiment, scores = "neutral", [0.0, 0.0, 1.0]

    all_sentiments.append(sentiment)
    all_probabilities.append(scores)

df['sentiment'] = all_sentiments
df['probs'] = all_probabilities

# Net tone per article: first probability minus second
# (scores are treated as [positive, negative, neutral]).
df['sent_score'] = df['probs'].apply(lambda p: p[0] - p[1])

# The 'date' index holds full ISO timestamps (e.g. 2024-11-27T16:39:00+00:00).
# Grouping on it directly would give one group per timestamp, so collapse each
# timestamp to its UTC calendar day before averaging.
article_days = pd.to_datetime(df.index, utc=True).date
daily_summary = (
    df.groupby(article_days)['sent_score']
    .mean()
    .rename_axis('date')
    .reset_index()
    .rename(columns={'sent_score': 'daily_sentiment'})
)

print(daily_summary.head())

# NOTE(review): these file names are also written by the LSEG database cell
# further down; whichever cell runs last overwrites the other's output.
# Keep the index here: it is the only place the article date is stored, and
# index=False would drop it from the per-article file.
df.to_csv("lseg_body_finbert_output.csv", index=True)
daily_summary.to_csv("lseg_body_daily_sentiment.csv", index=False)




                                                                     content
date                                                                        
2024-11-27T16:39:00+00:00  Warren Buffettâ€™s caution, his advancing age, a...
2024-11-26T00:00:00+00:00                      What Is a Stock Market Index?
2024-11-26T00:00:00+00:00  Could Investing $1,000 in Apple Make You a Mil...
2024-11-26T00:00:00+00:00                       Dow Jones Industrial Average
2024-11-26T00:00:00+00:00                         What Is the S&P 500 Index?
Total rows pulled: 29752


Scoring articles with FinBERT:   0%|          | 0/29752 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:


# Load the FinBERT-tone tokenizer first, then the classification model, both
# from the same Hugging Face checkpoint.
tokenizer, model = (
    loader.from_pretrained("yiyanghkust/finbert-tone")
    for loader in (AutoTokenizer, AutoModelForSequenceClassification)
)

def _positive_negative_neutral_indices(id2label):
    """Return the logit column indices for [positive, negative, neutral].

    The mapping is read from the model config's ``id2label`` so the result
    does not depend on a hard-coded class order. If the config does not carry
    recognisable names, fall back to the order documented on the
    yiyanghkust/finbert-tone model card (0 = neutral, 1 = positive,
    2 = negative).
    """
    by_name = {
        str(name).strip().lower(): int(idx)
        for idx, name in (id2label or {}).items()
    }
    try:
        return [by_name["positive"], by_name["negative"], by_name["neutral"]]
    except KeyError:
        return [1, 2, 0]


def analyze_sentiment(text, chunk_size=400, batch_size=16):
    """Classify the tone of ``text`` with the module-level FinBERT model.

    The text is cut into ``chunk_size``-character pieces, each piece is
    scored, and the per-piece probabilities are averaged.

    Args:
        text: Article body. Anything that is not a non-blank string is
            treated as having no signal.
        chunk_size: Number of characters per chunk (default 400).
        batch_size: Number of chunks sent through the model per forward pass.

    Returns:
        A ``(label, probs)`` tuple where ``label`` is one of ``"positive"``,
        ``"negative"`` or ``"neutral"`` and ``probs`` is a list ordered
        ``[positive, negative, neutral]``. Blank or non-string input yields
        ``("neutral", [0.0, 0.0, 1.0])``.

    Raises:
        RuntimeError: If the model does not emit exactly three classes.
    """
    if not isinstance(text, str):
        return "neutral", [0.0, 0.0, 1.0]

    text = text.strip()
    if not text:
        return "neutral", [0.0, 0.0, 1.0]

    # Character-based chunking: a chunk may start or end in the middle of a word.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Reorder the model's columns into the [positive, negative, neutral]
    # layout that callers rely on (e.g. score = probs[0] - probs[1]).
    column_order = _positive_negative_neutral_indices(
        getattr(model.config, "id2label", None)
    )

    chunk_probs = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            tokenized = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            logits = model(**tokenized).logits

            if logits.shape[-1] != 3:
                raise RuntimeError("Unexpected output shape")

            chunk_probs.append(softmax(logits, dim=1)[:, column_order])

    averaged_probs = torch.cat(chunk_probs).mean(dim=0).tolist()
    labels = ["positive", "negative", "neutral"]
    sentiment = labels[averaged_probs.index(max(averaged_probs))]

    return sentiment, averaged_probs


# psycopg2 is only needed by this cell and was not imported anywhere in the
# notebook, so a fresh kernel would fail with NameError at connect().
import psycopg2

all_sentiments = []
all_probabilities = []

# One row per distinct (trimmed) headline, earliest version first, for
# English items tagged both N2:US and N2:STX in the given date window.
sql = """
SELECT *
FROM (
    SELECT DISTINCT ON (trim(d.headline))
           d.item_id,
           d.first_created,
           d.headline,
           d.body
    FROM item_data d
    WHERE d.first_created BETWEEN '2023-11-01' AND '2024-04-30'
	  AND d.item_language = 'en'
      AND d.item_id IN (SELECT item_id FROM data_subject WHERE subject = 'N2:US')
      AND d.item_id IN (
          SELECT item_id 
          FROM data_subject 
          WHERE subject IN (
              -- could include GDP, inflation, etc. later
              'N2:STX'
          )
      )
    ORDER BY trim(d.headline), d.first_created
) AS filtered_news
ORDER BY filtered_news.first_created;
"""

# Connection settings come from the environment so real credentials never
# have to be written into the notebook; the defaults are the original values.
conn = psycopg2.connect(
    host=os.environ.get("NEWS_DB_HOST", "fscstor02.fsc.stevens.edu"),
    port=os.environ.get("NEWS_DB_PORT", "5432"),
    dbname=os.environ.get("NEWS_DB_NAME", "machine_readable_news"),
    user=os.environ.get("NEWS_DB_USER", "username"),
    password=os.environ.get("NEWS_DB_PASSWORD", "password"),
)

print("Connected to the database.")

# Close the connection as soon as the rows are in memory, and do so even if
# the query fails, rather than holding it open for the whole scoring loop.
try:
    df = pd.read_sql(sql, conn)
finally:
    conn.close()
    print("Connection closed.")

print(df.head())
print("Total rows pulled:", len(df))


# Keep only the columns needed for scoring, drop rows with missing values,
# and derive the UTC calendar day used for the daily aggregate.
df = df[['item_id', 'first_created', 'body']].dropna()
df['first_created'] = pd.to_datetime(df['first_created'], utc=True)
df['date'] = df['first_created'].dt.date

# A failure on one article is reported and recorded as neutral so a single
# bad row does not abort the whole run.
for article_body in tqdm(df['body'], desc="Scoring articles with FinBERT"):
    try:
        sentiment, scores = analyze_sentiment(article_body)
    except Exception as ex:
        print("Error scoring text:", ex)
        sentiment, scores = "neutral", [0.0, 0.0, 1.0]

    all_sentiments.append(sentiment)
    all_probabilities.append(scores)

df['sentiment'] = all_sentiments
df['probs'] = all_probabilities

# Net tone per article: first probability minus second
# (scores are treated as [positive, negative, neutral]).
df['sent_score'] = df['probs'].apply(lambda p: p[0] - p[1])


daily_summary = (
    df.groupby("date")['sent_score']
    .mean()
    .reset_index()
    .rename(columns={'sent_score': 'daily_sentiment'})
)

print(daily_summary.head())


df.to_csv("lseg_body_finbert_output.csv", index=False)
daily_summary.to_csv("lseg_body_daily_sentiment.csv", index=False)