In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import softmax
from tqdm.auto import tqdm
import kagglehub
import os
import re

In [3]:
#get the data
path = kagglehub.dataset_download("frankossai/apple-stock-aapl-historical-financial-news-data")
csvs = [f for f in os.listdir(path) if f.endswith('.csv')]
df_mod = pd.read_csv(os.path.join(path, csvs[0])).set_index('date')
df = df_mod['content'].to_frame()[:80]



In [5]:
# get the new data

# Read the CSV
df = pd.read_csv('data_gold.csv')

# Step 1: Filter only rows where headline starts with "PRECIOUS"
df = df[df['headline'].str.startswith('PRECIOUS', na=False)].copy()

# Step 2: Remove "PRECIOUS-" from the headline
df['headline'] = df['headline'].str.replace(r'^PRECIOUS-', '', regex=True)

# Step 3: Function to clean the body text
def clean_body(text):
    if pd.isna(text):
        return text
    
    # First, normalize whitespace (tabs, newlines, etc.) to spaces for processing
    # But we want to keep certain newlines, so let's be more careful
    
    # The synopsis pattern is:
    # - Starts with bullet points (* followed by text)
    # - May have "(Updates with...)" or similar
    # - Has "By Author Name"
    # - Then "Date (Reuters) -" which marks the start of the actual article
    
    # Pattern to match everything before "(Reuters) -" and remove it
    # The article starts after "Date (Reuters) - "
    pattern = r'^.*?\b[A-Z][a-z]{2}\s+\d{1,2}\s+\(Reuters\)\s*-\s*'
    
    cleaned = re.sub(pattern, '', text, flags=re.DOTALL)
    
    return cleaned

# Step 4: Function to flatten whitespace (replace all whitespace with single spaces)
def flatten_whitespace(text):
    if pd.isna(text):
        return text
    # Replace all whitespace (newlines, tabs, multiple spaces) with single space
    return re.sub(r'\s+', ' ', text).strip()

# Apply cleaning to body
df['body'] = df['body'].apply(clean_body)

# Flatten whitespace in all text columns
df['headline'] = df['headline'].apply(flatten_whitespace)
df['body'] = df['body'].apply(flatten_whitespace)

# Convert first_created to datetime to extract year (handle mixed formats)
df['first_created'] = pd.to_datetime(df['first_created'], format='ISO8601', utc=True)
df['year'] = df['first_created'].dt.year

# Save the cleaned CSV
output_path = 'data_gold_cleaned.csv'
df.to_csv(output_path, index=False)

In [None]:
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert", clean_up_tokenization_spaces=True)
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

df = pd.read_csv("data_gold_cleaned.csv")

def analyze_sentiment(text):
    """Analyze sentiment in a single pass without chunking."""
    if not isinstance(text, str) or text.strip() == "":
        return "neutral", [0.0, 0.0, 1.0]

    text = text.strip()
    try:
        # Tokenize once; rely on truncation to fit model max length (512 tokens)
        tokenized = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        output = model(**tokenized)
        probs = softmax(output.logits, dim=1)[0].tolist()

        if len(probs) != 3:
            raise RuntimeError("Unexpected output shape")

        labels = ["positive", "negative", "neutral"]
        sentiment = labels[probs.index(max(probs))]
        return sentiment, probs
    except Exception as ex:
        print("Error in analyze_sentiment:", ex)
        return "neutral", [0.0, 0.0, 1.0]



# Process and save by year to avoid losing progress
all_results = []

for year in sorted(df['year'].unique()):
    year_df = df[df['year'] == year].copy()
    print(f"\nProcessing year {year} ({len(year_df)} articles)")

    year_sentiments = []
    year_probabilities = []

    for article_body in tqdm(year_df['body'], desc=f"Scoring {year} articles with FinBERT"):
        try:
            sentiment, scores = analyze_sentiment(article_body)
        except Exception as ex:
            print("Error scoring text:", ex)
            sentiment, scores = "neutral", [0.0, 0.0, 1.0]

        year_sentiments.append(sentiment)
        year_probabilities.append(scores)

    year_df['sentiment'] = year_sentiments
    year_df['probs'] = year_probabilities
    year_df['sent_score'] = year_df['probs'].apply(lambda p: p[0] - p[1])

    # Save this year's results immediately
    year_output_file = f"sentiment_data_cleaned/data_gold_finbert_{year}.csv"
    year_df.to_csv(year_output_file, index=False)
    print(f"Saved {year} results to {year_output_file}")

    all_results.append(year_df)

# Combine all years and save final complete file
df_complete = pd.concat(all_results, ignore_index=True)
df_complete.to_csv("sentiment_data_cleaned/data_gold_finbert_complete.csv", index=False)
print(f"\n✓ All years processed and saved to data_gold_finbert_complete.csv")


Processing year 2010 (1691 articles)


Scoring 2010 articles with FinBERT:   0%|          | 0/1691 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
df = pd.read_csv("lseg_body_finbert_output.csv")
daily_summary = (df.groupby("first_created")['sent_score'].mean().reset_index().rename(columns={'sent_score': 'daily_sentiment'}))

print(daily_summary.head())


df.to_csv("lseg_body_finbert_output.csv", index=False)
daily_summary.to_csv("lseg_body_daily_sentiment.csv", index=False)


            first_created  daily_sentiment
0  2025-01-02 22:24:34-05         0.864984
1  2025-01-03 04:58:45-05         0.452268
2  2025-01-03 10:15:01-05        -0.138539
3  2025-01-07 10:06:25-05         0.815281
4  2025-01-08 10:15:31-05        -0.440093


In [9]:
df = pd.read_csv("lseg_body_finbert_output.csv")
df.sort_values(by='sent_score').head()

Unnamed: 0,item_id,first_created,headline,body,sentiment,probs,sent_score
55,51335158,2025-01-27 20:20:16-05,Gold holds ground as traders brace for Fed rat...,Gold prices firmed on Tuesday as focus shifted...,negative,"[0.010495555587112904, 0.9727091789245605, 0.0...",-0.962214
56,51339087,2025-01-28 04:52:59-05,Gold stabilises after selloff as wider markets...,"Gold prices held steady on Tuesday, anchored b...",negative,"[0.013177186250686646, 0.969575047492981, 0.01...",-0.956398
53,51332659,2025-01-27 11:13:10-05,Gold retreats as investors liquidate positions...,"Gold prices declined over 1% on Monday, retrea...",negative,"[0.016720503568649292, 0.9661612510681152, 0.0...",-0.949441
54,51333141,2025-01-27 11:13:10-05,Gold retreats as investors liquidate positions...,"Gold prices declined more than 1% on Monday, r...",negative,"[0.01757621020078659, 0.9663547873497009, 0.01...",-0.948779
61,51345918,2025-01-28 22:45:30-05,Gold holds steady as investors eye Fed decisio...,Gold was little changed on Wednesday as market...,negative,"[0.016708968207240105, 0.9617332816123962, 0.0...",-0.945024


In [10]:
df.sort_values(by='sent_score').tail()

Unnamed: 0,item_id,first_created,headline,body,sentiment,probs,sent_score
18,51252194,2025-01-13 21:56:55-05,Trump policy uncertainty lifts gold; US inflat...,"Gold prices gained on Tuesday, buoyed by uncer...",positive,"[0.9136364459991455, 0.06271181255578995, 0.02...",0.850925
0,51197718,2025-01-02 22:24:34-05,Gold set for weekly rise; market awaits Trump'...,Gold edged up on Friday on a softer U.S. dolla...,positive,"[0.9196024537086487, 0.054618239402770996, 0.0...",0.864984
17,51250795,2025-01-13 21:56:55-05,Trump policy uncertainty lifts gold; US inflat...,"Gold prices gained on Tuesday, buoyed by uncer...",positive,"[0.9313398599624634, 0.04638082906603813, 0.02...",0.884959
66,51358621,2025-01-30 05:38:45-05,Safe-haven gold rises on Trump tariff worries,Gold prices rose on Thursday as investors worr...,positive,"[0.9375680685043335, 0.03505799546837807, 0.02...",0.90251
65,51356947,2025-01-30 05:38:45-05,Safe-haven gold rises amid Trump tariff worries,Safe-haven gold prices rose on Thursday as inv...,positive,"[0.9371510744094849, 0.03357581049203873, 0.02...",0.903575


In [5]:
# Check what columns df_mod has and compare original sentiment values
print("df_mod columns:", df_mod.columns.tolist())
print("\ndf_mod shape:", df_mod.shape)
print("\ndf_mod sample with all columns:")
print(df_mod.head())

if 'sentiment' in df_mod.columns:
    print("\n--- Comparison: Original sentiment vs FinBERT output ---")
    comparison = df.copy()
    comparison['original_sentiment'] = df_mod.loc[df.index, 'sentiment']
    print(comparison[['content', 'original_sentiment', 'sentiment', 'sent_score']].head(10))

df_mod columns: ['title', 'content', 'link', 'symbols', 'tags', 'sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos']

df_mod shape: (29752, 9)

df_mod sample with all columns:
                                                                       title  \
date                                                                           
2024-11-27T16:39:00+00:00  Berkshire Stock Hits Record Even as Company Re...   
2024-11-26T00:00:00+00:00                      What Is a Stock Market Index?   
2024-11-26T00:00:00+00:00  Could Investing $1,000 in Apple Make You a Mil...   
2024-11-26T00:00:00+00:00                       Dow Jones Industrial Average   
2024-11-26T00:00:00+00:00                         What Is the S&P 500 Index?   

                                                                     content  \
date                                                                           
2024-11-27T16:39:00+00:00  Warren Buffett’s caution, his advancing age, a...   
202

In [45]:
# First, check what columns df actually has
print("df columns:", df.columns.tolist())
print("df shape:", df.shape)
print("\ndf.head():")
print(df.head())

if 'sentiment' not in df.columns:
    print("\n⚠️  WARNING: df doesn't have sentiment columns. Rerun cell 3 to compute sentiment scores.")

df columns: ['content']
df shape: (80, 1)

df.head():
                                                                     content
date                                                                        
2024-11-27T16:39:00+00:00  Warren Buffett’s caution, his advancing age, a...
2024-11-26T00:00:00+00:00                      What Is a Stock Market Index?
2024-11-26T00:00:00+00:00  Could Investing $1,000 in Apple Make You a Mil...
2024-11-26T00:00:00+00:00                       Dow Jones Industrial Average
2024-11-26T00:00:00+00:00                         What Is the S&P 500 Index?



In [None]:
# Check what the Kaggle dataset's sentiment values look like for the first 80 rows
print("=== Kaggle Dataset Sentiment Distribution (first 80 rows) ===\n")
kaggle_80 = df_mod.iloc[:80]

print(f"sentiment_polarity stats:")
print(f"  Mean: {kaggle_80['sentiment_polarity'].mean():.4f}")
print(f"  Min: {kaggle_80['sentiment_polarity'].min():.4f}")
print(f"  Max: {kaggle_80['sentiment_polarity'].max():.4f}")
print(f"  Median: {kaggle_80['sentiment_polarity'].median():.4f}")

print(f"\nsentiment_pos stats:")
print(f"  Mean: {kaggle_80['sentiment_pos'].mean():.4f}")
print(f"  Max: {kaggle_80['sentiment_pos'].max():.4f}")

print(f"\nsentiment_neg stats:")
print(f"  Mean: {kaggle_80['sentiment_neg'].mean():.4f}")
print(f"  Max: {kaggle_80['sentiment_neg'].max():.4f}")

print(f"\nsentiment_neu stats:")
print(f"  Mean: {kaggle_80['sentiment_neu'].mean():.4f}")
print(f"  Min: {kaggle_80['sentiment_neu'].min():.4f}")

# Determine label by max score
kaggle_80_copy = kaggle_80.copy()
kaggle_80_copy['label'] = kaggle_80[['sentiment_pos', 'sentiment_neg', 'sentiment_neu']].idxmax(axis=1).str.replace('sentiment_', '')
print("\nKaggle sentiment label distribution:")
print(kaggle_80_copy['label'].value_counts())

print("\n=== Sample articles with their Kaggle sentiment ===")
print(kaggle_80_copy[['title', 'sentiment_polarity', 'sentiment_pos', 'sentiment_neg', 'sentiment_neu', 'label']].head(10))