In [6]:
# First install required libraries if not already done
!pip install transformers datasets scikit-learn torch pandas matplotlib seaborn jupyter kagglehub

In [7]:


import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Download dataset (kagglehub returns the local directory holding the files)
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
print("Path to dataset files:", path)

# Define file path
file_path = os.path.join(path, "all-data.csv")

# Read CSV with no header and proper encoding
df = pd.read_csv(file_path, engine='python', sep=',', header=None, encoding='latin-1')

# Check the first few rows to understand column order
print("First 5 rows:")
print(df.head())

# Assign column names manually based on inspection:
# column 0 holds the sentiment string, column 1 the sentence.
df.columns = ['label', 'text']

# Keep only rows whose (whitespace-stripped) label is one of the three
# expected classes, and renumber the rows in the same expression so no
# in-place mutation is performed on a filtered frame.
valid_labels = {'positive', 'neutral', 'negative'}
df['label'] = df['label'].str.strip()
df = df[df['label'].isin(valid_labels)].reset_index(drop=True)

# Split data. Stratifying on the label keeps the class proportions identical
# in the train and test parts, so the evaluation is not skewed by an unlucky
# draw of the rarer classes. The split sizes are unchanged.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Optional: print shapes
print("\nTrain shape:", train_df.shape)
print("Test shape:", test_df.shape)

model_name = "yiyanghkust/finbert-tone"  # FinBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert pandas DataFrames to Dataset objects. The split leaves a shuffled,
# non-default index on each frame; preserve_index=False stops it from being
# carried into the Dataset as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Define your label mappings correctly (only the valid ones).
# The list order fixes the integer ids: positive=0, neutral=1, negative=2.
# NOTE(review): the base checkpoint's own classification head may use a
# different label order - confirm against its model card. Fine-tuning
# re-learns the head under the mapping defined here.
labels = ['positive', 'neutral', 'negative']

# Create mappings
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

print("label2id:", label2id)
print("id2label:", id2label)

def tokenize_function(examples):
    """Tokenize a single example and attach its integer class id.

    Intended for ``Dataset.map(..., batched=False)``, so ``examples`` is one
    row with a string ``"label"`` and a string ``"text"``.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an extra
        ``"labels"`` entry holding ``label2id[examples["label"]]``.

    Raises:
        KeyError: if the row's label is not a key of ``label2id``.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    # Return the label as part of the result instead of writing it onto the
    # input row: the mapped dataset then gets a 'labels' column regardless of
    # whether the caller merges in-place changes to its argument.
    encoded["labels"] = label2id[examples["label"]]
    return encoded

# Tokenize each split one example at a time, then drop the raw string
# 'label' column - the integer 'labels' column produced during mapping is the
# one the model consumes.
tokenized_train = train_dataset.map(tokenize_function, batched=False).remove_columns(["label"])
tokenized_test = test_dataset.map(tokenize_function, batched=False).remove_columns(["label"])

# Load the checkpoint with a classification head sized to our label set and
# with our id<->name mappings stored in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(labels), id2label=id2label, label2id=label2id
)

# Evaluate and checkpoint once per epoch; because load_best_model_at_end is
# set, the best checkpoint is restored when training finishes.
training_args = TrainingArguments(
    output_dir="finbert-finetuned",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation / checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded later with from_pretrained().
model.save_pretrained("finbert-finetuned-final")
tokenizer.save_pretrained("finbert-finetuned-final")


from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Make predictions. The ground truth is taken from the prediction output
# itself (label_ids), which is returned in the same row order as the logits,
# rather than re-read from the dataset column.
predictions = trainer.predict(tokenized_test)
preds = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Define the list of all possible labels (ids) and their display names, in
# matching order. Materialised as lists because the sklearn parameters are
# documented as sequences, not dict views.
all_labels = list(label2id.values())
label_names = list(label2id.keys())

# Print report
print(classification_report(true_labels, preds, target_names=label_names, labels=all_labels, zero_division=0))

# Plot confusion matrix
cm = confusion_matrix(true_labels, preds, labels=all_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap=plt.cm.Blues)
plt.title("FinBERT Confusion Matrix")
plt.show()


Downloading from https://www.kaggle.com/api/v1/datasets/download/ankurzing/sentiment-analysis-for-financial-news?dataset_version_number=5...


100%|██████████| 903k/903k [00:01<00:00, 782kB/s]

Extracting files...





Path to dataset files: C:\Users\Simanta\.cache\kagglehub\datasets\ankurzing\sentiment-analysis-for-financial-news\versions\5
First 5 rows:
          0                                                  1
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...

Train shape: (3876, 2)
Test shape: (970, 2)


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

label2id: {'positive': 0, 'neutral': 1, 'negative': 2}
id2label: {0: 'positive', 1: 'neutral', 2: 'negative'}


Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [8]:
import requests
import pandas as pd
import torch
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Load your custom model
# model_path is a relative directory name; it must already exist and contain
# a saved model and tokenizer. from_pretrained raises OSError when the name is
# neither a local folder nor a model identifier on the Hugging Face hub.
model_path = "finbert-finetuned-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Put the model in evaluation mode; it is only used for inference below.
model.eval()

# 2. Fetch news from FMP with proper error handling
def fetch_news(symbol, api_key, days=3730, max_pages=50, timeout=30):
    """Page through the FMP stock-news endpoint and collect recent articles.

    Args:
        symbol: Ticker symbol sent as the ``symbol`` query parameter.
        api_key: API key sent as the ``apikey`` query parameter.
        days: Articles whose ``publishedDate`` is more than this many days
            before the current UTC time are discarded.
        max_pages: Upper bound on the number of pages requested, so a
            misbehaving API cannot keep the loop running forever.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of the article dicts (as returned by the API) that are newer
        than the cutoff. Entries with a missing or unparseable
        ``publishedDate`` are skipped.

    Paging stops at the first empty page, at the first page containing an
    article older than the cutoff, on any request/decoding error, on a payload
    that is not a list, or after ``max_pages`` pages.
    NOTE(review): stopping at the first too-old article presumes the API
    returns articles newest-first - confirm against the API documentation.
    """
    # Local import: the notebook's import cell only brings in datetime/timedelta.
    from datetime import timezone

    date_format = "%Y-%m-%d %H:%M:%S"
    # Naive UTC "now" (publishedDate values are parsed as naive datetimes);
    # datetime.utcnow() is deprecated, so derive it from an aware timestamp.
    cutoff = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=days)
    url = "https://financialmodelingprep.com/api/v3/stock_news"

    all_articles = []
    page = 0

    while True:
        # Safety limit to prevent unbounded paging
        if page >= max_pages:
            print("Reached maximum page limit")
            break

        try:
            # Let requests build the query string so values are URL-encoded.
            response = requests.get(
                url,
                params={"symbol": symbol, "page": page, "apikey": api_key},
                timeout=timeout,
            )
            response.raise_for_status()
            news = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Error fetching page {page}: {e}")
            break

        if not news:
            break  # No more articles

        if not isinstance(news, list):
            # e.g. a JSON object carrying an error message instead of articles
            print(f"Unexpected response on page {page}: {news}")
            break

        # Keep articles newer than the cutoff. Track "older than cutoff"
        # separately from "could not parse", so one malformed entry does not
        # end the pagination early.
        reached_cutoff = False
        for article in news:
            try:
                article_date = datetime.strptime(article['publishedDate'], date_format)
            except (KeyError, TypeError, ValueError):
                continue
            if article_date < cutoff:
                reached_cutoff = True
                continue
            all_articles.append(article)

        if reached_cutoff:
            break  # This page contained articles beyond cutoff date

        page += 1

    print(f"Found {len(all_articles)} articles within {days} days")
    return all_articles

# 3. Analyze sentiment with YOUR model
def analyze_sentiment(articles):
    """Score each article with the loaded model and return one row per article.

    The text scored is the article's ``content`` when it is non-empty,
    otherwise its ``title``; articles with neither are skipped. The score is
    ``P(positive) - P(negative)`` from a softmax over the model's logits.

    Args:
        articles: Iterable of article dicts with at least ``publishedDate``
            (``"%Y-%m-%d %H:%M:%S"``) and ``content`` or ``title``.

    Returns:
        A DataFrame with columns ``date``, ``sentiment``, ``title`` and
        ``source`` (the article's ``site``, or ``'Unknown'``). The columns are
        present even when no article could be scored. An article that raises
        while being processed is reported and skipped (best effort).
    """
    columns = ['date', 'sentiment', 'title', 'source']

    # Read the class indices from the model's own label mapping rather than
    # assuming a fixed order; fall back to positive=0 / negative=2 when the
    # config does not name those classes.
    config = getattr(model, 'config', None)
    label_ids = {
        str(name).lower(): idx
        for name, idx in (getattr(config, 'label2id', None) or {}).items()
    }
    pos_idx = label_ids.get('positive', 0)
    neg_idx = label_ids.get('negative', 2)

    results = []
    for art in articles:
        try:
            # Use title if content is missing, None or empty
            text = art.get('content') or art.get('title') or ''
            if not text:
                continue

            inputs = tokenizer(text,
                               return_tensors="pt",
                               truncation=True,
                               max_length=512)
            with torch.no_grad():
                logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=1)[0]

            results.append({
                'date': datetime.strptime(art['publishedDate'], "%Y-%m-%d %H:%M:%S").date(),
                'sentiment': probs[pos_idx].item() - probs[neg_idx].item(),  # positive - negative
                'title': art.get('title', ''),
                'source': art.get('site', 'Unknown')
            })
        except Exception as e:
            # Deliberate best effort: one bad article must not abort the batch.
            print(f"Error processing article '{art.get('title', '')}': {e}")

    return pd.DataFrame(results, columns=columns)

# 4. Fetch stock prices with error handling
def fetch_prices(symbol, api_key, timeout=30):
    """Fetch the historical daily closing prices for ``symbol`` from FMP.

    Args:
        symbol: Ticker symbol placed in the URL path.
        api_key: API key sent as the ``apikey`` query parameter.
        timeout: Request timeout in seconds.

    Returns:
        A DataFrame with a datetime ``date`` column and a ``close`` column, in
        the order the API returned the rows. An empty DataFrame is returned
        (after printing a message) when the request fails, the body is not
        valid JSON, or the payload has no usable ``historical`` records.
    """
    url = f"https://financialmodelingprep.com/api/v3/historical-price-full/{symbol}"
    try:
        response = requests.get(url, params={"apikey": api_key}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Failed to fetch prices: {e}")
        return pd.DataFrame()

    # Check that the response contains a non-empty list of historical rows;
    # an empty list would otherwise build a column-less frame and fail below.
    historical = data.get('historical') if isinstance(data, dict) else None
    if not historical:
        print(f"No price data found: {data}")
        return pd.DataFrame()

    prices = pd.DataFrame(historical)
    if not {'date', 'close'}.issubset(prices.columns):
        print(f"No price data found: {data}")
        return pd.DataFrame()

    prices['date'] = pd.to_datetime(prices['date'])
    return prices[['date', 'close']]

# 5. Main workflow
import os  # imported here because this cell's import block does not include it

# Read the key from the environment so it is not stored in the notebook; the
# placeholder is kept as a fallback for anyone who still edits it by hand.
API_KEY = os.environ.get("FMP_API_KEY", "API_KEY_HERE")
SYMBOL = "AAPL"

# Fetch and process data
print("Fetching news...")
news = fetch_news(SYMBOL, API_KEY, days=3507)
print(f"Found {len(news)} articles")

print("Analyzing sentiment...")
sentiment_df = analyze_sentiment(news)
print(f"Processed {len(sentiment_df)} articles")

print("Fetching stock prices...")
prices_df = fetch_prices(SYMBOL, API_KEY)
print(f"Found {len(prices_df)} price records")

if sentiment_df.empty or prices_df.empty:
    print("Insufficient data for visualization")
else:
    # Aggregate daily sentiment (mean score of all articles on the same day)
    daily_sentiment = sentiment_df.groupby('date')['sentiment'].mean().reset_index()
    daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])

    # Merge with prices; the left join keeps every price row, so days without
    # news get a missing sentiment value.
    merged_df = pd.merge(
        prices_df,
        daily_sentiment,
        on='date',
        how='left'
    )

    # Filter to the period where we have sentiment data, padded by 30 days
    # before and 5 days after.
    sentiment_start = daily_sentiment['date'].min()
    sentiment_end = daily_sentiment['date'].max()
    chart_start = sentiment_start - timedelta(days=30)
    chart_end = sentiment_end + timedelta(days=5)

    in_window = (merged_df['date'] >= chart_start) & (merged_df['date'] <= chart_end)
    # Take an explicit copy so the column assignments below act on this frame
    # itself, and put the rows in chronological order so the rolling window
    # further down always looks back over the preceding rows.
    filtered_df = merged_df.loc[in_window].sort_values('date').copy()

    # Fill missing sentiment with 0 (neutral). Assigning the result back
    # (instead of an in-place fillna on a selected column) is what actually
    # updates the frame under pandas Copy-on-Write.
    filtered_df['sentiment'] = filtered_df['sentiment'].fillna(0)

    # Add rolling average for sentiment
    filtered_df['sentiment_ma'] = filtered_df['sentiment'].rolling(7, min_periods=1).mean()

    # 6. Visualization - single plot with dual axes
    fig, ax1 = plt.subplots(figsize=(14, 7))

    # Plot stock prices
    ax1.plot(filtered_df['date'], filtered_df['close'], 'b-', linewidth=2, label='Stock Price')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Stock Price', color='b')
    ax1.tick_params('y', colors='b')
    ax1.grid(True, linestyle='--', alpha=0.7)
    ax1.set_title(f'{SYMBOL} Stock Price vs. News Sentiment')

    # Create second axis for sentiment
    ax2 = ax1.twinx()

    # Plot sentiment rolling average
    ax2.plot(filtered_df['date'], filtered_df['sentiment_ma'],
            'r-', linewidth=2,
            label='7-day Sentiment Avg')
    ax2.set_ylabel('Sentiment Score (7-day MA)', color='r')
    ax2.tick_params('y', colors='r')

    # Add horizontal line at 0 for neutral sentiment
    ax2.axhline(0, color='gray', linestyle='--', linewidth=0.8, label='Neutral')

    # Add markers for the actual sentiment period
    ax2.axvline(sentiment_start, color='g', linestyle=':', alpha=0.7, label='Sentiment Start')
    ax2.axvline(sentiment_end, color='r', linestyle=':', alpha=0.7, label='Sentiment End')

    # Combine legends from both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

    plt.tight_layout()
    plt.show()

    # Show sample of the most positive/negative news
    print("\nTop 3 Positive News:")
    print(sentiment_df.nlargest(3, 'sentiment')[['date', 'title', 'sentiment']])

    print("\nTop 3 Negative News:")
    print(sentiment_df.nsmallest(3, 'sentiment')[['date', 'title', 'sentiment']])

OSError: finbert-finetuned-final is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`