In [2]:
# First install required libraries if not already done
!pip install transformers datasets scikit-learn torch pandas matplotlib seaborn jupyter kagglehub

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/eb/62/eb8157afb21bd229c864521c1ab4fa8e9b4f1b06bafdd8c4668a7a31b5dd/datasets-4.0.0-py3-none-any.whl.metadata
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/15/fa/c61a787e35f05f17fc10523f567677ec4eeee5f95aa4798dbbbcd9625617/scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata
  Downloading scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Obtaining dependency information for seaborn from https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl.metadata
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting jupyter
  Obtaining dependency information for jupyter from https://files.pythonhosted.org/packages/38/6


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:


import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Download dataset (returns the local folder that holds the files)
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
print("Path to dataset files:", path)

# Define file path
file_path = os.path.join(path, "all-data.csv")

# Read CSV with no header and proper encoding
df = pd.read_csv(file_path, engine='python', sep=',', header=None, encoding='latin-1')

# Check the first few rows to understand column order
print("First 5 rows:")
print(df.head())

# Assign column names manually based on inspection
# Based on public dataset info, it's usually [sentiment, text]
df.columns = ['label', 'text']

# Keep only rows whose (whitespace-stripped) label is one of the three known classes
valid_labels = {'positive', 'neutral', 'negative'}
df['label'] = df['label'].str.strip()
df = df[df['label'].isin(valid_labels)].reset_index(drop=True)

# Split data. `stratify` keeps the class proportions identical in both splits, so the
# evaluation set is not skewed by an unlucky draw of a rare class.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Optional: print shapes
print("\nTrain shape:", train_df.shape)
print("Test shape:", test_df.shape)

model_name = "yiyanghkust/finbert-tone"  # FinBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert pandas DataFrames to Dataset objects.
# After the shuffle the frames carry a non-trivial index; `preserve_index=False` stops
# it from being stored as an extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Define your label mappings correctly (only the valid ones)
# NOTE(review): this order overrides whatever mapping the checkpoint ships with; the
# finbert-tone model card appears to list neutral/positive/negative — confirm whether
# aligning with it is desirable (downstream code indexes 0 = positive, 2 = negative).
labels = ['positive', 'neutral', 'negative']

# Create mappings
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

print("label2id:", label2id)
print("id2label:", id2label)

def tokenize_function(examples):
    """Tokenize one example and attach its integer class id.

    Written for ``Dataset.map(..., batched=False)``: ``examples`` is a single row
    that exposes a string ``"label"`` and a ``"text"`` field.

    Returns:
        The tokenizer output for ``examples["text"]`` (padded/truncated to 512
        tokens) with an additional ``"labels"`` entry equal to
        ``label2id[examples["label"]]``.

    Raises:
        KeyError: if the row's label is not a key of ``label2id``.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    # Hand the new column back through the return value instead of mutating the
    # incoming row: the returned mapping is what `map` is documented to merge.
    encoded["labels"] = label2id[examples["label"]]
    return encoded

# Tokenize each split one example at a time, then drop the string 'label' column;
# the integer 'labels' column written by tokenize_function takes its place.
tokenized_train = train_dataset.map(tokenize_function, batched=False).remove_columns(["label"])
tokenized_test = test_dataset.map(tokenize_function, batched=False).remove_columns(["label"])

# Classification head sized from the label list, with the notebook's own id <-> name mapping
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(labels), id2label=id2label, label2id=label2id
)

# Evaluate and checkpoint once per epoch; reload the best checkpoint when training ends
training_args = TrainingArguments(
    output_dir="finbert-finetuned",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)

# Fine-tune, then persist model and tokenizer side by side for the inference cell
trainer.train()
model.save_pretrained("finbert-finetuned-final")
tokenizer.save_pretrained("finbert-finetuned-final")


from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Make predictions on the held-out split
predictions = trainer.predict(tokenized_test)
preds = predictions.predictions.argmax(-1)
# Take the ground truth from the prediction output itself: it is a NumPy array aligned
# row-for-row with `preds`, independent of how the dataset exposes its columns.
true_labels = predictions.label_ids

# Class names and ids as plain lists, in the same (id) order
class_names = list(label2id.keys())
all_labels = list(label2id.values())

# Print report
print(classification_report(true_labels, preds, target_names=class_names, labels=all_labels, zero_division=0))

# Plot confusion matrix
cm = confusion_matrix(true_labels, preds, labels=all_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.title("FinBERT Confusion Matrix")
plt.show()


In [None]:
import requests
import pandas as pd
import torch
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Load the fine-tuned checkpoint saved by the training cell and switch it to
#    inference mode (`eval()` returns the module itself, so it can be chained).
model_path = "finbert-finetuned-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

# 2. Fetch news from FMP with proper error handling
def fetch_news(symbol, api_key, days=3730, max_pages=50, timeout=30):
    """Collect news articles for ``symbol`` published within the last ``days`` days.

    Pages of the FMP ``stock_news`` endpoint are requested one after another until
    one of the following happens: an empty page is returned, a page contains an
    article older than the cutoff, ``max_pages`` pages have been read, or a
    request fails.

    Args:
        symbol: Ticker passed to the API.
        api_key: FMP API key.
        days: Size of the look-back window, counted back from the current UTC time.
        max_pages: Upper bound on the number of pages requested (pages 0..max_pages-1).
        timeout: Per-request timeout in seconds, so a stalled connection cannot hang the cell.

    Returns:
        A list with the raw article dicts that carry a parseable ``publishedDate``
        inside the window. Request errors are printed, not raised; whatever was
        collected up to that point is returned.
    """
    from datetime import timezone  # local import: the cell only imports `datetime` and `timedelta`

    # Naive UTC "now", comparable with the naive timestamps parsed below
    # (replaces the deprecated `datetime.utcnow()`).
    cutoff = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=days)
    endpoint = "https://financialmodelingprep.com/api/v3/stock_news"
    all_articles = []
    page = 0

    while page < max_pages:
        try:
            response = requests.get(
                endpoint,
                params={"symbol": symbol, "page": page, "apikey": api_key},
                timeout=timeout,
            )
            response.raise_for_status()
            news = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON
            print(f"Error fetching page {page}: {e}")
            break

        if not news:
            break  # No more articles

        if not isinstance(news, list):
            # e.g. an error object instead of a list of articles
            print(f"Unexpected response on page {page}: {news}")
            break

        reached_cutoff = False
        for article in news:
            try:
                article_date = datetime.strptime(article['publishedDate'], "%Y-%m-%d %H:%M:%S")
            except (KeyError, TypeError, ValueError):
                # Malformed entry: skip it, but do NOT treat it as "cutoff reached"
                continue
            if article_date < cutoff:
                reached_cutoff = True
                continue  # Skip articles older than cutoff
            all_articles.append(article)

        # Stop paging only when this page really contained an article beyond the cutoff
        if reached_cutoff:
            break

        page += 1
    else:
        # Loop ended because the page budget was used up, not because of a break
        print("Reached maximum page limit")

    print(f"Found {len(all_articles)} articles within {days} days")
    return all_articles

# 3. Analyze sentiment with YOUR model
def analyze_sentiment(articles):
    """Score each article with the loaded classifier.

    The score is ``P(positive) - P(negative)`` taken from the softmax over the
    model's logits, so it lies in [-1, 1].

    Args:
        articles: Iterable of article dicts. ``content`` is used as the text when
            it is present and non-empty, otherwise ``title``. ``publishedDate``
            must be formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns:
        A DataFrame with one row per scored article and the columns ``date``
        (a ``datetime.date``), ``sentiment``, ``title`` and ``source``. Articles
        without text are skipped; articles that raise during processing are
        reported with ``print`` and skipped. The frame is empty when nothing
        could be scored.
    """
    # Resolve the class positions from the model's own config when it provides them,
    # so the score does not silently flip if the label order ever changes.
    # Falls back to the order used by the training cell (positive=0, negative=2).
    label_map = getattr(getattr(model, "config", None), "label2id", None) or {}
    label_index = {str(name).lower(): idx for name, idx in label_map.items()}
    pos_idx = label_index.get("positive", 0)
    neg_idx = label_index.get("negative", 2)

    results = []
    for art in articles:
        try:
            # Use title if content is missing *or empty/None*.
            # NOTE(review): the FMP payload may expose the body under a different key
            # (e.g. `text`) — confirm whether `content` is ever present.
            text = art.get('content') or art.get('title', '')
            if not text:
                continue

            inputs = tokenizer(text,
                              return_tensors="pt",
                              truncation=True,
                              max_length=512)
            with torch.no_grad():
                logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=1)[0]

            results.append({
                'date': datetime.strptime(art['publishedDate'], "%Y-%m-%d %H:%M:%S").date(),
                'sentiment': probs[pos_idx].item() - probs[neg_idx].item(),  # positive - negative
                'title': art['title'],
                'source': art.get('site', 'Unknown')
            })
        except Exception as e:
            # Deliberate best-effort: one bad article must not abort the whole batch
            print(f"Error processing article '{art.get('title', '')}': {e}")

    return pd.DataFrame(results)

# 4. Fetch stock prices with error handling
def fetch_prices(symbol, api_key, timeout=30):
    """Download the historical closing prices for ``symbol`` from FMP.

    Args:
        symbol: Ticker inserted into the endpoint path.
        api_key: FMP API key.
        timeout: Per-request timeout in seconds, so a stalled connection cannot hang the cell.

    Returns:
        A DataFrame with the columns ``date`` (datetime64) and ``close``, in the
        row order delivered by the API. An empty DataFrame is returned (and a
        message printed) when the request fails, the body is not valid JSON, or
        the payload carries no usable ``historical`` records.
    """
    url = f"https://financialmodelingprep.com/api/v3/historical-price-full/{symbol}"
    try:
        response = requests.get(url, params={"apikey": api_key}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON
        print(f"Failed to fetch prices: {e}")
        return pd.DataFrame()

    # Check if response contains historical data
    if not isinstance(data, dict) or 'historical' not in data:
        print(f"No price data found: {data}")
        return pd.DataFrame()

    prices = pd.DataFrame(data['historical'])
    # An empty `historical` list (or records without these fields) yields a frame
    # without the columns selected below; report it instead of raising KeyError.
    if not {'date', 'close'}.issubset(prices.columns):
        print(f"No price data found: {data}")
        return pd.DataFrame()

    prices['date'] = pd.to_datetime(prices['date'])
    return prices[['date', 'close']]

# 5. Main workflow
import os  # imported here because this cell's import list does not include `os`

# Prefer a key supplied through the FMP_API_KEY environment variable so the secret never
# has to be written into the notebook; the original placeholder remains the fallback.
API_KEY = os.environ.get("FMP_API_KEY", "API_KEY")  # Replace with your actual API key
SYMBOL = "AAPL"

# Fetch and process data
print("Fetching news...")
news = fetch_news(SYMBOL, API_KEY, days=3507)
print(f"Found {len(news)} articles")

print("Analyzing sentiment...")
sentiment_df = analyze_sentiment(news)
print(f"Processed {len(sentiment_df)} articles")

print("Fetching stock prices...")
prices_df = fetch_prices(SYMBOL, API_KEY)
print(f"Found {len(prices_df)} price records")

if sentiment_df.empty or prices_df.empty:
    print("Insufficient data for visualization")
else:
    # Aggregate daily sentiment (mean score per calendar date)
    daily_sentiment = sentiment_df.groupby('date')['sentiment'].mean().reset_index()
    daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])

    # Merge with prices; price rows without news keep NaN sentiment for now
    merged_df = pd.merge(
        prices_df,
        daily_sentiment,
        on='date',
        how='left'
    )

    # Filter to the period where we have sentiment data
    sentiment_start = daily_sentiment['date'].min()
    sentiment_end = daily_sentiment['date'].max()
    chart_start = sentiment_start - timedelta(days=30)
    chart_end = sentiment_end + timedelta(days=5)

    in_window = (merged_df['date'] >= chart_start) & (merged_df['date'] <= chart_end)
    # `.copy()` gives an independent frame, so the column assignments below are not
    # chained writes into a slice of `merged_df`. Sorting by date makes the rolling
    # mean a trailing window over past days regardless of the order the API used.
    filtered_df = merged_df.loc[in_window].sort_values('date').copy()

    # Fill missing sentiment with 0 (neutral)
    filtered_df['sentiment'] = filtered_df['sentiment'].fillna(0)

    # Add rolling average for sentiment
    filtered_df['sentiment_ma'] = filtered_df['sentiment'].rolling(7, min_periods=1).mean()

    # 6. Visualization - single plot with dual axes
    fig, ax1 = plt.subplots(figsize=(14, 7))

    # Plot stock prices
    ax1.plot(filtered_df['date'], filtered_df['close'], 'b-', linewidth=2, label='Stock Price')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Stock Price', color='b')
    ax1.tick_params('y', colors='b')
    ax1.grid(True, linestyle='--', alpha=0.7)
    ax1.set_title(f'{SYMBOL} Stock Price vs. News Sentiment')

    # Create second axis for sentiment
    ax2 = ax1.twinx()

    # Plot sentiment rolling average
    ax2.plot(filtered_df['date'], filtered_df['sentiment_ma'],
            'r-', linewidth=2,
            label='7-day Sentiment Avg')
    ax2.set_ylabel('Sentiment Score (7-day MA)', color='r')
    ax2.tick_params('y', colors='r')

    # Add horizontal line at 0 for neutral sentiment
    ax2.axhline(0, color='gray', linestyle='--', linewidth=0.8, label='Neutral')

    # Add markers for the actual sentiment period
    ax2.axvline(sentiment_start, color='g', linestyle=':', alpha=0.7, label='Sentiment Start')
    ax2.axvline(sentiment_end, color='r', linestyle=':', alpha=0.7, label='Sentiment End')

    # Combine legends from both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

    plt.tight_layout()
    plt.show()

    # Show sample of the most positive/negative news
    print("\nTop 3 Positive News:")
    print(sentiment_df.nlargest(3, 'sentiment')[['date', 'title', 'sentiment']])

    print("\nTop 3 Negative News:")
    print(sentiment_df.nsmallest(3, 'sentiment')[['date', 'title', 'sentiment']])

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datetime import timedelta
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# --- Input validation ---
# Both frames come from the previous cell; fail fast with a clear message if either
# is empty or lacks a column used below.
required_price_cols = {'date', 'close'}
required_sentiment_cols = {'date', 'sentiment'}

if any(frame.empty for frame in (prices_df, sentiment_df)):
    raise ValueError("Error: One or both of the input DataFrames (prices_df, sentiment_df) are empty.")

if required_price_cols - set(prices_df.columns):
    raise ValueError(f"Error: prices_df is missing required columns: {required_price_cols - set(prices_df.columns)}")

if required_sentiment_cols - set(sentiment_df.columns):
    raise ValueError(f"Error: sentiment_df is missing required columns: {required_sentiment_cols - set(sentiment_df.columns)}")

# --- Normalise the date columns to datetime64 so the merge keys are comparable ---
prices_df['date'] = pd.to_datetime(prices_df['date'])
sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])

print("Latest price date:", prices_df['date'].max().date())
print("Latest sentiment date:", sentiment_df['date'].max().date())

# --- Daily mean sentiment, left-joined onto prices; days without news count as 0 ---
sentiment_daily = sentiment_df.groupby('date')['sentiment'].mean().reset_index()
merged_df = prices_df.merge(sentiment_daily, on='date', how='left')
merged_df['sentiment'] = merged_df['sentiment'].fillna(0)

if merged_df.empty:
    raise ValueError("Error: Merged DataFrame is empty.")
if not merged_df['close'].notna().any():
    raise ValueError("Error: All close prices are NaN after merge.")

# --- Compute indicators ---
def compute_rsi(series, period=14):
    """Relative Strength Index based on simple rolling means.

    Args:
        series: pandas Series of values, evaluated in row order.
        period: Window length used for the rolling averages.

    Returns:
        A Series aligned with ``series``. Positions with fewer than ``period``
        rows of history are NaN.
    """
    change = series.diff()
    # Everything that is not a rise (including the leading NaN) counts as 0 gain;
    # everything that is not a drop counts as 0 loss.
    ups = change.where(change > 0, 0)
    downs = change.where(change < 0, 0)
    avg_gain = ups.rolling(period).mean()
    avg_loss = -(downs.rolling(period).mean())  # flip sign: losses as positive magnitudes
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def prepare_df(df):
    """Return a copy of ``df`` extended with the indicator columns.

    Added columns: ``sentiment_ma`` (rolling mean of ``sentiment`` over up to 7
    rows), ``returns`` (row-over-row percentage change of ``close``), ``sma_5``
    (5-row mean of ``close``) and ``rsi`` (``compute_rsi`` of ``close``).
    All of them are computed in the row order of ``df``. Remaining gaps are
    forward-filled and anything still missing is set to 0. The input frame is
    not modified.
    """
    out = df.copy()
    close = out['close']
    out['sentiment_ma'] = out['sentiment'].rolling(7, min_periods=1).mean()
    out['returns'] = close.pct_change()
    out['sma_5'] = close.rolling(5).mean()
    out['rsi'] = compute_rsi(close)
    # Forward-fill first, then zero the leading rows that have nothing to fill from
    return out.ffill().fillna(0)

# The indicators computed by prepare_df (rolling means, pct_change, RSI) depend on row
# order, so the frame must be chronological *before* they are computed — sorting
# afterwards would leave them calculated against whatever order the price API used.
df = prepare_df(merged_df.sort_values('date').reset_index(drop=True))

print("Last date in df before forecast:", df['date'].iloc[-1].date())

# --- Generate features ---
def make_features(df, lookback=5):
    """Turn the indicator frame into sliding-window samples.

    The five feature columns are scaled with a freshly fitted ``MinMaxScaler``;
    each sample is the ``lookback`` scaled rows preceding a target row and its
    target is that row's scaled ``close`` (feature column 0).

    Returns:
        ``(X, y, scaler, dates)`` — ``X`` as an array of windows, ``y`` as the
        array of targets, the fitted scaler, and a Series with the ``date`` of
        each target row.
    """
    features = ['close', 'sentiment_ma', 'returns', 'sma_5', 'rsi']
    scaler = MinMaxScaler()
    data = scaler.fit_transform(df[features])

    frame = df.reset_index(drop=True)
    target_rows = range(lookback, len(frame))

    X = [data[i - lookback:i] for i in target_rows]
    y = [data[i, 0] for i in target_rows]
    date_out = [frame['date'].iloc[i] for i in target_rows]

    return np.array(X), np.array(y), scaler, pd.Series(date_out).reset_index(drop=True)

# Build the supervised windows; stop here if the frame is too short to yield any.
X, y, scaler, dates = make_features(df)

if not (len(X) and len(y)):
    raise ValueError("Error: Not enough data after lookback window.")

# --- LSTM model ---
class StockLSTM(nn.Module):
    """Single-layer LSTM followed by a linear head that emits one value per sequence."""

    def __init__(self, input_size, hidden=50):
        """Create the recurrent layer (``hidden`` units, batch-first) and the 1-unit head."""
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden, batch_first=True)
        self.fc = nn.Linear(in_features=hidden, out_features=1)

    def forward(self, x):
        """Run ``x`` through the LSTM and project the output of its last time step."""
        sequence_out, _ = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# --- Training ---
def train_model(X, y):
    """Fit a ``StockLSTM`` on the first 80% of the samples and predict the rest.

    Training runs 50 full-batch Adam steps (lr 1e-3) on the MSE loss and prints
    the loss every 10th epoch. The split is positional: no shuffling.

    Args:
        X: Array-like of windows; the last axis is the feature axis.
        y: Array-like of targets, one per window.

    Returns:
        ``(model, test_preds, test_targets, X_test)`` — the trained model (left in
        eval mode), its predictions on the held-out tail as a NumPy array of
        shape ``(n_test, 1)``, the matching targets as NumPy, and the held-out
        inputs as a tensor.
    """
    X = torch.tensor(X).float()
    y = torch.tensor(y).float().view(-1, 1)
    tr_len = int(0.8 * len(X))
    X_tr, y_tr = X[:tr_len], y[:tr_len]
    X_te, y_te = X[tr_len:], y[tr_len:]

    model = StockLSTM(X.shape[2])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    model.train()
    for e in range(50):
        optimizer.zero_grad()
        loss = loss_fn(model(X_tr), y_tr)
        loss.backward()
        optimizer.step()
        if e % 10 == 0:
            print(f"Epoch {e}, Loss: {loss.item():.4f}")

    # Inference: switch to eval mode and skip autograd bookkeeping. The model is
    # returned in eval mode because callers only use it for prediction afterwards.
    model.eval()
    with torch.no_grad():
        test_preds = model(X_te).numpy()
    return model, test_preds, y_te.numpy(), X_te

# Train on the windows built above and keep the held-out predictions/targets for plotting.
# NOTE: this rebinds the module-level names `model` and `preds`, which earlier cells
# bound to the sentiment classifier and its predictions; from here on they refer to
# the price LSTM and its test-set output.
model, preds, actual, X_test = train_model(X, y)

# --- Forecasting ---
def forecast_days(model, seq, days, n_feat, scaler):
    """Roll the model forward ``days`` steps starting from the window ``seq``.

    Each step predicts the next scaled close (feature column 0), converts it
    back to price units with ``scaler.inverse_transform`` and slides the window
    by one row.

    Args:
        model: Callable mapping a ``(1, lookback, n_feat)`` float tensor to a single value.
        seq: Array-like ``(lookback, n_feat)`` window in scaled units; not modified.
        days: Number of steps to forecast.
        n_feat: Number of feature columns the scaler was fitted on.
        scaler: Fitted scaler providing ``inverse_transform``.

    Returns:
        List with ``days`` forecasts in the original price units.
    """
    window = np.array(seq, dtype=float)  # private copy; the caller's array stays untouched
    preds = []
    for _ in range(days):
        with torch.no_grad():
            val = model(torch.tensor(window).float().unsqueeze(0)).item()

        # The scaler expects all n_feat columns; only column 0 is read back
        padded = np.zeros((1, n_feat))
        padded[0, 0] = val
        preds.append(scaler.inverse_transform(padded)[0, 0])

        # Future values of the other features are unknown. Carry the last observed
        # ones forward instead of feeding zeros (the scaled minimum of every
        # feature) back into the window, which would distort every later step.
        next_row = window[-1].copy()
        next_row[0] = val
        window = np.vstack([window[1:], next_row])
    return preds

# X[-1] stops one row *before* the last observation (that row is its target, y[-1]),
# so forecasting from it would re-predict the last known close and label it as tomorrow.
# Build the window from the final `lookback` rows instead, scaled with the same scaler
# and the same feature columns it was fitted on.
lookback = X.shape[1]
latest_window = scaler.transform(df[list(scaler.feature_names_in_)].tail(lookback))

forecast = forecast_days(model, latest_window, 7, X.shape[2], scaler)
forecast_start = df['date'].iloc[-1] + timedelta(days=1)

print("Forecasting starts from:", forecast_start.date())
print("\nForecast for Next 7 Days:")
# NOTE(review): steps are labelled with consecutive calendar days; if the price rows
# are trading days only, the labels will include weekends — confirm the intent.
for i, val in enumerate(forecast):
    print(f"{(forecast_start + timedelta(days=i)).date()}: ${val:.2f}")

# --- Plotting ---
# SYMBOL is from previous cell
# The scaler works on all feature columns at once, so pad the single price column
# with zeros, invert, and keep column 0 only.
n_pad = X.shape[2] - 1
actual_padded = np.hstack([actual.reshape(-1,1), np.zeros((len(actual), n_pad))])
preds_padded = np.hstack([preds, np.zeros((len(preds), n_pad))])
a_inv = scaler.inverse_transform(actual_padded)[:,0]
p_inv = scaler.inverse_transform(preds_padded)[:,0]

plt.figure(figsize=(12,6))
# The held-out samples are the tail of the series, so align them with the last dates
plt.plot(dates[-len(a_inv):], a_inv, label="Actual")
plt.plot(dates[-len(p_inv):], p_inv, label="Predicted")
plt.title(f"{SYMBOL} - LSTM Forecast (Latest data: {df['date'].iloc[-1].date()})")
plt.legend()
plt.grid(True)
plt.xlabel("Date")
plt.ylabel("Price")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [3]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for a ``Trainer`` evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` along the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # The fourth element returned here (support) is not reported
    weighted = precision_recall_fscore_support(labels, predicted, average='weighted')
    metrics = {"accuracy": accuracy_score(labels, predicted)}
    metrics.update(zip(("precision", "recall", "f1"), weighted[:3]))
    return metrics


In [None]:


import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Download dataset (returns the local folder that holds the files)
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
print("Path to dataset files:", path)

# Define file path
file_path = os.path.join(path, "all-data.csv")

# Read CSV with no header and proper encoding
df = pd.read_csv(file_path, engine='python', sep=',', header=None, encoding='latin-1')

# Check the first few rows to understand column order
print("First 5 rows:")
print(df.head())

# Assign column names manually based on inspection
# Based on public dataset info, it's usually [sentiment, text]
df.columns = ['label', 'text']

# Keep only rows whose (whitespace-stripped) label is one of the three known classes
valid_labels = {'positive', 'neutral', 'negative'}
df['label'] = df['label'].str.strip()
df = df[df['label'].isin(valid_labels)].reset_index(drop=True)

# Split data. `stratify` keeps the class proportions identical in both splits, so the
# evaluation set is not skewed by an unlucky draw of a rare class.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Optional: print shapes
print("\nTrain shape:", train_df.shape)
print("Test shape:", test_df.shape)

model_name = "yiyanghkust/finbert-tone"  # FinBERT
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert pandas DataFrames to Dataset objects.
# After the shuffle the frames carry a non-trivial index; `preserve_index=False` stops
# it from being stored as an extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Define your label mappings correctly (only the valid ones)
# NOTE(review): this order overrides whatever mapping the checkpoint ships with; the
# finbert-tone model card appears to list neutral/positive/negative — confirm whether
# aligning with it is desirable (downstream code indexes 0 = positive, 2 = negative).
labels = ['positive', 'neutral', 'negative']

# Create mappings
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

print("label2id:", label2id)
print("id2label:", id2label)

def tokenize_function(examples):
    """Tokenize one example and attach its integer class id.

    Written for ``Dataset.map(..., batched=False)``: ``examples`` is a single row
    that exposes a string ``"label"`` and a ``"text"`` field.

    Returns:
        The tokenizer output for ``examples["text"]`` (padded/truncated to 512
        tokens) with an additional ``"labels"`` entry equal to
        ``label2id[examples["label"]]``.

    Raises:
        KeyError: if the row's label is not a key of ``label2id``.
    """
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    # Hand the new column back through the return value instead of mutating the
    # incoming row: the returned mapping is what `map` is documented to merge.
    encoded["labels"] = label2id[examples["label"]]
    return encoded

# Tokenize each split one example at a time, then drop the string 'label' column;
# the integer 'labels' column written by tokenize_function takes its place.
tokenized_train = train_dataset.map(tokenize_function, batched=False).remove_columns(["label"])
tokenized_test = test_dataset.map(tokenize_function, batched=False).remove_columns(["label"])

# Classification head sized from the label list, with the notebook's own id <-> name mapping
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(labels), id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="finbert-finetuned",
    seed=42,
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    optim="adamw_torch",
    lr_scheduler_type="linear",
    warmup_steps=500,
    per_device_train_batch_size=18,
    per_device_eval_batch_size=18,
    # evaluation / checkpointing: once per epoch, keep two checkpoints,
    # reload the one with the lowest eval loss at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # logging
    logging_dir="logs",
    logging_strategy="epoch",
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # accuracy / precision / recall / F1 at each evaluation
)

# Fine-tune, then persist model and tokenizer side by side for the inference cell
trainer.train()
model.save_pretrained("finbert-finetuned-final")
tokenizer.save_pretrained("finbert-finetuned-final")





Downloading from https://www.kaggle.com/api/v1/datasets/download/ankurzing/sentiment-analysis-for-financial-news?dataset_version_number=5...


100%|██████████| 903k/903k [00:00<00:00, 3.37MB/s]

Extracting files...





Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\ankurzing\sentiment-analysis-for-financial-news\versions\5
First 5 rows:
          0                                                  1
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...

Train shape: (3876, 2)
Test shape: (970, 2)


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

label2id: {'positive': 0, 'neutral': 1, 'negative': 2}
id2label: {0: 'positive', 1: 'neutral', 2: 'negative'}


Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]