In [5]:
import yfinance as yf
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy.stats import pearsonr


TICKER = "AAPL"
START_DATE = "2024-01-01"
END_DATE = "2024-06-30"


prices_raw = yf.download(
    TICKER,
    start=START_DATE,
    end=END_DATE,
    progress=False
)

prices_raw.columns = [c[0] if isinstance(c, tuple) else c for c in prices_raw.columns]

prices = prices_raw.reset_index()[["Date", "Close"]]
prices["date"] = pd.to_datetime(prices["Date"]).dt.date
prices = prices.drop(columns=["Date"])


news = pd.read_csv("financial_news_events.csv")
news.columns = news.columns.str.lower()

date_col = next(
    c for c in ["date", "published_at", "timestamp", "time"]
    if c in news.columns
)

news["date"] = pd.to_datetime(news[date_col], errors="coerce").dt.date
news = news.dropna(subset=["date"])

headline_col = next(
    c for c in ["headline", "title", "text"]
    if c in news.columns
)

news["headline"] = news[headline_col].astype(str)

if "ticker" in news.columns:
    news = news[news["ticker"] == TICKER]


news_daily = (
    news.groupby("date", as_index=False)["headline"]
    .apply(lambda x: " ".join(x))
)


df = prices.merge(news_daily, on="date", how="left")
df["headline"] = df["headline"].fillna("")


assert isinstance(df.columns, pd.Index)
assert df["Close"].isna().sum() == 0

print(" TASK-1 SUCCESS")
print("Rows:", len(df))
print(df.head())



vader = SentimentIntensityAnalyzer()
df["vader_sentiment"] = df["headline"].apply(
    lambda x: vader.polarity_scores(x)["compound"]
)

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

def finbert_score(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)[0]
    return (probs[2] - probs[0]).item()

df["finbert_sentiment"] = df["headline"].apply(finbert_score)

print(" TASK 2 COMPLETE")


df["daily_return"] = df["Close"].pct_change()
df["next_day_return"] = df["daily_return"].shift(-1)
df = df.dropna()

r_vader, p_vader = pearsonr(df["vader_sentiment"], df["next_day_return"])
r_finbert, p_finbert = pearsonr(df["finbert_sentiment"], df["next_day_return"])

print("\nSENTIMENT–PRICE CORRELATION")
print("=" * 50)
print(f"VADER   : r={r_vader:.3f}, p-value={p_vader:.4f}")
print(f"FinBERT : r={r_finbert:.3f}, p-value={p_finbert:.4f}")

def signal(score, threshold=0.3):
    if score > threshold:
        return 1
    elif score < -threshold:
        return -1
    return 0

df["signal_vader"] = df["vader_sentiment"].apply(signal)
df["signal_finbert"] = df["finbert_sentiment"].apply(signal)

print("\nTRADING SIGNALS")
print("=" * 50)
print("VADER Buy:", (df["signal_vader"] == 1).sum())
print("VADER Sell:", (df["signal_vader"] == -1).sum())
print("FinBERT Buy:", (df["signal_finbert"] == 1).sum())
print("FinBERT Sell:", (df["signal_finbert"] == -1).sum())

print("\n WEEK 4 PIPELINE FINISHED")

df_combined = pd.merge(
    prices[["date", "Close"]],
    news_daily,
    on="date",
    how="inner"
)


 TASK-1 SUCCESS
Rows: 124
        Close        date headline
0  183.903229  2024-01-02         
1  182.526230  2024-01-03         
2  180.208130  2024-01-04         
3  179.484940  2024-01-05         
4  183.823959  2024-01-08         
 TASK 2 COMPLETE

SENTIMENT–PRICE CORRELATION
VADER   : r=nan, p-value=nan
FinBERT : r=nan, p-value=nan

TRADING SIGNALS
VADER Buy: 0
VADER Sell: 0
FinBERT Buy: 0
FinBERT Sell: 0

 WEEK 4 PIPELINE FINISHED


  r_vader, p_vader = pearsonr(df["vader_sentiment"], df["next_day_return"])
  r_finbert, p_finbert = pearsonr(df["finbert_sentiment"], df["next_day_return"])


In [6]:
df_combined.to_csv("df_combined_week4.csv", index=False)
print("✅ Week-4 output saved correctly")


✅ Week-4 output saved correctly
