In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
df_combined = pd.read_csv("df_combined_week4.csv")

# ---- Normalize date column ----
if "date" not in df_combined.columns:
    raise ValueError("❌ Week-4 must contain a 'date' column")

df_combined["date"] = pd.to_datetime(df_combined["date"], errors="coerce")

df_combined = df_combined.sort_values("date").reset_index(drop=True)

print("✅ df_combined loaded")
print("Columns:", df_combined.columns.tolist())

# =========================
# VADER (IF MISSING)
# =========================

if "vader_sentiment" not in df_combined.columns:
    vader = SentimentIntensityAnalyzer()
    df_combined["vader_sentiment"] = df_combined["headline"].fillna("").apply(
        lambda x: vader.polarity_scores(x)["compound"]
    )

if "signal_vader" not in df_combined.columns:
    df_combined["signal_vader"] = np.where(
        df_combined["vader_sentiment"] > 0.05, 1,
        np.where(df_combined["vader_sentiment"] < -0.05, -1, 0)
    )

# =========================
# LOGISTIC REGRESSION (IF MISSING)
# =========================

if "lr_sentiment" not in df_combined.columns:
    tfidf = TfidfVectorizer(max_features=3000)
    X = tfidf.fit_transform(df_combined["headline"].fillna(""))

    y_proxy = (df_combined["vader_sentiment"] > 0).astype(int)

    lr = LogisticRegression(max_iter=1000)
    lr.fit(X, y_proxy)

    df_combined["lr_sentiment"] = lr.predict_proba(X)[:, 1]

if "signal_lr" not in df_combined.columns:
    df_combined["signal_lr"] = np.where(
        df_combined["lr_sentiment"] > 0.55, 1,
        np.where(df_combined["lr_sentiment"] < 0.45, -1, 0)
    )

# =========================
# FINBERT (IF MISSING)
# =========================

if "finbert_sentiment" not in df_combined.columns:
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
    model.eval()

    def finbert_score(text):
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        return probs[0][2].item()

    df_combined["finbert_sentiment"] = df_combined["headline"].fillna("").apply(finbert_score)

if "signal_finbert" not in df_combined.columns:
    df_combined["signal_finbert"] = np.where(
        df_combined["finbert_sentiment"] > 0.6, 1,
        np.where(df_combined["finbert_sentiment"] < 0.4, -1, 0)
    )

# =========================
# FINAL VALIDATION
# =========================

required_cols = {
    "date", "Close",
    "vader_sentiment", "signal_vader",
    "lr_sentiment", "signal_lr",
    "finbert_sentiment", "signal_finbert"
}

missing = required_cols - set(df_combined.columns)
if missing:
    raise ValueError(f"❌ Still missing columns: {missing}")

print("✅ WEEK-5 READY — ALL FEATURES PRESENT")
print("Final shape:", df_combined.shape)


✅ df_combined loaded
Columns: ['date', 'Close', 'headline']


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [13]:
class BacktestEngine:
    def __init__(self, initial_capital=100000, transaction_cost=0.001):
        self.initial_capital = initial_capital
        self.transaction_cost = transaction_cost
        self.cash = initial_capital
        self.shares = 0
        self.trades = []

    def process_signal(self, date, signal, price, sentiment):
        if signal == 1 and self.cash > 0:
            cost = self.cash * self.transaction_cost
            invest = self.cash - cost
            self.shares = invest / price
            self.cash = 0
            self.trades.append((date, "BUY", price, sentiment))

        elif signal == -1 and self.shares > 0:
            proceeds = self.shares * price
            cost = proceeds * self.transaction_cost
            self.cash = proceeds - cost
            self.shares = 0
            self.trades.append((date, "SELL", price, sentiment))

    def run_backtest(self, df, signal_col, sentiment_col):
        records = []

        for _, row in df.iterrows():
            self.process_signal(
                row["date"],
                row[signal_col],
                row["Close"],
                row[sentiment_col]
            )
            value = self.cash + self.shares * row["Close"]
            records.append((row["date"], value))

        return pd.DataFrame(records, columns=["date", "portfolio_value"])

    def calculate_metrics(self, df):
        returns = df["portfolio_value"].pct_change().dropna()
        total_return = (df.iloc[-1, 1] / self.initial_capital - 1) * 100
        sharpe = (returns.mean() / returns.std()) * np.sqrt(252)
        drawdown = (df["portfolio_value"] / df["portfolio_value"].cummax() - 1).min() * 100

        return {
            "total_return": total_return,
            "sharpe_ratio": sharpe,
            "max_drawdown": drawdown,
            "num_trades": len(self.trades)
        }


In [14]:
bt_vader = BacktestEngine()
res_vader = bt_vader.run_backtest(df_combined, "signal_vader", "vader_sentiment")
metrics_vader = bt_vader.calculate_metrics(res_vader)

bt_lr = BacktestEngine()
res_lr = bt_lr.run_backtest(df_combined, "signal_lr", "lr_sentiment")
metrics_lr = bt_lr.calculate_metrics(res_lr)

bt_finbert = BacktestEngine()
res_finbert = bt_finbert.run_backtest(df_combined, "signal_finbert", "finbert_sentiment")
metrics_finbert = bt_finbert.calculate_metrics(res_finbert)

print("✅ ALL STRATEGIES BACKTESTED SUCCESSFULLY")


IndexError: index -1 is out of bounds for axis 0 with size 0

In [None]:
comparison = pd.DataFrame({
    "VADER": metrics_vader,
    "Logistic Regression": metrics_lr,
    "FinBERT": metrics_finbert
}).T

print("\nSTRATEGY COMPARISON")
print("="*60)
print(comparison.round(2))
