In [None]:
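# Optional setup: skip this cell if your environment already has these packages.
# Otherwise uncomment the lines below (%pip installs into this kernel's environment),
# then restart the kernel before running the rest of the notebook.
# %pip install -U transformers datasets scikit-learn accelerate textblob matplotlib pandas numpy
# %pip install -U ipywidgets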

In [1]:
import os, json, math, random, hashlib, pickle
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, Tuple, List

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support, mean_absolute_error
import matplotlib.pyplot as plt

# our wrapper (inference helpers + TextBlob baseline)
from agenticwrapper import (
    LocalAgenticNewsAI,
    load_agent, score_batch_with_agent,
    normalize_articles_df, to_finbert_like_columns
)

# ---------- project paths ----------
# All locations hang off one project root, relative to the notebook's working directory.
NLP_DIR = Path("NLP")
ART_DIR = NLP_DIR.joinpath("articles")                   # fetch_articles.py writes here
OUT_DIR = NLP_DIR.joinpath("sentiment_scores_agentic")   # new outputs here
MODEL_DIR = NLP_DIR.joinpath("models")                   # where checkpoints live
CACHE_DIR = NLP_DIR.joinpath("cache")                    # notebook-level caches
DATA_DIR = NLP_DIR.joinpath("data")                      # labeled train/valid CSVs (0=neg,1=neu,2=pos)

# Create the directories this notebook writes to (ART_DIR is produced by another script).
for _required_dir in (OUT_DIR, CACHE_DIR, MODEL_DIR, DATA_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

# ---------- experiment knobs ----------
TICKERS = ["AAPL"]   # change / extend as needed
ENGINE = "agent"     # "textblob" or "agent"
BASE_MODEL = str(MODEL_DIR / "finbert-finetuned-final")       # or a HF name like "ProsusAI/finbert"
TRAIN_CSV = str(DATA_DIR / "train.csv")                       # must have columns: text, label (0/1/2)
VALID_CSV = str(DATA_DIR / "valid.csv")
CHECKPOINT_OUT = str(MODEL_DIR / "trained_sentiment_agent")   # supervised output
EVO_ROOT = str(MODEL_DIR / "evolved_sentiment_agent")         # evolutionary runs

# scoring params
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
MAX_LENGTH = 256
BATCH_SIZE = 32

# switches (set True only when you want to run training/tuning)
RUN_TRAIN = False    # supervised fine-tune
RUN_EVO = False      # evolutionary hyperparam search
RUN_BANDIT = False   # threshold tuning on validation set

# label maps: class id -> name, and the inverse name -> class id
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: class_id for class_id, name in id2label.items()}

def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch's CPU
    generator; when CUDA is available, all CUDA devices are seeded too.

    Args:
        seed: Integer seed applied to every generator.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Fix all RNG seeds once at startup, then report which device scoring will use.
seed_everything(seed=42)
print("device:", DEVICE)


ModuleNotFoundError: No module named 'matplotlib.backends.registry'

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)

class SimpleHFDS(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    the keys passed on to the HF ``Trainer`` in ``finetune_supervised``.
    """

    def __init__(self, texts, labels, tok, max_len):
        """Store the raw texts, their labels, the tokenizer callable and the pad/truncate length."""
        self.texts = texts
        self.labels = labels
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, i):
        """Tokenize example ``i`` to a fixed length and attach its label as a long tensor."""
        encoded = self.tok(
            self.texts[i],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Index [0] takes the first row of each returned tensor
        # (the tokenizer is called with a single text).
        item = {field: encoded[field][0] for field in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return item

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro-averaged P/R/F1.

    Args:
        eval_pred: Two-element sequence ``(logits, labels)``; the predicted
            class is the argmax over the last axis of ``logits``.

    Returns:
        Dict with keys ``accuracy``, ``macro_f1``, ``precision`` and ``recall``.
        Classes with no predictions contribute 0 instead of a warning
        (``zero_division=0``).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, macro_f1, _support = precision_recall_fscore_support(
        labels,
        preds,
        average="macro",
        zero_division=0,
    )
    metrics = {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": macro_f1,
        "precision": precision,
        "recall": recall,
    }
    return metrics

def finetune_supervised(
    base_model: str,
    train_csv: str,
    valid_csv: str,
    output_dir: str,
    text_col: str = "text",
    label_col: str = "label",
    max_length: int = 256,
    lr: float = 2e-5,
    weight_decay: float = 0.01,
    epochs: int = 3,
    batch_size: int = 16,
    seed: int = 42,
):
    """Fine-tune a 3-class sequence classifier and save the best checkpoint.

    The model and tokenizer are loaded from ``base_model``, trained on
    ``train_csv`` and evaluated on ``valid_csv`` once per epoch. The checkpoint
    with the best validation ``macro_f1`` is reloaded at the end of training and
    written, together with the tokenizer, to ``output_dir``.

    Args:
        base_model: Local directory or hub name passed to ``from_pretrained``.
        train_csv: CSV with the training examples.
        valid_csv: CSV with the validation examples.
        output_dir: Where intermediate and final checkpoints are written.
        text_col: Name of the text column in both CSVs.
        label_col: Name of the integer label column in both CSVs.
        max_length: Pad/truncate length used when tokenizing.
        lr: Learning rate.
        weight_decay: Weight decay.
        epochs: Maximum number of training epochs (early stopping may end sooner).
        batch_size: Per-device batch size for both training and evaluation.
        seed: Seed applied via ``seed_everything`` and passed to the trainer.

    Returns:
        ``output_dir``, so callers can chain it straight into inference helpers.
    """
    import inspect  # local import: only needed for the TrainingArguments compatibility probe

    seed_everything(seed)
    tok = AutoTokenizer.from_pretrained(base_model)
    mdl = AutoModelForSequenceClassification.from_pretrained(
        base_model, num_labels=3, id2label=id2label, label2id=label2id
    )

    df_tr = pd.read_csv(train_csv)
    df_va = pd.read_csv(valid_csv)
    tr_ds = SimpleHFDS(df_tr[text_col].astype(str).tolist(), df_tr[label_col].tolist(), tok, max_length)
    va_ds = SimpleHFDS(df_va[text_col].astype(str).tolist(), df_va[label_col].tolist(), tok, max_length)

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
    # (and eventually stopped accepting the old name). Probe the installed version
    # so this works on both sides of the rename.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_arg = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"

    args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=lr, weight_decay=weight_decay,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        save_strategy="epoch",          # must match the evaluation strategy for load_best_model_at_end
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        seed=seed, logging_steps=50,
        **{eval_strategy_arg: "epoch"},
    )

    trainer = Trainer(
        model=mdl, args=args, train_dataset=tr_ds, eval_dataset=va_ds,
        compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()
    trainer.save_model(output_dir)
    tok.save_pretrained(output_dir)
    print(f"[TRAIN] saved checkpoint -> {output_dir}")
    return output_dir


In [None]:
@dataclass
class EvoConfig:
    """Budget and search space for the hyper-parameter search."""

    # Number of candidate configurations sampled per generation.
    population: int = 6
    # Number of generations to run.
    generations: int = 2
    # (low, high) learning-rate bounds; sampled log-uniformly by _sample_hparams.
    lr_range: Tuple[float, float] = (1e-5, 5e-5)
    # (low, high) weight-decay bounds; sampled uniformly by _sample_hparams.
    wd_range: Tuple[float, float] = (0.0, 0.1)
    # Candidate tokenizer max lengths.
    maxlen_choices: Tuple[int, ...] = (128, 256, 384)
    # Candidate per-device batch sizes.
    batch_choices: Tuple[int, ...] = (8, 16, 32)
    # Base seed for the search; per-candidate training seeds are derived from it.
    seed: int = 13

def _sample_hparams(cfg: EvoConfig):
    lr = 10 ** np.random.uniform(np.log10(cfg.lr_range[0]), np.log10(cfg.lr_range[1]))
    wd = np.random.uniform(cfg.wd_range[0], cfg.wd_range[1])
    L  = random.choice(cfg.maxlen_choices)
    bs = random.choice(cfg.batch_choices)
    return dict(lr=lr, weight_decay=wd, max_length=L, batch_size=bs)

def _valid_macro_f1(checkpoint_dir: str, valid_csv: str, text_col="text", label_col="label",
                    max_length=256, batch_size=32):
    """Score the validation CSV with a saved checkpoint and return macro-F1.

    Args:
        checkpoint_dir: Directory handed to ``load_agent``.
        valid_csv: CSV holding the validation texts and integer labels.
        text_col: Name of the text column.
        label_col: Name of the integer label column.
        max_length: Max token length forwarded to ``score_batch_with_agent``.
        batch_size: Batch size forwarded to ``score_batch_with_agent``.

    Returns:
        Macro-averaged F1 of the predicted class ids against ``label_col``.

    Raises:
        KeyError: If the scorer returns a label name absent from the checkpoint's
            id->label map.
    """
    # load_agent's 4-tuple is unpacked as (tokenizer, model, device, id->label map).
    tok, mdl, dev, id2 = load_agent(checkpoint_dir, device=DEVICE)
    df = pd.read_csv(valid_csv)
    texts = df[text_col].astype(str).tolist()
    # Only the predicted label names are needed here; the continuous scores are ignored.
    _scores, pred_names = score_batch_with_agent(
        texts, tok, mdl, dev, id2, max_length=max_length, batch_size=batch_size
    )
    # Map label names back to class ids. Both sides are lower-cased so the lookup
    # does not depend on the capitalisation the scorer happens to return.
    name2id = {str(name).lower(): class_id for class_id, name in id2.items()}
    pred_ids = [name2id[str(name).lower()] for name in pred_names]
    # zero_division=0 mirrors compute_metrics: a class that is never predicted
    # scores 0 without emitting a warning.
    return f1_score(df[label_col].tolist(), pred_ids, average="macro", zero_division=0)

def evolutionary_search(
    base_model: str,
    train_csv: str, valid_csv: str,
    text_col: str = "text", label_col: str = "label",
    evo: EvoConfig = EvoConfig(),
    out_root: str = EVO_ROOT
):
    """Train several randomly sampled configurations and keep the best one.

    For each generation, ``evo.population`` hyper-parameter sets are drawn with
    ``_sample_hparams``; every candidate is fine-tuned into its own
    ``<out_root>/gen{g}_cand{i}`` directory and scored by validation macro-F1.
    Candidates of a generation are sampled independently of earlier results.

    Args:
        base_model: Model directory or hub name to start every candidate from.
        train_csv: Training CSV path.
        valid_csv: Validation CSV path (used for model selection).
        text_col: Name of the text column.
        label_col: Name of the label column.
        evo: Search budget and search space.
        out_root: Parent directory for the per-candidate checkpoints.

    Returns:
        Dict with keys ``macro_f1``, ``dir`` (checkpoint path) and ``hp``
        (the winning hyper-parameters).
    """
    seed_everything(evo.seed)
    best = {"macro_f1": -1.0, "dir": None, "hp": None}
    root = Path(out_root)

    for gen in range(evo.generations):
        print(f"[EVO] Generation {gen+1}/{evo.generations}")

        # Sample the whole generation before any training starts.
        candidates = []
        for _ in range(evo.population):
            candidates.append(_sample_hparams(evo))

        for i, hp in enumerate(candidates):
            out_dir = str(root / f"gen{gen}_cand{i}")
            print(f"  [cand] {hp} -> {out_dir}")

            # Every candidate gets its own training seed derived from the base seed.
            run_seed = evo.seed + gen*10 + i
            ckpt = finetune_supervised(
                base_model,
                train_csv,
                valid_csv,
                out_dir,
                text_col=text_col,
                label_col=label_col,
                max_length=hp["max_length"],
                lr=hp["lr"],
                weight_decay=hp["weight_decay"],
                epochs=3,
                batch_size=hp["batch_size"],
                seed=run_seed,
            )
            f1 = _valid_macro_f1(
                ckpt, valid_csv, text_col, label_col,
                max_length=hp["max_length"], batch_size=hp["batch_size"],
            )
            print(f"    [val] macro-F1 = {f1:.4f}")

            if not f1 > best["macro_f1"]:
                continue
            best = {"macro_f1": f1, "dir": ckpt, "hp": hp}

    print("[EVO] best:", best)
    return best


In [None]:
def _softmax_np(logits: np.ndarray) -> np.ndarray:
    z = logits - logits.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def collect_valid_probs(checkpoint_dir: str, valid_csv: str, text_col="text", max_length=256, batch_size=32,
                        label_col="label"):
    """Run a checkpoint over the validation CSV and return class probabilities.

    Args:
        checkpoint_dir: Directory handed to ``load_agent``.
        valid_csv: CSV with the validation texts and labels.
        text_col: Name of the text column.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of rows tokenized and scored per forward pass.
        label_col: Name of the label column (added last so existing positional
            calls keep working; previously hard-coded to ``"label"``).

    Returns:
        Tuple ``(probs, labels, id2)``: the stacked softmax probabilities (one
        row per CSV row), the labels as a list, and the id->label map returned
        by ``load_agent``.
    """
    # load_agent's 4-tuple is unpacked as (tokenizer, model, device, id->label map).
    tok, mdl, dev, id2 = load_agent(checkpoint_dir, device=DEVICE)
    df = pd.read_csv(valid_csv)
    all_probs = []
    for i in range(0, len(df), batch_size):
        chunk = df.iloc[i:i+batch_size]
        # padding=True pads only to the longest text of this batch.
        enc = tok(chunk[text_col].astype(str).tolist(), padding=True, truncation=True,
                  max_length=max_length, return_tensors="pt")
        enc = {k: v.to(dev) for k, v in enc.items()}
        with torch.inference_mode():
            logits = mdl(**enc).logits.detach().cpu().numpy()
        all_probs.append(_softmax_np(logits))

    if all_probs:
        probs = np.vstack(all_probs)
    else:
        # An empty CSV used to crash inside np.vstack([]); return an empty
        # (0, n_classes) array instead. Assumes id2 has one entry per class.
        probs = np.empty((0, len(id2)), dtype=float)
    return probs, df[label_col].tolist(), id2

def bandit_threshold_tuner(
    probs: np.ndarray, labels_true: List[int], id2label: Dict[int,str],
    iters: int = 2000, eps: float = 0.1, seed: int = 7
):
    """Search for score thresholds that maximise macro-F1 on labelled data.

    Each probability row is collapsed to a continuous score
    ``(p_positive - p_negative) * (1 - p_neutral)``. A score ``>= tau_pos`` is
    predicted as class 2, a score ``<= tau_neg`` as class 0, anything else as
    class 1. With probability ``eps`` an iteration draws both thresholds from a
    fixed grid; otherwise it perturbs the best pair found so far with small
    Gaussian noise.

    Args:
        probs: Array of shape (n_samples, n_classes) with class probabilities.
        labels_true: True class ids (0/1/2), one per row of ``probs``.
        id2label: Maps column index -> label name; names are matched
            case-insensitively against "positive"/"negative"/"neutral".
        iters: Number of threshold pairs to evaluate.
        eps: Probability of exploring the grid instead of perturbing the best pair.
        seed: Seed for reproducibility. NOTE: this reseeds the *global*
            ``random`` and ``np.random`` generators.

    Returns:
        Dict with keys ``f1``, ``tau_neg`` and ``tau_pos``; ``tau_neg <= tau_pos``
        always holds.

    Raises:
        ValueError: If ``probs`` is empty / not 2-D, or its row count differs
            from ``len(labels_true)``.
    """
    probs = np.asarray(probs, dtype=float)
    if probs.ndim != 2 or probs.shape[0] == 0:
        raise ValueError("probs must be a non-empty 2-D array of shape (n_samples, n_classes)")
    if len(labels_true) != probs.shape[0]:
        raise ValueError(
            f"labels_true has {len(labels_true)} entries but probs has {probs.shape[0]} rows"
        )

    random.seed(seed); np.random.seed(seed)

    # map prob rows -> continuous scores with the same formula used in inference
    def probs_to_score(p):
        by = {id2label[i].lower(): float(p[i]) for i in range(len(p))}
        return (by.get("positive",0)-by.get("negative",0)) * (1.0 - by.get("neutral",0))
    scores = np.apply_along_axis(probs_to_score, 1, probs)

    grid = np.linspace(-0.9, 0.9, 73)  # candidate thresholds
    best = {"f1": -1.0, "tau_neg": 0.0, "tau_pos": 0.0}
    for _ in range(iters):
        if random.random() < eps:
            # exploration: independent draws from the grid
            tneg = float(np.random.choice(grid))
            tpos = float(np.random.choice(grid))
        else:
            # small exploitation around best thresholds
            tneg = best["tau_neg"] + float(np.random.randn()*0.05)
            tpos = best["tau_pos"] + float(np.random.randn()*0.05)
        # Keep the pair ordered. With tneg > tpos the "positive" and "negative"
        # regions overlap and the predicted label would depend on which test a
        # consumer applies first, so the reported thresholds would be ambiguous.
        if tneg > tpos:
            tneg, tpos = tpos, tneg
        pred = np.where(scores >= tpos, 2, np.where(scores <= tneg, 0, 1))
        # zero_division=0 matches compute_metrics: never-predicted classes score 0 silently.
        f1 = float(f1_score(labels_true, pred, average="macro", zero_division=0))
        if f1 > best["f1"]:
            best = {"f1": f1, "tau_neg": tneg, "tau_pos": tpos}
    return best


In [None]:
def hash_text(s: str) -> str:
    """Return the SHA-256 hex digest of ``s`` encoded as UTF-8.

    Falsy input (``None`` or ``""``) is hashed as the empty string; characters
    that cannot be encoded are dropped rather than raising.
    """
    text = s if s else ""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8", errors="ignore"))
    return digest.hexdigest()

def load_cache(path: Path) -> Dict[str, Tuple[float, str]]:
    """Load the score cache from ``path``, or return an empty cache.

    An empty dict is returned when the file does not exist, when it cannot be
    unpickled (for example a truncated file left behind by an interrupted
    write), or when it does not contain a dict. In the latter two cases a
    warning is emitted so the problem is visible, and the affected texts are
    simply rescored.

    Args:
        path: Location of the pickle file.

    Returns:
        The cached mapping ``text hash -> (score, label)``.
    """
    import warnings  # local import: only needed on the recovery path

    path = Path(path)
    if not path.exists():
        return {}

    try:
        with open(path, "rb") as f:
            # SECURITY: pickle.load can execute arbitrary code. Only point this
            # at cache files written by this notebook, never at untrusted files.
            cache = pickle.load(f)
    except (EOFError, pickle.UnpicklingError, AttributeError, ImportError, IndexError, ValueError) as exc:
        warnings.warn(f"Ignoring unreadable cache file {path}: {exc!r}")
        return {}

    if not isinstance(cache, dict):
        warnings.warn(f"Ignoring cache file {path}: expected a dict, found {type(cache).__name__}")
        return {}
    return cache

def save_cache(cache: Dict[str, Tuple[float, str]], path: Path):
    """Persist the score cache to ``path`` without risking a half-written file.

    The pickle is written to a sibling ``<name>.tmp`` file first and then moved
    over ``path`` with ``os.replace``, so an interrupted kernel leaves either
    the previous cache or the new one on disk, never a truncated file. The
    parent directory is created if it does not exist.

    Args:
        cache: Mapping ``text hash -> (score, label)`` to store.
        path: Destination pickle file.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(cache, f)
        os.replace(tmp_path, path)
    finally:
        # Only present if dumping or replacing failed; do not leave it behind.
        if tmp_path.exists():
            tmp_path.unlink()

def score_articles_with_cache(
    texts: List[str],
    model_dir: str,
    cache_path: Path,
    device: str = DEVICE,
    max_length: int = MAX_LENGTH,
    batch_size: int = BATCH_SIZE,
):
    """Score ``texts`` with the agent model, reusing cached results where possible.

    Cache entries are keyed by a hash of ``model_dir``, ``max_length`` and the
    text, so results produced by a different checkpoint directory or truncation
    length are never served by mistake. (Entries written under the older
    text-only key are simply not matched and get rescored.) The model is loaded
    only when at least one text is missing from the cache, and duplicate texts
    are scored once.

    Args:
        texts: Article texts to score.
        model_dir: Checkpoint directory handed to ``load_agent``.
        cache_path: Pickle file holding the cache (read, then rewritten if
            anything new was scored).
        device: Device string forwarded to ``load_agent``.
        max_length: Max token length forwarded to ``score_batch_with_agent``.
        batch_size: Batch size forwarded to ``score_batch_with_agent``.

    Returns:
        ``(scores, labels)``: two lists aligned with ``texts``.

    Raises:
        RuntimeError: If the scorer returns a different number of results than
            the number of texts it was given.
    """
    cache = load_cache(cache_path)

    # Namespace the key so switching checkpoints / max_length invalidates old hits.
    namespace = f"{model_dir}|{max_length}|"
    keys = [hash_text(namespace + (t or "")) for t in texts]

    # Unique cache misses, in first-seen order (dicts preserve insertion order).
    pending = {}
    for key, t in zip(keys, texts):
        if key not in cache and key not in pending:
            pending[key] = t

    # score misses in batches
    if pending:
        # Loading the model is expensive, so only do it when something needs scoring.
        tok, mdl, dev, id2 = load_agent(model_dir, device=device)
        miss_scores, miss_labels = score_batch_with_agent(
            list(pending.values()), tok, mdl, dev, id2,
            max_length=max_length, batch_size=batch_size
        )
        if len(miss_scores) != len(pending) or len(miss_labels) != len(pending):
            raise RuntimeError(
                f"Scorer returned {len(miss_scores)} scores / {len(miss_labels)} labels "
                f"for {len(pending)} texts"
            )
        for key, s, lb in zip(pending, miss_scores, miss_labels):
            cache[key] = (s, lb)

        save_cache(cache, cache_path)

    scores = [cache[key][0] for key in keys]
    labels = [cache[key][1] for key in keys]
    return scores, labels

def add_finbert_parity_columns(df_articles: pd.DataFrame) -> pd.DataFrame:
    """Pass the per-article frame through ``to_finbert_like_columns``.

    Per the original author's note, the helper adds a ``sentiment_score``
    column when TextBlob columns are present so downstream plots line up with
    the FinBERT outputs (behaviour lives in ``agenticwrapper``; not verified here).
    """
    df_with_parity = to_finbert_like_columns(df_articles)
    return df_with_parity


In [None]:
best_checkpoint_dir = None  # set below if a training / search run happens in this session

if RUN_TRAIN:
    # Plain supervised fine-tune with the default hyper-parameters.
    best_checkpoint_dir = finetune_supervised(
        base_model=BASE_MODEL,
        train_csv=TRAIN_CSV, valid_csv=VALID_CSV,
        output_dir=CHECKPOINT_OUT,
        max_length=MAX_LENGTH, lr=2e-5, weight_decay=0.01,
        epochs=3, batch_size=16, seed=42
    )

if RUN_EVO:
    # Hyper-parameter search; when it runs, its winner takes precedence over RUN_TRAIN's output.
    evo_best = evolutionary_search(
        base_model=BASE_MODEL, train_csv=TRAIN_CSV, valid_csv=VALID_CSV,
        text_col="text", label_col="label",
        evo=EvoConfig(population=6, generations=2),
        out_root=EVO_ROOT
    )
    best_checkpoint_dir = evo_best["dir"]

if RUN_BANDIT:
    # Tune on the checkpoint produced in this session; otherwise fall back to a
    # supervised checkpoint saved by an earlier run, mirroring the AGENT_MODEL_DIR
    # fallback below. An explicit raise (not assert) so the check survives `python -O`.
    bandit_checkpoint_dir = best_checkpoint_dir or (
        CHECKPOINT_OUT if Path(CHECKPOINT_OUT).is_dir() else None
    )
    if not bandit_checkpoint_dir:
        raise RuntimeError("Bandit tuner needs a trained checkpoint (set RUN_TRAIN or RUN_EVO first).")
    probs, y_true, id2 = collect_valid_probs(
        bandit_checkpoint_dir, VALID_CSV, text_col="text",
        max_length=MAX_LENGTH, batch_size=BATCH_SIZE
    )
    bandit = bandit_threshold_tuner(probs, y_true, id2, iters=2000, eps=0.1)
    print("[BANDIT] best:", bandit)

# Pick a checkpoint for inference:
AGENT_MODEL_DIR = best_checkpoint_dir or CHECKPOINT_OUT  # or manually set to any saved dir
print("AGENT_MODEL_DIR =", AGENT_MODEL_DIR)


In [None]:
def load_articles_for_ticker(ticker: str) -> pd.DataFrame:
    """Read ``<ART_DIR>/<ticker>.csv`` and normalise it with ``normalize_articles_df``.

    Args:
        ticker: Ticker symbol; also the CSV file stem.

    Returns:
        The normalised articles frame (the original note says the helper
        ensures the columns {date, title, content}).

    Raises:
        FileNotFoundError: If the ticker's CSV does not exist.
    """
    csv_path = ART_DIR / f"{ticker}.csv"
    if csv_path.exists():
        return normalize_articles_df(pd.read_csv(csv_path))
    raise FileNotFoundError(f"Missing articles CSV: {csv_path}")

# Load every configured ticker up front; a missing CSV aborts the run immediately.
articles_by_ticker: Dict[str, pd.DataFrame] = {}
for t in TICKERS:
    articles_by_ticker[t] = load_articles_for_ticker(t)
    print(f"{t}: {len(articles_by_ticker[t])} articles")


In [None]:
# Fail fast on a mistyped engine name: previously anything other than "textblob"
# (including typos such as "Textblob") silently ran the agent path.
SUPPORTED_ENGINES = ("textblob", "agent")
if ENGINE not in SUPPORTED_ENGINES:
    raise ValueError(f"Unknown ENGINE {ENGINE!r}; expected one of {SUPPORTED_ENGINES}.")

# ticker -> {"articles": per-article frame, "daily": per-day aggregate}
results = {}
for t, df_norm in articles_by_ticker.items():
    agent = LocalAgenticNewsAI()
    agent.memory = df_norm.copy()  # copy so the loaded frame is never mutated

    if ENGINE == "textblob":
        agent.analyze_sentiment()
        per_article = add_finbert_parity_columns(agent.memory.copy())  # adds sentiment_score=polarity
        # TextBlob daily: an article counts as positive when its score is > 0
        daily = (per_article.groupby("date", as_index=False)
                 .agg(mean_score=("sentiment_score","mean"),
                      n_articles=("sentiment_score","size"),
                      pos_share=("sentiment_score", lambda s: float((s>0).mean()))))
    else:
        # agent inference with on-disk cache to avoid rescoring repeats
        cache_path = CACHE_DIR / f"{t}__agent_cache.pkl"
        texts = agent.memory["content"].astype(str).tolist()
        scores, labels = score_articles_with_cache(
            texts, model_dir=str(AGENT_MODEL_DIR), cache_path=cache_path,
            device=DEVICE, max_length=MAX_LENGTH, batch_size=BATCH_SIZE
        )
        per_article = agent.memory.copy()
        per_article["sentiment_score"] = scores
        per_article["sentiment_label"] = labels
        # Agent daily: an article counts as positive when its predicted label is "positive"
        daily = (per_article.groupby("date", as_index=False)
                 .agg(mean_score=("sentiment_score","mean"),
                      n_articles=("sentiment_score","size"),
                      pos_share=("sentiment_label", lambda s: float((s=="positive").mean()))))

    # save outputs
    art_out   = OUT_DIR / f"{t}__agent_articles.csv"
    daily_out = OUT_DIR / f"{t}__agent_daily.csv"
    per_article.to_csv(art_out, index=False)
    daily.to_csv(daily_out, index=False)
    print("Saved:", art_out, "|", daily_out)

    results[t] = {"articles": per_article, "daily": daily}


In [None]:
def maybe_load_finbert_daily(ticker: str) -> Optional[pd.DataFrame]:
    """Return the first FinBERT daily CSV found for ``ticker``, or ``None``.

    Three file-name variants are probed, in order, under
    ``NLP_DIR / "sentiment_scores"`` (adjust to your exact path if needed). If
    the file has a ``score`` column but no ``mean_score`` column, ``score`` is
    renamed to ``mean_score`` so the frame lines up with the agent output.
    """
    scores_dir = NLP_DIR / "sentiment_scores"
    filenames = (
        f"{ticker}__daily.csv",
        f"{ticker}_daily.csv",
        f"{ticker}__finbert_daily.csv",
    )
    for filename in filenames:
        p = scores_dir / filename
        if not p.exists():
            continue
        df = pd.read_csv(p)
        # unify column name
        needs_rename = "mean_score" not in df.columns and "score" in df.columns
        if needs_rename:
            df = df.rename(columns={"score": "mean_score"})
        return df
    return None

for t, d in results.items():
    daily = d["daily"].copy()
    daily["date"] = pd.to_datetime(daily["date"])
    # Sort on the parsed dates so the line is drawn chronologically even if the
    # stored date strings do not sort chronologically.
    daily = daily.sort_values("date")

    fig, ax = plt.subplots()
    ax.plot(daily["date"], daily["mean_score"])
    ax.set_title(f"{t} — Agentic daily mean_score")
    ax.set_xlabel("Date"); ax.set_ylabel("mean_score"); ax.grid(True)
    plt.show()
    plt.close(fig)  # release the figure; this loop creates up to two per ticker

    fb = maybe_load_finbert_daily(t)
    if fb is None:
        print(f"[{t}] FinBERT daily file not found; skipped comparison.")
        continue
    if not {"date", "mean_score"}.issubset(fb.columns):
        # Without both columns the merge below would raise a KeyError.
        print(f"[{t}] FinBERT daily file lacks 'date'/'mean_score' columns; skipped comparison.")
        continue

    fb = fb.assign(date=pd.to_datetime(fb["date"]))
    merged = (
        pd.merge(daily[["date","mean_score"]], fb[["date","mean_score"]],
                 on="date", how="inner", suffixes=("_agent","_finbert"))
        # Missing values would make mean_absolute_error raise.
        .dropna(subset=["mean_score_agent", "mean_score_finbert"])
        .sort_values("date")
    )
    if merged.empty:
        print(f"[{t}] No overlapping dates with FinBERT daily file.")
        continue

    corr = merged["mean_score_agent"].corr(merged["mean_score_finbert"])
    mae  = mean_absolute_error(merged["mean_score_finbert"], merged["mean_score_agent"])
    print(f"[{t}] correlation (agent vs FinBERT): {corr:.3f} | MAE: {mae:.4f}")

    fig, ax = plt.subplots()
    ax.plot(merged["date"], merged["mean_score_agent"], label="Agent")
    ax.plot(merged["date"], merged["mean_score_finbert"], label="FinBERT", linestyle="--")
    ax.set_title(f"{t} — Daily Sentiment (Agent vs FinBERT)")
    ax.set_xlabel("Date"); ax.set_ylabel("mean_score"); ax.legend(); ax.grid(True)
    plt.show()
    plt.close(fig)


In [None]:
# One summary row per ticker, computed from its per-day aggregate.
summary_rows = [
    {
        "ticker": ticker,
        "n_days": len(outputs["daily"]),
        "mean_of_meanscore": float(outputs["daily"]["mean_score"].mean()),
        # population standard deviation (ddof=0)
        "stdev_of_meanscore": float(outputs["daily"]["mean_score"].std(ddof=0)),
        "avg_articles_per_day": float(outputs["daily"]["n_articles"].mean()),
        "avg_pos_share": float(outputs["daily"]["pos_share"].mean()),
    }
    for ticker, outputs in results.items()
]

summary_df = pd.DataFrame(summary_rows)
summary_path = OUT_DIR / "run_summary.csv"
summary_df.to_csv(summary_path, index=False)
summary_df
