# 04 - All-Novel Stacked Twist Signal Visuals

This notebook generates **20 per-novel stacked figures**. Each figure contains three vertical panels, one per window size `k=5,7,11`, and each panel plots:
- `s_t` (Twist Signal)
- `a_t` (Twist Acceleration)

It also exports:
- clean grouped outputs under `outputs/eda/novel_stacks/`
- summary tables for stats/highlights/manifest
- consolidated interpretation markdown at `docs/NOVEL_STACKED_OUTPUT_INTERPRETATION.md`


In [None]:
# Install required packages if missing
import importlib
import subprocess
import sys

# Each entry pairs the importable module name with the name handed to pip.
REQUIRED_PACKAGES = [
    ("pandas", "pandas"),
    ("numpy", "numpy"),
    ("matplotlib", "matplotlib"),
]

for module_name, pip_name in REQUIRED_PACKAGES:
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Install with the interpreter running this notebook so the package
        # lands in the same environment.
        install_cmd = [sys.executable, "-m", "pip", "install", pip_name]
        print(f"Installing {pip_name} ...")
        subprocess.check_call(install_cmd)
    else:
        continue

print("Dependency check complete.")


## 1) Setup

In [None]:
from pathlib import Path
import json
import textwrap
import re

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

try:
    from IPython.display import display
except Exception:
    def display(x):
        print(x)

# Seed NumPy's global RNG so reruns of the notebook are repeatable.
SEED = 42
np.random.seed(SEED)

# Window sizes that have a signals_k{k}.npz per book; PRIMARY_K is the one
# used for the per-book report labels.
K_VALUES = [5, 7, 11]
PRIMARY_K = 7
FIGSIZE = (16, 14)

# Input locations, all resolved relative to the current working directory.
PROJECT_ROOT = Path(".").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
METADATA_PATH = DATA_DIR.joinpath("metadata.csv")

# Output locations managed by this notebook.
OUTPUT_ROOT = PROJECT_ROOT.joinpath("outputs", "eda", "novel_stacks")
FIG_DIR = OUTPUT_ROOT.joinpath("figures")
TABLE_DIR = OUTPUT_ROOT.joinpath("tables")
DOC_PATH = PROJECT_ROOT.joinpath("docs", "NOVEL_STACKED_OUTPUT_INTERPRETATION.md")

# Create every folder that later cells write into.
for d in (OUTPUT_ROOT, FIG_DIR, TABLE_DIR, DOC_PATH.parent):
    d.mkdir(parents=True, exist_ok=True)

print(
    f"Project root: {PROJECT_ROOT}",
    f"Managed output root: {OUTPUT_ROOT}",
    f"Interpretation doc path: {DOC_PATH}",
    sep="\n",
)


## 2) Clean Managed Output Folder

In [None]:
def clean_managed_output_dir(fig_dir: Path, table_dir: Path):
    """Delete the regular files directly inside the two managed folders.

    Sub-directories (and anything inside them) are left untouched.

    Returns:
        The paths of the deleted files, as strings, in deletion order.
    """
    deleted = []
    for folder in (fig_dir, table_dir):
        for entry in folder.glob("*"):
            if not entry.is_file():
                continue
            entry.unlink()
            deleted.append(str(entry))
    return deleted

# Start from a clean slate: the validation cell counts every PNG in FIG_DIR,
# so leftovers from an earlier run would skew that count. Only files directly
# inside the two folders are removed.
removed_files = clean_managed_output_dir(FIG_DIR, TABLE_DIR)
print(f"Removed {len(removed_files)} managed files from previous run.")


## 3) Load and Validate Inputs

In [None]:
# Load the book metadata; `pg_id` is accepted as an alias when `id` is absent.
metadata_df = pd.read_csv(METADATA_PATH)
if "id" not in metadata_df.columns and "pg_id" in metadata_df.columns:
    metadata_df["id"] = metadata_df["pg_id"]

required_cols = ["id", "title", "processed_dir"]
missing_cols = [c for c in required_cols if c not in metadata_df.columns]
if missing_cols:
    raise ValueError(f"metadata.csv missing required columns: {missing_cols}")

metadata_df = metadata_df.sort_values("id").reset_index(drop=True)
unique_books = metadata_df["id"].nunique()
if unique_books != 20:
    raise RuntimeError(f"Expected 20 unique books, found {unique_books}")

# nunique() alone lets repeated ids through; a repeated row would produce a
# duplicate payload (and a duplicate figure name) further down.
duplicate_ids = metadata_df.loc[metadata_df["id"].duplicated(), "id"].tolist()
if duplicate_ids:
    raise RuntimeError(f"metadata.csv contains duplicate book ids: {duplicate_ids}")

# One entry per book: identifiers plus the per-k signal arrays and peaks.
novel_payloads = []

for row in metadata_df.to_dict(orient="records"):
    book_id = int(row["id"])
    title = str(row["title"])
    processed_dir = str(row["processed_dir"])
    base = PROCESSED_DIR / processed_dir

    if not base.exists():
        raise FileNotFoundError(f"Processed folder missing for book {book_id}: {base}")

    by_k = {}
    lengths = []

    for k in K_VALUES:
        signal_path = base / f"signals_k{k}.npz"
        if not signal_path.exists():
            raise FileNotFoundError(f"Missing signal file: {signal_path}")

        # np.load on an .npz returns an NpzFile that keeps the archive open;
        # the context manager closes it once both arrays have been read.
        with np.load(signal_path) as npz:
            if "s" not in npz.files or "a" not in npz.files:
                raise RuntimeError(f"signals_k{k}.npz must contain keys 's' and 'a' for book {book_id}")

            s = np.asarray(npz["s"], dtype=np.float32)
            a = np.asarray(npz["a"], dtype=np.float32)

        if s.shape[0] != a.shape[0] or s.shape[0] == 0:
            raise RuntimeError(f"Invalid signal lengths for book {book_id}, k={k}: len(s)={len(s)}, len(a)={len(a)}")
        if not np.isfinite(s).all() or not np.isfinite(a).all():
            raise RuntimeError(f"Non-finite values in signals for book {book_id}, k={k}")

        T = int(s.shape[0])
        t = np.arange(T, dtype=np.int32)
        # Position in [0, 1]; a single-chunk signal maps to 0 to avoid 0/0.
        t_norm = t / (T - 1) if T > 1 else np.zeros(T, dtype=np.float32)

        # Peaks are optional: a missing peaks file simply means no markers.
        peaks_path = base / f"peaks_k{k}.json"
        if peaks_path.exists():
            peaks_obj = json.loads(peaks_path.read_text(encoding="utf-8"))
            peak_indices = [int(x) for x in peaks_obj.get("peak_indices", [])]
        else:
            peak_indices = []

        # Drop indices that fall outside the signal so later indexing is safe.
        peak_indices = [idx for idx in peak_indices if 0 <= idx < T]
        peak_pos_norm = [float(idx / (T - 1)) if T > 1 else 0.0 for idx in peak_indices]

        by_k[k] = {
            "s": s,
            "a": a,
            "T": T,
            "chunk_index": t,
            "t_norm": t_norm,
            "peak_indices": peak_indices,
            "peak_pos_norm": peak_pos_norm,
        }
        lengths.append(T)

    # All k variants of a book must cover the same number of chunks.
    if len(set(lengths)) != 1:
        raise RuntimeError(f"Chunk lengths differ across k for book {book_id}: {lengths}")

    novel_payloads.append({
        "book_id": book_id,
        "title": title,
        "processed_dir": processed_dir,
        "signals": by_k,
    })

print(f"Validated books: {len(novel_payloads)}")
print("Sample books:", [(n["book_id"], n["processed_dir"]) for n in novel_payloads[:5]])


## 4) Extract Stats, Manifest, and Highlights Tables

In [None]:
def pad_peak_positions(positions, target=3):
    """Return the first `target` positions as a new list, padded to length.

    Missing slots are filled with -1.0, which downstream code treats as
    "no peak" (valid normalised positions are >= 0).
    """
    head = list(positions[:target])
    shortfall = target - len(head)
    return head + [-1.0] * shortfall

# Long-format stats (one row per book and k) and the figure manifest
# (one row per book).
stats_rows = []
manifest_rows = []

for novel in novel_payloads:
    book_id = novel["book_id"]
    title = novel["title"]
    processed_dir = novel["processed_dir"]

    # The figure cell saves under a sanitised folder name
    # (build_figure_filename, defined in a later cell). Apply the same rule
    # here so the manifest path always points at the file actually written.
    safe_dir = re.sub(r"[^a-zA-Z0-9_\-]+", "_", processed_dir).strip("_")
    fig_name = f"novel_{book_id}_{safe_dir}_stacked_k5_k7_k11.png"
    fig_rel = f"outputs/eda/novel_stacks/figures/{fig_name}"

    manifest_rows.append({
        "book_id": book_id,
        "title": title,
        "processed_dir": processed_dir,
        "figure_path": fig_rel,
        "T_k5": int(novel["signals"][5]["T"]),
        "T_k7": int(novel["signals"][7]["T"]),
        "T_k11": int(novel["signals"][11]["T"]),
    })

    for k in K_VALUES:
        sig = novel["signals"][k]
        s = sig["s"]
        a = sig["a"]
        T = sig["T"]

        # Always exactly three slots; -1.0 marks "no peak in this slot".
        peak_pos = pad_peak_positions(sig["peak_pos_norm"], target=3)

        stats_rows.append({
            "book_id": int(book_id),
            "title": title,
            "processed_dir": processed_dir,
            "k": int(k),
            "T": int(T),
            "mean_s": float(np.mean(s)),
            "std_s": float(np.std(s)),
            "max_s": float(np.max(s)),
            "mean_a": float(np.mean(a)),
            "std_a": float(np.std(a)),
            "max_a": float(np.max(a)),
            "num_peaks": int(len(sig["peak_indices"])),
            "peak_pos_1": float(peak_pos[0]),
            "peak_pos_2": float(peak_pos[1]),
            "peak_pos_3": float(peak_pos[2]),
        })

stats_df = pd.DataFrame(stats_rows).sort_values(["book_id", "k"]).reset_index(drop=True)
manifest_df = pd.DataFrame(manifest_rows).sort_values("book_id").reset_index(drop=True)

stats_path = TABLE_DIR / "novel_stacked_stats.csv"
manifest_path = TABLE_DIR / "novel_stacked_manifest.csv"

stats_df.to_csv(stats_path, index=False)
manifest_df.to_csv(manifest_path, index=False)


def _k_summary(k):
    """Return book_id plus mean_s/max_a for window size k, suffixed `_k{k}`."""
    subset = stats_df.loc[stats_df["k"] == k, ["book_id", "mean_s", "max_a"]]
    return subset.rename(columns={"mean_s": f"mean_s_k{k}", "max_a": f"max_a_k{k}"})


k5 = _k_summary(5)
k7 = _k_summary(7)
k11 = _k_summary(11)

# Only id/title/processed_dir are validated when metadata is loaded. The
# descriptive columns are optional, so reindex() keeps the CSV schema stable
# and fills an absent column with NaN instead of raising KeyError.
highlight_meta_cols = ["id", "title", "processed_dir", "genre_primary", "format", "twist_peak_rank"]

highlights_df = (
    metadata_df.reindex(columns=highlight_meta_cols)
    .rename(columns={"id": "book_id"})
    .merge(k5, on="book_id", how="left")
    .merge(k7, on="book_id", how="left")
    .merge(k11, on="book_id", how="left")
)

# Sensitivity to window size: largest k minus smallest k.
highlights_df["delta_mean_s_k11_k5"] = highlights_df["mean_s_k11"] - highlights_df["mean_s_k5"]
highlights_df["delta_max_a_k11_k5"] = highlights_df["max_a_k11"] - highlights_df["max_a_k5"]

# Rank 1 = highest value; ties share the lower rank (method="min").
highlights_df["rank_mean_s_k7_desc"] = highlights_df["mean_s_k7"].rank(ascending=False, method="min").astype(int)
highlights_df["rank_max_a_k7_desc"] = highlights_df["max_a_k7"].rank(ascending=False, method="min").astype(int)

highlights_df["is_top3_mean_s_k7"] = highlights_df["rank_mean_s_k7_desc"] <= 3
highlights_df["is_top3_max_a_k7"] = highlights_df["rank_max_a_k7_desc"] <= 3

highlights_df = highlights_df.sort_values("book_id").reset_index(drop=True)
highlights_path = TABLE_DIR / "novel_stacked_highlights.csv"
highlights_df.to_csv(highlights_path, index=False)

print(f"Saved {stats_path}")
print(f"Saved {manifest_path}")
print(f"Saved {highlights_path}")
print(f"Stats rows: {len(stats_df)} | Manifest rows: {len(manifest_df)} | Highlight rows: {len(highlights_df)}")

display(highlights_df.head(8))


## 5) Generate 20 Stacked Figures (k=5,7,11)

In [None]:
def build_figure_filename(book_id: int, processed_dir: str):
    """Return the PNG file name for one book's stacked figure.

    Every run of characters in `processed_dir` other than ASCII letters,
    digits, underscore or hyphen becomes a single underscore, and leading or
    trailing underscores are then stripped.
    """
    unsafe_runs = re.compile(r"[^a-zA-Z0-9_\-]+")
    slug = unsafe_runs.sub("_", processed_dir).strip("_")
    return f"novel_{book_id}_{slug}_stacked_k5_k7_k11.png"


def subplot_title(k: int, s: np.ndarray, a: np.ndarray, peak_count: int, T: int):
    """Build the one-line panel title summarising one window size.

    Shows k, the chunk count T, mean/max of `s` and `a` to three decimals,
    and the number of peaks, separated by " | ".
    """
    segments = [
        f"k={k}",
        f"T={T}",
        f"mean_s={np.mean(s):.3f} max_s={np.max(s):.3f}",
        f"mean_a={np.mean(a):.3f} max_a={np.max(a):.3f}",
        f"peaks={peak_count}",
    ]
    return " | ".join(segments)

# Render one stacked figure per book: one panel per window size in K_VALUES.
saved_fig_paths = []

for novel in novel_payloads:
    book_id = novel["book_id"]
    title = novel["title"]
    processed_dir = novel["processed_dir"]

    fig_name = build_figure_filename(book_id, processed_dir)
    fig_path = FIG_DIR / fig_name

    # Keep a shared y-range within a book for k-to-k comparability.
    # Reduce each array with NumPy instead of copying every sample into a
    # Python list first.
    series = [novel["signals"][k][key] for k in K_VALUES for key in ("s", "a")]
    y_min = float(min(np.min(values) for values in series))
    y_max = float(max(np.max(values) for values in series))
    # Fall back to a unit span when every value is identical.
    margin = 0.05 * (y_max - y_min if y_max > y_min else 1.0)
    y_low, y_high = y_min - margin, y_max + margin

    # squeeze=False always returns a 2-D axes grid, so this also works when
    # K_VALUES holds a single window size.
    fig, axes_grid = plt.subplots(
        nrows=len(K_VALUES), ncols=1, figsize=FIGSIZE, sharex=True, squeeze=False
    )
    axes = axes_grid[:, 0]

    try:
        for ax, k in zip(axes, K_VALUES):
            data = novel["signals"][k]
            t = data["chunk_index"]
            s = data["s"]
            a = data["a"]
            peak_indices = data["peak_indices"]

            ax.plot(t, s, color="#1f77b4", linewidth=1.6, label="s_t (Twist Signal)")
            ax.plot(t, a, color="#d62728", linewidth=1.2, alpha=0.9, label="a_t (Twist Acceleration)")

            if peak_indices:
                # Peaks are drawn on the acceleration curve at their indices.
                peak_idx = np.array(peak_indices, dtype=int)
                ax.scatter(
                    peak_idx,
                    a[peak_idx],
                    color="#2ca02c",
                    s=28,
                    zorder=4,
                    label="Detected Peaks",
                )

            ax.set_ylim(y_low, y_high)
            ax.set_ylabel("Signal")
            ax.set_title(subplot_title(k, s, a, len(peak_indices), data["T"]), fontsize=11)
            ax.grid(alpha=0.28)

        axes[-1].set_xlabel("Chunk Index")
        axes[0].legend(loc="upper right", fontsize=9)

        fig.suptitle(f"{book_id} | {title} | processed_dir={processed_dir}", fontsize=14)
        fig.tight_layout(rect=[0, 0.02, 1, 0.97])
        fig.savefig(fig_path, dpi=180, bbox_inches="tight")
    finally:
        # Close even if plotting or saving fails, so figures do not pile up
        # in pyplot's registry across the loop.
        plt.close(fig)

    saved_fig_paths.append(fig_path)

print(f"Saved stacked figures: {len(saved_fig_paths)}")
print("Example:", saved_fig_paths[0] if saved_fig_paths else "<none>")


## 6) Generate Consolidated Interpretation Markdown

In [None]:
# Per-book stats at the primary window size, joined onto the highlights table
# so each report row carries its peak count/positions and std of `a`.
k7_stats = (
    stats_df.loc[stats_df["k"] == PRIMARY_K]
    .copy()
    .sort_values("book_id")
    .reset_index(drop=True)
)
_k7_report_cols = ["book_id", "num_peaks", "peak_pos_1", "peak_pos_2", "peak_pos_3", "std_a"]
report_df = highlights_df.merge(k7_stats[_k7_report_cols], on="book_id", how="left")

# Quantile thresholds for deterministic, relative labeling.
_mean_s_k7 = report_df["mean_s_k7"]
_max_a_k7 = report_df["max_a_k7"]
q_mean_lo, q_mean_hi = _mean_s_k7.quantile(0.33), _mean_s_k7.quantile(0.67)
q_acc_lo, q_acc_hi = _max_a_k7.quantile(0.33), _max_a_k7.quantile(0.67)

# 40th percentile of the absolute k11-k5 deltas: the cutoff the k-dependence
# labels compare against.
q_delta_mean_abs = report_df["delta_mean_s_k11_k5"].abs().quantile(0.40)
q_delta_maxa_abs = report_df["delta_max_a_k11_k5"].abs().quantile(0.40)


def classify_level(v, low_thr, high_thr):
    """Bucket `v` into "Low", "Medium" or "High" (thresholds are inclusive).

    The low threshold is checked first, so a value that meets both
    thresholds is labelled "Low".
    """
    if v <= low_thr:
        return "Low"
    return "High" if v >= high_thr else "Medium"


def classify_peak_timing(row):
    """Describe where in the story the (up to three) recorded peaks fall.

    Reads `peak_pos_1..3` from `row`; negative values are padding and are
    ignored. Peaks spanning both the first and the last third are labelled
    "Distributed"; otherwise the label follows the average position.
    """
    raw = [row[col] for col in ("peak_pos_1", "peak_pos_2", "peak_pos_3")]
    positions = [float(p) for p in raw if float(p) >= 0]
    if len(positions) == 0:
        return "No detected top-3 peaks"

    earliest = min(positions)
    latest = max(positions)
    avg = float(np.mean(positions))

    if earliest < 0.33 and latest > 0.66:
        return f"Distributed across early-to-late arc (avg={avg:.2f})"
    if avg < 0.33:
        return f"Early-weighted (avg={avg:.2f})"
    if avg >= 0.66:
        return f"Late-weighted (avg={avg:.2f})"
    return f"Mid-story weighted (avg={avg:.2f})"


def classify_k_dependence(row):
    """Label how a book's k=7-independent deltas (k11 minus k5) behave.

    Compares the two deltas in `row` against the module-level cutoffs
    `q_delta_mean_abs` and `q_delta_maxa_abs`: both within the cutoff is
    "Stable", both above is "Increasing", both below the negated cutoff is
    "Decreasing", anything else is "Mixed".
    """
    delta_mean = float(row["delta_mean_s_k11_k5"])
    delta_acc = float(row["delta_max_a_k11_k5"])

    within_band = abs(delta_mean) <= q_delta_mean_abs and abs(delta_acc) <= q_delta_maxa_abs
    if within_band:
        return "Stable across k"

    both_rise = delta_mean > q_delta_mean_abs and delta_acc > q_delta_maxa_abs
    if both_rise:
        return "Increasing with larger k"

    both_fall = delta_mean < -q_delta_mean_abs and delta_acc < -q_delta_maxa_abs
    if both_fall:
        return "Decreasing with larger k"

    return "Mixed sensitivity across k"


def standout_sentence(novelty_level, accel_level, k_pattern, row):
    """Return a one-sentence summary for the book named in `row["title"]`.

    The sentence is chosen from the novelty/acceleration level pair; any pair
    without a dedicated sentence falls back to a "middle regime" sentence
    that quotes `k_pattern` in lower case.
    """
    title = row["title"]

    if novelty_level == "High":
        if accel_level == "High":
            return f"{title} combines high novelty and sharp shifts, forming a jagged trajectory profile."
        return f"{title} is consistently novel but less spike-driven than the most volatile books."

    if novelty_level == "Low":
        if accel_level == "Low":
            return f"{title} shows a smoother, lower-volatility progression relative to the corpus."
        if accel_level == "High":
            return f"{title} has a calmer baseline punctuated by concentrated bursts of change."

    return f"{title} sits in a middle regime with {k_pattern.lower()} behavior across window sizes."


# Global highlights
k7_sorted_mean_desc = report_df.sort_values("mean_s_k7", ascending=False)
k7_sorted_mean_asc = report_df.sort_values("mean_s_k7", ascending=True)
k7_sorted_maxa_desc = report_df.sort_values("max_a_k7", ascending=False)

# Order books by the magnitude (not the sign) of their k11-k5 deltas.
sensitivity_mean = report_df.reindex(report_df["delta_mean_s_k11_k5"].abs().sort_values(ascending=False).index)
sensitivity_maxa = report_df.reindex(report_df["delta_max_a_k11_k5"].abs().sort_values(ascending=False).index)


def _top3_bullets(frame, column):
    """Return markdown bullets for the first three rows of `frame`, showing `column`."""
    return [
        f"- {int(r['book_id'])} | {r['title']} | {column}={r[column]:.3f}"
        for _, r in frame.head(3).iterrows()
    ]


lines = [
    "# Novel Stacked Twist Signal Interpretation",
    "",
    "## Overview",
    "This document interprets 20 stacked per-novel plots generated from `k=5,7,11`. Each figure has three vertical panels for one novel, and each panel overlays `s_t` (Twist Signal) and `a_t` (Twist Acceleration).",
    "",
    "How to read each panel:",
    "- `s_t` tracks novelty versus recent narrative context.",
    "- `a_t` tracks local novelty acceleration between consecutive chunks.",
    "- Peak markers indicate top acceleration points for that `k`.",
    "",
    "## Global Highlights",
    "",
    "Top 3 by novelty (`mean_s`, k=7):",
]
lines.extend(_top3_bullets(k7_sorted_mean_desc, "mean_s_k7"))
lines.extend(["", "Lowest 3 by novelty (`mean_s`, k=7):"])
lines.extend(_top3_bullets(k7_sorted_mean_asc, "mean_s_k7"))
lines.extend(["", "Top 3 by acceleration spikes (`max_a`, k=7):"])
lines.extend(_top3_bullets(k7_sorted_maxa_desc, "max_a_k7"))
lines.extend(["", "Strongest k-sensitivity (`|delta_mean_s_k11_k5|`):"])
lines.extend(_top3_bullets(sensitivity_mean, "delta_mean_s_k11_k5"))
lines.extend(["", "Strongest k-sensitivity (`|delta_max_a_k11_k5|`):"])
lines.extend(_top3_bullets(sensitivity_maxa, "delta_max_a_k11_k5"))
lines.extend([
    "",
    "Caveats:",
    "- Labels are relative to this 20-book corpus and current embedding/signal settings.",
    "- Peak extraction uses top-3 acceleration peaks and minimum separation defaults from the pipeline.",
    "- Interpretive statements are descriptive and should be validated with additional settings/checks.",
    "",
    "## Per-Novel Interpretations",
    "",
])

for _, row in report_df.sort_values("book_id").iterrows():
    book_id = int(row["book_id"])
    title = str(row["title"])
    processed_dir = str(row["processed_dir"])

    # Use the same helper that named the saved PNG, so the link still resolves
    # when processed_dir contains characters that get sanitised.
    fig_name = build_figure_filename(book_id, processed_dir)
    fig_rel = f"../outputs/eda/novel_stacks/figures/{fig_name}"

    novelty_level = classify_level(float(row["mean_s_k7"]), q_mean_lo, q_mean_hi)
    accel_level = classify_level(float(row["max_a_k7"]), q_acc_lo, q_acc_hi)
    peak_profile = classify_peak_timing(row)
    k_pattern = classify_k_dependence(row)
    standout = standout_sentence(novelty_level, accel_level, k_pattern, row)

    lines.append(f"### [{book_id}] {title}")
    lines.append("")
    # Plain f-string: the earlier `f"...(%s)" % fig_rel` form treated any "%"
    # in the title as a format directive.
    lines.append(f"![{title} stacked Twist Signal]({fig_rel})")
    lines.append("")
    lines.append(f"- Novelty level (k=7 mean_s={row['mean_s_k7']:.3f}): **{novelty_level}**")
    lines.append(f"- Acceleration/volatility level (k=7 max_a={row['max_a_k7']:.3f}): **{accel_level}**")
    lines.append(f"- Peak timing profile (k=7): {peak_profile}")
    lines.append(f"- k-dependence pattern: {k_pattern} (delta_mean_s={row['delta_mean_s_k11_k5']:.3f}, delta_max_a={row['delta_max_a_k11_k5']:.3f})")
    lines.append(f"- What stands out: {standout}")
    lines.append("")

markdown_text = "\n".join(lines) + "\n"
DOC_PATH.write_text(markdown_text, encoding="utf-8")

print(f"Saved interpretation markdown: {DOC_PATH}")
print("Preview:")
print("\n".join(lines[:32]))


## 7) Validation Checks

In [None]:
# Re-read everything from disk so the checks cover what was actually written.
fig_paths = sorted(p for p in FIG_DIR.glob("*.png") if p.is_file())

stats_df_check = pd.read_csv(TABLE_DIR / "novel_stacked_stats.csv")
manifest_df_check = pd.read_csv(TABLE_DIR / "novel_stacked_manifest.csv")
highlights_df_check = pd.read_csv(TABLE_DIR / "novel_stacked_highlights.csv")

checks = []
expected_books = int(metadata_df["id"].nunique())


def _record_check(name, expected, actual):
    """Append one check row; it passes when `actual` equals `expected`."""
    checks.append({"check": name, "expected": expected, "actual": actual, "pass": actual == expected})


_record_check("book_count_is_20", 20, expected_books)
_record_check("figure_count", expected_books, len(fig_paths))
_record_check("stats_rows", expected_books * len(K_VALUES), len(stats_df_check))
_record_check("manifest_rows", expected_books, len(manifest_df_check))
_record_check("highlights_rows", expected_books, len(highlights_df_check))

link_prefix = "../outputs/eda/novel_stacks/figures/"
interp_text = DOC_PATH.read_text(encoding="utf-8")
section_count = interp_text.count("### [")
image_count = interp_text.count(link_prefix)
_record_check("interpretation_sections", expected_books, int(section_count))
_record_check("interpretation_image_links", expected_books, int(image_count))

# Verify image links resolve.
missing_links = []
for line in interp_text.splitlines():
    if link_prefix not in line:
        continue
    # Image lines look like `![<title> ...](<path>)`. Anchor on the last "]("
    # so parentheses inside the title are not mistaken for the link start.
    rel = line.rsplit("](", 1)[-1].rsplit(")", 1)[0]
    target = (DOC_PATH.parent / rel).resolve()
    if not target.exists():
        missing_links.append(str(target))

_record_check("image_link_targets_exist", 0, len(missing_links))

validation_df = pd.DataFrame(checks)
validation_df.to_csv(TABLE_DIR / "novel_stacked_validation_checks.csv", index=False)

if not validation_df["pass"].all():
    failed = validation_df[~validation_df["pass"]]
    raise RuntimeError(f"Validation failed:\n{failed}")

print("All validation checks passed.")
display(validation_df)


## 8) Outputs
Generated outputs are grouped and cleaned under:
- `outputs/eda/novel_stacks/figures/` (20 stacked PNGs)
- `outputs/eda/novel_stacks/tables/` (stats, manifest, highlights, validation)

Generated interpretation markdown:
- `docs/NOVEL_STACKED_OUTPUT_INTERPRETATION.md`
