# 08 Excitement Label Variant Analysis

Compares three LLM excitement label variants and deep-dives on `indep_winsize_5`.

Variants:
- `base` -> `label.npy`
- `winsize_5` -> `label_winsize_5.npy`
- `indep_winsize_5` -> `label_indep_winsize_5.npy`

This run uses `MA_WINDOW=5` for all moving-average presentation outputs.


In [1]:
from __future__ import annotations

from pathlib import Path
from itertools import combinations
import json

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# --- Reproducibility -------------------------------------------------------
# Seed both the legacy global NumPy RNG and a dedicated Generator instance.
SEED = 42
np.random.seed(SEED)
rng = np.random.default_rng(SEED)

# --- Input locations (relative to the current working directory) -----------
PROJECT_ROOT = Path('.').resolve()
DATA_DIR = PROJECT_ROOT / 'data'
METADATA_PATH = DATA_DIR / 'metadata.csv'
PROCESSED_DIR = DATA_DIR / 'processed'
SPLIT_PATH = PROJECT_ROOT / 'outputs' / 'excitement_linear' / 'tables' / 'split_manifest.csv'

# --- Output locations (created eagerly so later cells can write freely) ----
OUT_ROOT = PROJECT_ROOT / 'outputs' / 'excitement_variant_analysis'
FIG_DIR, TABLE_DIR, MODEL_DIR = (OUT_ROOT / name for name in ('figures', 'tables', 'model'))
for d in (OUT_ROOT, FIG_DIR, TABLE_DIR, MODEL_DIR):
    d.mkdir(parents=True, exist_ok=True)

# --- Label variants: variant name -> label file inside each processed dir --
VARIANT_FILES = dict(
    base='label.npy',
    winsize_5='label_winsize_5.npy',
    indep_winsize_5='label_indep_winsize_5.npy',
)
VARIANTS = list(VARIANT_FILES)

# --- Training hyper-parameters and presentation settings --------------------
EPOCHS, BATCH_SIZE = 200, 4096
LR, WEIGHT_DECAY = 1e-2, 1e-4
EPS = 1e-8       # threshold below which a feature std is treated as zero
MA_WINDOW = 5    # moving-average window used for smoothed outputs

# --- Run banner --------------------------------------------------------------
print(f'PROJECT_ROOT={PROJECT_ROOT}')
print(f'OUT_ROOT={OUT_ROOT}')
print('Variants:', VARIANT_FILES)
print(f'SEED={SEED}, EPOCHS={EPOCHS}, BATCH_SIZE={BATCH_SIZE}, LR={LR}, WEIGHT_DECAY={WEIGHT_DECAY}, MA_WINDOW={MA_WINDOW}')


PROJECT_ROOT=/Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis
OUT_ROOT=/Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/outputs/excitement_variant_analysis
Variants: {'base': 'label.npy', 'winsize_5': 'label_winsize_5.npy', 'indep_winsize_5': 'label_indep_winsize_5.npy'}
SEED=42, EPOCHS=200, BATCH_SIZE=4096, LR=0.01, WEIGHT_DECAY=0.0001, MA_WINDOW=5


In [2]:
def normalize_label_shape(y: np.ndarray, label_path: Path) -> np.ndarray:
    """Return the labels as a 1-D array.

    A 1-D input is returned as-is (after ``np.asarray``). A 2-D input is
    accepted only when one of its dimensions has size 1, in which case it is
    flattened. ``label_path`` is used solely for the error message.

    Raises:
        ValueError: for any other shape (0-D, general 2-D, or higher rank).
    """
    arr = np.asarray(y)
    if arr.ndim == 1:
        return arr

    is_column_or_row = arr.ndim == 2 and 1 in arr.shape
    if not is_column_or_row:
        raise ValueError(f'Unsupported label shape {arr.shape} at {label_path}')
    return arr.reshape(-1)


def moving_average_1d(x: np.ndarray, window: int) -> np.ndarray:
    """Centered moving average of a flattened float64 copy of ``x``.

    ``window <= 1`` means "no smoothing" and returns a copy of the flattened
    input. Otherwise ``window`` must be odd. Near the edges the average is
    taken over whatever part of the window exists (``min_periods=1``), so the
    output has the same length as the input.

    Raises:
        ValueError: if ``window`` is even and greater than 1.
    """
    values = np.asarray(x, dtype=np.float64).reshape(-1)

    smoothing_requested = window > 1
    if not smoothing_requested:
        return values.copy()
    if window % 2 == 0:
        raise ValueError(f'MA window must be odd; got {window}')

    rolled = pd.Series(values).rolling(window=window, center=True, min_periods=1)
    return rolled.mean().to_numpy()


def metric_dict(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute regression metrics between flattened targets and predictions.

    Returns a dict with keys ``mse``, ``rmse``, ``mae``, ``r2`` and ``corr``
    (all Python floats). ``r2`` is NaN when the targets have zero variance and
    ``corr`` is NaN when there are fewer than two samples.
    """
    truth = np.asarray(y_true, dtype=np.float64).reshape(-1)
    guess = np.asarray(y_pred, dtype=np.float64).reshape(-1)
    residual = guess - truth

    mse = float(np.mean(residual ** 2))

    # R^2 is undefined when the targets carry no variance.
    total_ss = float(np.sum((truth - np.mean(truth)) ** 2))
    if total_ss > 0:
        r2 = float(1 - np.sum(residual ** 2) / total_ss)
    else:
        r2 = np.nan

    # Pearson correlation needs at least two points.
    if len(truth) > 1:
        corr = float(np.corrcoef(truth, guess)[0, 1])
    else:
        corr = np.nan

    return {
        'mse': mse,
        'rmse': float(np.sqrt(mse)),
        'mae': float(np.mean(np.abs(residual))),
        'r2': r2,
        'corr': corr,
    }


def iter_minibatches(n: int, batch_size: int, rng: np.random.Generator):
    """Yield index arrays covering a random permutation of ``range(n)``.

    One permutation is drawn from ``rng`` per call; consecutive slices of at
    most ``batch_size`` indices are yielded, the last one possibly shorter.
    """
    order = rng.permutation(n)
    yield from (order[lo:lo + batch_size] for lo in range(0, n, batch_size))


def winsize_5_constancy_violations(x: np.ndarray, block_size: int = 5) -> tuple[int, int]:
    """Count consecutive blocks whose values are not all identical.

    The flattened input is cut into non-overlapping blocks of ``block_size``
    elements starting at index 0 (the final block may be shorter). Blocks with
    fewer than two elements cannot vary and are not counted at all.

    Args:
        x: label sequence; flattened before checking.
        block_size: number of consecutive elements expected to share one
            value. Defaults to 5, the window size of the ``winsize_5`` labels.

    Returns:
        ``(violations, total_blocks)`` where ``total_blocks`` is the number of
        blocks with at least two elements and ``violations`` is how many of
        those contain more than one distinct value.

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError(f'block_size must be >= 1; got {block_size}')

    values = np.asarray(x).reshape(-1)
    violations = 0
    total_blocks = 0
    for start in range(0, len(values), block_size):
        block = values[start:start + block_size]
        if len(block) < 2:
            # A lone trailing element is trivially constant; skip it.
            continue
        total_blocks += 1
        if np.any(block != block[0]):
            violations += 1
    return violations, total_blocks


def required_output_files(test_ids: list[int], train_ids: list[int]) -> list[Path]:
    """List every artifact path this analysis is expected to produce.

    The result contains, in order: the fixed tables, the fixed figures, the
    per-variant model weight files, ``insights.md``, then one overlay figure
    per id in ``test_ids`` followed by one per id in ``train_ids``.
    """
    table_names = (
        'integrity_checks.csv',
        'split_manifest_used.csv',
        'label_distribution_by_variant.csv',
        'variant_pairwise_agreement_global.csv',
        'variant_pairwise_agreement_per_book.csv',
        'model_global_metrics_by_variant.csv',
        'model_per_novel_metrics_by_variant.csv',
        'indep_winsize_5_support_stats.csv',
    )
    figure_names = (
        'labels_grid_base.png',
        'labels_grid_winsize_5.png',
        'labels_grid_indep_winsize_5.png',
        'label_overlay_normpos_by_variant.png',
        'label_distribution_by_variant.png',
        'variant_pairwise_agreement_bar.png',
        'train_loss_curves_by_variant.png',
        'model_metric_comparison_by_variant.png',
        'indep_prediction_scatter_train_test.png',
        'indep_residual_hist_train_test.png',
        'indep_mae_raw_vs_moving_average.png',
    )
    model_names = (
        'linear_weights_base.npz',
        'linear_weights_winsize_5.npz',
        'linear_weights_indep_winsize_5.npz',
    )

    files = [TABLE_DIR / name for name in table_names]
    files += [FIG_DIR / name for name in figure_names]
    files += [MODEL_DIR / name for name in model_names]
    files.append(OUT_ROOT / 'insights.md')

    # Per-novel overlay figures depend on which books were selected.
    files += [FIG_DIR / f'indep_novel_overlay_test_{bid}.png' for bid in test_ids]
    files += [FIG_DIR / f'indep_novel_overlay_train_{bid}.png' for bid in train_ids]
    return files


In [3]:
# Load metadata + split manifest, validate the manifest, and record split-level integrity checks
meta = pd.read_csv(METADATA_PATH)
split_df = pd.read_csv(SPLIT_PATH)

# The manifest must expose these columns before anything else is attempted.
required_split_cols = {'book_id', 'title', 'processed_dir', 'T', 'split'}
missing_split_cols = required_split_cols - set(split_df.columns)
if missing_split_cols:
    raise ValueError(f'Split manifest missing columns: {missing_split_cols}')

# Normalise dtypes on a fresh frame so the ids compare cleanly against metadata ids.
split_df = split_df.assign(
    book_id=split_df['book_id'].astype(int),
    split=split_df['split'].astype(str),
)
if set(split_df['split']) - {'train', 'test'}:
    raise ValueError('Split manifest has invalid split labels')

is_train = split_df['split'] == 'train'
is_test = split_df['split'] == 'test'
train_ids = sorted(split_df.loc[is_train, 'book_id'].tolist())
test_ids = sorted(split_df.loc[is_test, 'book_id'].tolist())

# Split-level integrity rows (recorded, not enforced, at this point).
split_is_disjoint = set(train_ids).isdisjoint(test_ids)
n_distinct_books = len(set(train_ids + test_ids))
integrity_rows = [
    {'check': 'split_train_count', 'expected': 16, 'actual': len(train_ids), 'pass': len(train_ids) == 16},
    {'check': 'split_test_count', 'expected': 4, 'actual': len(test_ids), 'pass': len(test_ids) == 4},
    {'check': 'split_no_overlap', 'expected': True, 'actual': split_is_disjoint, 'pass': split_is_disjoint},
    {'check': 'split_total_books', 'expected': 20, 'actual': n_distinct_books, 'pass': n_distinct_books == 20},
]

# Persist the exact manifest this run used.
split_df.sort_values(['split', 'book_id']).to_csv(TABLE_DIR / 'split_manifest_used.csv', index=False)

# Build one payload per row of metadata.csv: embeddings, split label and every label variant.
payloads = []
for r in meta.itertuples(index=False):
    # metadata.csv is read through its `id`, `processed_dir` and `title` columns.
    book_id = int(r.id)
    processed_dir = str(r.processed_dir)
    title = str(r.title)
    pdir = PROCESSED_DIR / processed_dir

    # Embeddings must exist and be a 2-D array; its first axis length T is what labels are aligned to.
    emb_path = pdir / 'embeddings.npy'
    if not emb_path.exists():
        raise FileNotFoundError(f'Missing embeddings: {emb_path}')
    X = np.load(emb_path)
    if X.ndim != 2:
        raise ValueError(f'Expected 2D embeddings at {emb_path}, got {X.shape}')
    T, D = X.shape

    # Every metadata book must appear exactly once in the split manifest.
    split_match = split_df[split_df['book_id'] == book_id]
    if len(split_match) != 1:
        raise ValueError(f'Expected exactly one split row for book_id={book_id}, got {len(split_match)}')
    split_label = split_match.iloc[0]['split']

    labels = {}
    for variant, fname in VARIANT_FILES.items():
        path = pdir / fname
        if not path.exists():
            raise FileNotFoundError(f'Missing label file for variant={variant}: {path}')

        y_raw = np.load(path)
        y = normalize_label_shape(y_raw, path).astype(np.float64)

        # Three per-file checks: values in [0, 4], integer-valued, and one label per embedding row.
        in_range = bool(np.all((y >= 0) & (y <= 4)))
        int_like = bool(np.allclose(y, np.round(y)))
        align = int(len(y)) == int(T)

        # The integrity row is appended BEFORE raising so a failing check is present in
        # integrity_rows at the moment the exception is raised.
        integrity_rows.append({
            'check': f'book_{book_id}_{variant}_alignment',
            'expected': 'len==T, integer-like labels in [0,4]',
            'actual': f'len={len(y)}, T={T}, min={float(y.min()):.3f}, max={float(y.max()):.3f}, int_like={int_like}',
            'pass': bool(align and in_range and int_like),
        })

        # Fail fast: later cells assume aligned, in-range, integer-valued labels.
        if not align:
            raise ValueError(f'Length mismatch for {path}: len(y)={len(y)} vs T={T}')
        if not in_range:
            raise ValueError(f'Out-of-range labels for {path}')
        if not int_like:
            raise ValueError(f'Non-integer-like labels for {path}')

        labels[variant] = y

    payloads.append({
        'book_id': book_id,
        'title': title,
        'processed_dir': processed_dir,
        'split': split_label,
        'T': int(T),
        'D': int(D),
        'X': X.astype(np.float64),
        'labels': labels,
    })

# Deterministic ordering by book id; this run expects exactly 20 books.
payloads = sorted(payloads, key=lambda x: x['book_id'])
if len(payloads) != 20:
    raise ValueError(f'Expected 20 books in payloads, found {len(payloads)}')

print('Loaded books:', len(payloads))
print('Embedding dim:', payloads[0]['D'])
print('Train/Test books:', len(train_ids), len(test_ids))


Loaded books: 20
Embedding dim: 768
Train/Test books: 16 4


In [4]:
# Cross-variant descriptive tables + figures

# 1) Label distribution by variant (global + per-book)
def _label_distribution_rows(y, variant, scope, book_id=np.nan, processed_dir=np.nan, title=np.nan):
    """Return one count/proportion row per label value 0..4 for a label array.

    ``y`` is cast to int before counting. Label values that never occur get a
    count of 0. ``book_id``, ``processed_dir`` and ``title`` default to NaN,
    which is what the pooled ('global') rows store in those columns.
    """
    y = np.asarray(y).astype(int)
    vals, cnts = np.unique(y, return_counts=True)
    total = int(len(y))
    counts_map = {int(v): int(c) for v, c in zip(vals, cnts)}
    rows = []
    for label in range(5):
        c = counts_map.get(label, 0)
        rows.append({
            'variant': variant,
            'label': int(label),
            'count': int(c),
            'proportion': float(c / total if total > 0 else np.nan),
            'scope': scope,
            'book_id': book_id,
            'processed_dir': processed_dir,
            'title': title,
        })
    return rows


dist_rows = []

# Global rows: all books pooled, one group of five rows per variant.
for variant in VARIANTS:
    all_y = np.concatenate([p['labels'][variant] for p in payloads])
    dist_rows.extend(_label_distribution_rows(all_y, variant, 'global'))

# Per-book rows: same counting, tagged with the book's identifiers.
for p in payloads:
    for variant in VARIANTS:
        dist_rows.extend(_label_distribution_rows(
            p['labels'][variant],
            variant,
            'book',
            book_id=int(p['book_id']),
            processed_dir=p['processed_dir'],
            title=p['title'],
        ))

label_dist_df = pd.DataFrame(dist_rows)
label_dist_df.to_csv(TABLE_DIR / 'label_distribution_by_variant.csv', index=False)

# 2) Pairwise agreement (global + per-book)
pairs = list(combinations(VARIANTS, 2))
pair_global_rows = []
pair_book_rows = []

for va, vb in pairs:
    # Pooled comparison: concatenate every book's labels for both variants.
    a = np.concatenate([p['labels'][va] for p in payloads])
    b = np.concatenate([p['labels'][vb] for p in payloads])
    global_row = {'variant_a': va, 'variant_b': vb}
    global_row['mae'] = float(np.mean(np.abs(a - b)))
    global_row['exact_match'] = float(np.mean(a == b))
    global_row['corr'] = float(np.corrcoef(a, b)[0, 1])
    global_row['n_samples'] = int(len(a))
    pair_global_rows.append(global_row)

    # Same statistics, one row per book.
    for p in payloads:
        ya, yb = p['labels'][va], p['labels'][vb]
        if len(ya) > 1:
            book_corr = float(np.corrcoef(ya, yb)[0, 1])
        else:
            book_corr = np.nan  # correlation needs at least two chunks
        pair_book_rows.append({
            'book_id': int(p['book_id']),
            'processed_dir': p['processed_dir'],
            'variant_a': va,
            'variant_b': vb,
            'mae': float(np.mean(np.abs(ya - yb))),
            'exact_match': float(np.mean(ya == yb)),
            'corr': book_corr,
            'T': int(p['T']),
        })

pair_global_df = pd.DataFrame(pair_global_rows)
pair_book_df = pd.DataFrame(pair_book_rows)
for frame, fname in (
    (pair_global_df, 'variant_pairwise_agreement_global.csv'),
    (pair_book_df, 'variant_pairwise_agreement_per_book.csv'),
):
    frame.to_csv(TABLE_DIR / fname, index=False)

# 3) winsize_5 constancy integrity: sum violations / checked blocks over all books
per_book_constancy = [
    winsize_5_constancy_violations(p['labels']['winsize_5']) for p in payloads
]
viol_total = sum(v for v, _ in per_book_constancy)
block_total = sum(n_blocks for _, n_blocks in per_book_constancy)

constancy_row = {
    'check': 'winsize_5_block_constancy',
    'expected': '0 violations',
    'actual': f'{viol_total}/{block_total}',
    'pass': viol_total == 0,
}
integrity_rows.append(constancy_row)

# Figures: 20-book grid per variant (one PNG per variant, one panel per book)
nrows, ncols = 5, 4  # 5 x 4 = 20 panels
for variant in VARIANTS:
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(22, 18), sharey=True)
    axes = axes.flatten()
    # zip() pairs panels with books in payload order and stops at the shorter of the two.
    for ax, p in zip(axes, payloads):
        y = p['labels'][variant]
        t = np.arange(len(y))  # chunk index on the x-axis
        ax.plot(t, y, linewidth=0.8, alpha=0.9)
        # Title is truncated to 30 characters to fit the panel header.
        ax.set_title(f"{p['book_id']} | {p['title'][:30]}", fontsize=9)
        ax.set_ylim(-0.2, 4.2)  # labels are validated to lie in [0, 4]; small margin added
        ax.grid(alpha=0.2)
    # Hide any panels left over when there are fewer books than panels.
    for ax in axes[len(payloads):]:
        ax.axis('off')
    fig.suptitle(f'LLM Excitement Label per Chunk ({variant})', fontsize=16)
    fig.tight_layout()
    fig.savefig(FIG_DIR / f'labels_grid_{variant}.png', dpi=180, bbox_inches='tight')
    plt.close(fig)  # release the figure before creating the next one

# Figure: normalized-position overlay by variant (3 panels)
# Each book is rescaled to x in [0, 1] so books of different length can be overlaid.
grid = np.linspace(0, 1, 201)  # common grid used to average the books
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14, 14), sharex=True, sharey=True)
for ax, variant in zip(axes, VARIANTS):
    curves = []
    for p in payloads:
        y = p['labels'][variant]
        x = np.linspace(0, 1, len(y))
        ax.plot(x, y, alpha=0.08, linewidth=1.0)  # faint per-book trace
        # Linear interpolation onto the common grid so curves can be stacked.
        curves.append(np.interp(grid, x, y))
    # Unweighted mean across books at each grid position.
    mean_curve = np.mean(np.vstack(curves), axis=0)
    ax.plot(grid, mean_curve, color='black', linewidth=2.2, label='mean profile')
    ax.set_title(f'Overlay by normalized position: {variant}')
    ax.set_ylabel('Excitement')
    ax.grid(alpha=0.2)
    ax.legend(loc='upper right')
axes[-1].set_xlabel('Normalized position in novel')
fig.tight_layout()
fig.savefig(FIG_DIR / 'label_overlay_normpos_by_variant.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# Figure: global label distribution by variant (stacked bars, one bar per variant)
global_dist = label_dist_df[label_dist_df['scope'] == 'global'].copy()
# Rows = variants, columns = label values, cells = chunk counts.
pivot = global_dist.pivot_table(index='variant', columns='label', values='count', aggfunc='sum').fillna(0)
# Force the variant order used everywhere else and the full 0..4 label set.
pivot = pivot.reindex(index=VARIANTS, columns=[0,1,2,3,4])
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#4c78a8', '#f58518', '#54a24b', '#e45756', '#72b7b2']
bottom = np.zeros(len(pivot))  # running stack height per variant
for i, label in enumerate([0,1,2,3,4]):
    vals = pivot[label].to_numpy()
    ax.bar(np.arange(len(pivot)), vals, bottom=bottom, label=f'label {label}', color=colors[i], alpha=0.9)
    bottom += vals
ax.set_xticks(np.arange(len(pivot)))
ax.set_xticklabels(pivot.index)
ax.set_ylabel('Chunk count')
ax.set_title('Global label distribution by variant')
# Legend is placed above the axes so it does not cover the bars.
ax.legend(ncol=5, loc='upper center', bbox_to_anchor=(0.5, 1.15))
ax.grid(axis='y', alpha=0.25)
fig.tight_layout()
fig.savefig(FIG_DIR / 'label_distribution_by_variant.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# Figure: pairwise agreement bar chart (one panel per metric, one bar per variant pair)
pair_plot = pair_global_df.copy()
pair_plot['pair'] = pair_plot['variant_a'] + ' vs ' + pair_plot['variant_b']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4.5))
# (column in pair_global_df, panel title)
metrics = [('mae', 'MAE'), ('exact_match', 'Exact match'), ('corr', 'Correlation')]
for ax, (col, title) in zip(axes, metrics):
    # Three colours: one per pair produced by combinations() over the three variants.
    ax.bar(pair_plot['pair'], pair_plot[col], color=['#4c78a8', '#f58518', '#54a24b'])
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=20)
    ax.grid(axis='y', alpha=0.2)
fig.suptitle('Global pairwise agreement across label variants', y=1.02)
fig.tight_layout()
fig.savefig(FIG_DIR / 'variant_pairwise_agreement_bar.png', dpi=180, bbox_inches='tight')
plt.close(fig)

print('Saved descriptive tables and figures.')


Saved descriptive tables and figures.


In [5]:
# Train linear models for all variants
# For each label variant: fit a linear regressor on standardized embeddings with
# mini-batch gradient descent, evaluate globally and per book, and save weights.

model_histories = {}      # variant -> per-epoch train/test MSE
model_cache = {}          # variant -> fitted parameters + cached predictions
global_rows = []          # one row per (variant, split)
per_book_rows = []        # one row per (variant, book)
indep_support_rows = []   # extra per-book statistics, indep_winsize_5 only

train_id_set = set(train_ids)

for variant in VARIANTS:
    # Build train/test arrays
    X_train_list, y_train_list = [], []
    X_test_list, y_test_list = [], []

    for p in payloads:
        y = p['labels'][variant].reshape(-1)
        if p['split'] == 'train':
            X_train_list.append(p['X'])
            y_train_list.append(y)
        else:
            X_test_list.append(p['X'])
            y_test_list.append(y)

    # Stack books along the sample axis; targets become column vectors.
    X_train = np.vstack(X_train_list)
    y_train = np.concatenate(y_train_list).reshape(-1, 1)
    X_test = np.vstack(X_test_list)
    y_test = np.concatenate(y_test_list).reshape(-1, 1)

    # Standardize with TRAIN statistics only; near-constant features (std < EPS)
    # are divided by 1 instead to avoid blowing up.
    x_mean = X_train.mean(axis=0, keepdims=True)
    x_std = X_train.std(axis=0, keepdims=True)
    x_std_safe = np.where(x_std < EPS, 1.0, x_std)

    X_train_n = (X_train - x_mean) / x_std_safe
    X_test_n = (X_test - x_mean) / x_std_safe

    # Small random weight init drawn from the shared `rng`; bias starts at zero.
    # NOTE: `rng` is shared across variants, so results depend on the order of VARIANTS.
    D = X_train_n.shape[1]
    W = rng.normal(loc=0.0, scale=0.01, size=(D, 1))
    b = np.zeros((1,), dtype=np.float64)

    train_loss_hist, test_loss_hist = [], []

    for epoch in range(EPOCHS):
        # One pass over a fresh random permutation of the training rows.
        for batch_idx in iter_minibatches(X_train_n.shape[0], BATCH_SIZE, rng):
            Xb = X_train_n[batch_idx]
            yb = y_train[batch_idx]

            pred = Xb @ W + b
            err = pred - yb

            # Gradient of batch MSE plus the L2 term WEIGHT_DECAY * ||W||^2 (bias is not penalized).
            grad_W = (2.0 / len(Xb)) * (Xb.T @ err) + 2.0 * WEIGHT_DECAY * W
            grad_b = (2.0 / len(Xb)) * np.sum(err, axis=0)

            # In-place plain gradient-descent step.
            W -= LR * grad_W
            b -= LR * grad_b

        # Track full-dataset MSE (without the penalty term) after every epoch.
        train_pred_epoch = (X_train_n @ W + b).reshape(-1)
        test_pred_epoch = (X_test_n @ W + b).reshape(-1)
        train_loss_hist.append(float(np.mean((train_pred_epoch - y_train.reshape(-1)) ** 2)))
        test_loss_hist.append(float(np.mean((test_pred_epoch - y_test.reshape(-1)) ** 2)))

    # Final predictions and pooled metrics per split.
    train_pred = (X_train_n @ W + b).reshape(-1)
    test_pred = (X_test_n @ W + b).reshape(-1)

    train_m = metric_dict(y_train.reshape(-1), train_pred)
    test_m = metric_dict(y_test.reshape(-1), test_pred)

    # MAE on moving-averaged series: smoothing is applied per book so a window
    # never spans the boundary between two novels.
    abs_err_ma = {'train': [], 'test': []}

    # per-book metrics + cache per-book predictions
    per_book_pred = {}
    for p in payloads:
        # Same train-derived standardization as used for fitting.
        Xp = (p['X'] - x_mean) / x_std_safe
        y_true = p['labels'][variant].reshape(-1)
        y_pred = (Xp @ W + b).reshape(-1)
        per_book_pred[p['book_id']] = y_pred

        y_true_ma = moving_average_1d(y_true, MA_WINDOW)
        y_pred_ma = moving_average_1d(y_pred, MA_WINDOW)
        mae_ma = float(np.mean(np.abs(y_pred_ma - y_true_ma)))

        # Keep the per-chunk smoothed errors so the split-level mean is weighted by chunk count.
        split = p['split']
        abs_err_ma[split].append(np.abs(y_pred_ma - y_true_ma))

        m = metric_dict(y_true, y_pred)
        per_book_rows.append({
            'variant': variant,
            'book_id': int(p['book_id']),
            'title': p['title'],
            'processed_dir': p['processed_dir'],
            'split': split,
            'T': int(p['T']),
            'mse': m['mse'],
            'rmse': m['rmse'],
            'mae': m['mae'],
            'r2': m['r2'],
            'corr': m['corr'],
            'mae_ma': mae_ma,
        })

        # Additional prediction/residual summary statistics for the focus variant only.
        if variant == 'indep_winsize_5':
            res = y_pred - y_true
            indep_support_rows.append({
                'book_id': int(p['book_id']),
                'split': split,
                'T': int(p['T']),
                'y_true_mean': float(np.mean(y_true)),
                'y_true_std': float(np.std(y_true)),
                'y_pred_mean': float(np.mean(y_pred)),
                'y_pred_std': float(np.std(y_pred)),
                'pred_min': float(np.min(y_pred)),
                'pred_max': float(np.max(y_pred)),
                'res_mean': float(np.mean(res)),
                'res_std': float(np.std(res)),
                'res_p05': float(np.quantile(res, 0.05)),
                'res_p95': float(np.quantile(res, 0.95)),
                'corr_true_pred': float(np.corrcoef(y_true, y_pred)[0, 1]) if len(y_true) > 1 else np.nan,
                'title': p['title'],
            })

    # Pool smoothed absolute errors per split; a split with no books yields NaN.
    train_abs_ma = np.concatenate(abs_err_ma['train']) if len(abs_err_ma['train']) else np.array([np.nan])
    test_abs_ma = np.concatenate(abs_err_ma['test']) if len(abs_err_ma['test']) else np.array([np.nan])

    # n_novels comes from the split manifest id lists, n_samples from the stacked arrays.
    global_rows.extend([
        {
            'variant': variant,
            'split': 'train',
            'n_samples': int(len(y_train)),
            'n_novels': int(len(train_ids)),
            'mse': train_m['mse'],
            'rmse': train_m['rmse'],
            'mae': train_m['mae'],
            'r2': train_m['r2'],
            'corr': train_m['corr'],
            'ma_window': MA_WINDOW,
            'mae_ma': float(np.mean(train_abs_ma)),
        },
        {
            'variant': variant,
            'split': 'test',
            'n_samples': int(len(y_test)),
            'n_novels': int(len(test_ids)),
            'mse': test_m['mse'],
            'rmse': test_m['rmse'],
            'mae': test_m['mae'],
            'r2': test_m['r2'],
            'corr': test_m['corr'],
            'ma_window': MA_WINDOW,
            'mae_ma': float(np.mean(test_abs_ma)),
        },
    ])

    # Persist weights, the standardization statistics (x_std is the "safe" version
    # actually used for division) and the hyper-parameters of this run.
    np.savez(
        MODEL_DIR / f'linear_weights_{variant}.npz',
        W=W.astype(np.float32),
        b=b.astype(np.float32),
        x_mean=x_mean.reshape(-1).astype(np.float32),
        x_std=x_std_safe.reshape(-1).astype(np.float32),
        seed=np.array([SEED], dtype=np.int32),
        lr=np.array([LR], dtype=np.float32),
        epochs=np.array([EPOCHS], dtype=np.int32),
        batch_size=np.array([BATCH_SIZE], dtype=np.int32),
        weight_decay=np.array([WEIGHT_DECAY], dtype=np.float32),
        variant=np.array([variant]),
    )

    # Keep everything the plotting cell needs in memory.
    model_histories[variant] = {
        'train_loss': train_loss_hist,
        'test_loss': test_loss_hist,
    }
    model_cache[variant] = {
        'W': W,
        'b': b,
        'x_mean': x_mean,
        'x_std_safe': x_std_safe,
        'per_book_pred': per_book_pred,
        'y_train_true': y_train.reshape(-1),
        'y_test_true': y_test.reshape(-1),
        'y_train_pred': train_pred,
        'y_test_pred': test_pred,
    }

# Assemble result tables with a stable row order and write them out.
model_global_df = pd.DataFrame(global_rows).sort_values(['variant', 'split']).reset_index(drop=True)
model_per_book_df = pd.DataFrame(per_book_rows).sort_values(['variant', 'split', 'book_id']).reset_index(drop=True)
indep_support_df = pd.DataFrame(indep_support_rows).sort_values(['split', 'book_id']).reset_index(drop=True)

model_global_df.to_csv(TABLE_DIR / 'model_global_metrics_by_variant.csv', index=False)
model_per_book_df.to_csv(TABLE_DIR / 'model_per_novel_metrics_by_variant.csv', index=False)
indep_support_df.to_csv(TABLE_DIR / 'indep_winsize_5_support_stats.csv', index=False)

print('Saved model tables and model weights.')
print(model_global_df)


Saved model tables and model weights.
           variant  split  n_samples  n_novels       mse      rmse       mae  \
0             base   test       3706         4  1.424219  1.193406  1.040990   
1             base  train      17631        16  1.447624  1.203172  1.063871   
2  indep_winsize_5   test       3706         4  0.561263  0.749175  0.612690   
3  indep_winsize_5  train      17631        16  0.528095  0.726702  0.592890   
4        winsize_5   test       3706         4  0.979047  0.989468  0.768356   
5        winsize_5  train      17631        16  0.855820  0.925105  0.724276   

         r2      corr  ma_window    mae_ma  
0  0.136429  0.379810          5  0.460144  
1  0.173382  0.417272          5  0.438537  
2  0.075496  0.306017          5  0.270080  
3  0.193136  0.439557          5  0.244334  
4  0.152135  0.411479          5  0.620573  
5  0.282322  0.531895          5  0.569507  


In [6]:
# Figures from modeling + indep deep dive

# train loss curves by variant: per-epoch train and test MSE, one panel per variant
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(11, 12), sharex=True)
for ax, variant in zip(axes, VARIANTS):
    tr = model_histories[variant]['train_loss']
    te = model_histories[variant]['test_loss']
    # Histories hold one value per epoch, so the x-axis runs 1..EPOCHS.
    ax.plot(np.arange(1, EPOCHS + 1), tr, label='train_mse', linewidth=1.6)
    ax.plot(np.arange(1, EPOCHS + 1), te, label='test_mse', linewidth=1.4)
    ax.set_title(f'Loss curves: {variant}')
    ax.set_ylabel('MSE')
    ax.grid(alpha=0.25)
    ax.legend()
axes[-1].set_xlabel('Epoch')
fig.tight_layout()
fig.savefig(FIG_DIR / 'train_loss_curves_by_variant.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# test metric comparison by variant (2x2 panels: RMSE, MAE, R2, correlation)
# .loc[VARIANTS] reorders the test rows to match the canonical variant order used for the tick labels.
test_metrics = model_global_df[model_global_df['split'] == 'test'].copy().set_index('variant').loc[VARIANTS].reset_index()
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
# (column in model_global_df, panel title)
metric_specs = [('rmse', 'Test RMSE'), ('mae', 'Test MAE'), ('r2', 'Test R2'), ('corr', 'Test Corr')]
for ax, (col, title) in zip(axes.flatten(), metric_specs):
    vals = test_metrics[col].to_numpy()
    ax.bar(np.arange(len(VARIANTS)), vals, color=['#4c78a8', '#f58518', '#54a24b'])
    ax.set_xticks(np.arange(len(VARIANTS)))
    ax.set_xticklabels(VARIANTS, rotation=20)
    ax.set_title(title)
    ax.grid(axis='y', alpha=0.25)
fig.tight_layout()
fig.savefig(FIG_DIR / 'model_metric_comparison_by_variant.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# indep variant diagnostics: pooled train/test targets and predictions cached by the training cell
indep = model_cache['indep_winsize_5']
y_train_true = indep['y_train_true']
y_train_pred = indep['y_train_pred']
y_test_true = indep['y_test_true']
y_test_pred = indep['y_test_pred']

# scatter: predicted vs true, train and test overlaid
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_train_true, y_train_pred, s=8, alpha=0.15, label='train')
ax.scatter(y_test_true, y_test_pred, s=12, alpha=0.30, label='test')
# Identity line spans the joint range of all targets and predictions.
lo = min(float(y_train_true.min()), float(y_test_true.min()), float(y_train_pred.min()), float(y_test_pred.min()))
hi = max(float(y_train_true.max()), float(y_test_true.max()), float(y_train_pred.max()), float(y_test_pred.max()))
ax.plot([lo, hi], [lo, hi], linestyle='--', color='black', linewidth=1.2)
ax.set_xlabel('True excitement')
ax.set_ylabel('Predicted excitement')
ax.set_title('indep_winsize_5: Prediction scatter')
ax.grid(alpha=0.2)
ax.legend()
fig.tight_layout()
fig.savefig(FIG_DIR / 'indep_prediction_scatter_train_test.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# residual hist: distribution of (pred - true) for both splits (raw counts, not normalized)
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(y_train_pred - y_train_true, bins=60, alpha=0.45, label='train')
ax.hist(y_test_pred - y_test_true, bins=60, alpha=0.55, label='test')
ax.set_xlabel('Residual (pred - true)')
ax.set_ylabel('Count')
ax.set_title('indep_winsize_5: Residual distribution')
ax.grid(alpha=0.2)
ax.legend()
fig.tight_layout()
fig.savefig(FIG_DIR / 'indep_residual_hist_train_test.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# raw vs moving-average MAE bars for indep (window taken from MA_WINDOW)
# Rows are forced into train, test order so bar positions match the tick labels.
indep_global = model_global_df[model_global_df['variant'] == 'indep_winsize_5'].copy().set_index('split').loc[['train', 'test']].reset_index()
fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(len(indep_global))
bw = 0.34  # bar width; the two bars of a split sit side by side around x
ax.bar(x - bw / 2, indep_global['mae'], width=bw, label='Raw MAE', alpha=0.9)
ax.bar(x + bw / 2, indep_global['mae_ma'], width=bw, label=f'MA({MA_WINDOW}) MAE', alpha=0.9)
ax.set_xticks(x)
ax.set_xticklabels(indep_global['split'])
ax.set_ylabel('MAE')
# Title follows MA_WINDOW so it always agrees with the legend label above.
ax.set_title(f'indep_winsize_5: Raw vs MA({MA_WINDOW}) MAE')
ax.grid(axis='y', alpha=0.25)
ax.legend()
# Annotate each bar with its value; after reset_index() the row index equals the bar position.
for i, row in indep_global.iterrows():
    ax.text(i - bw / 2, row['mae'] + 0.01, f"{row['mae']:.3f}", ha='center', va='bottom', fontsize=9)
    ax.text(i + bw / 2, row['mae_ma'] + 0.01, f"{row['mae_ma']:.3f}", ha='center', va='bottom', fontsize=9)
fig.tight_layout()
fig.savefig(FIG_DIR / 'indep_mae_raw_vs_moving_average.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# overlays for indep: every test novel + up to 2 train novels (best/worst train RMSE)
by_id = {p['book_id']: p for p in payloads}
indep_per = model_per_book_df[model_per_book_df['variant'] == 'indep_winsize_5'].copy()

# Train books sorted by RMSE: first row is the best fit, last row the worst.
train_rmse = indep_per[indep_per['split'] == 'train'][['book_id', 'rmse']].sort_values('rmse')
train_best = int(train_rmse.iloc[0]['book_id'])
train_worst = int(train_rmse.iloc[-1]['book_id'])
selected_train_overlay_ids = sorted({train_best, train_worst})
# If best and worst resolve to the same id, add the second-ranked book when one
# exists. With a single train row there is no second book, so only one overlay
# is produced instead of indexing past the end of the table.
if len(selected_train_overlay_ids) == 1 and len(train_rmse) > 1:
    selected_train_overlay_ids.append(int(train_rmse.iloc[1]['book_id']))
    selected_train_overlay_ids = sorted(selected_train_overlay_ids)


def _save_indep_overlay(bid, split_name):
    """Plot raw and moving-averaged label vs prediction for one novel and save it.

    ``split_name`` ('test' or 'train') appears in the figure title and in the
    output file name ``indep_novel_overlay_<split_name>_<bid>.png``.
    """
    p = by_id[bid]
    y_true = p['labels']['indep_winsize_5'].reshape(-1)
    y_pred = model_cache['indep_winsize_5']['per_book_pred'][bid].reshape(-1)
    y_true_ma = moving_average_1d(y_true, MA_WINDOW)
    y_pred_ma = moving_average_1d(y_pred, MA_WINDOW)
    raw_mae = float(np.mean(np.abs(y_pred - y_true)))
    ma_mae = float(np.mean(np.abs(y_pred_ma - y_true_ma)))
    t = np.arange(len(y_true))

    fig, ax = plt.subplots(figsize=(12, 4.5))
    # Raw series are drawn faintly; the smoothed series carry the visual weight.
    ax.plot(t, y_true, label='llm_label_raw', linewidth=1.0, alpha=0.30, color='tab:blue')
    ax.plot(t, y_pred, label='linear_pred_raw', linewidth=1.0, alpha=0.30, color='tab:orange')
    ax.plot(t, y_true_ma, label=f'llm_label_MA{MA_WINDOW}', linewidth=2.0, color='tab:blue')
    ax.plot(t, y_pred_ma, label=f'linear_pred_MA{MA_WINDOW}', linewidth=2.0, color='tab:orange')
    ax.set_title(f"indep {split_name} {bid} | {p['title']} | raw MAE={raw_mae:.3f}, MA({MA_WINDOW}) MAE={ma_mae:.3f}")
    ax.set_xlabel('Chunk index')
    ax.set_ylabel('Excitement')
    ax.set_ylim(-0.5, 4.5)
    ax.grid(alpha=0.25)
    ax.legend(ncol=2)
    fig.tight_layout()
    fig.savefig(FIG_DIR / f'indep_novel_overlay_{split_name}_{bid}.png', dpi=180, bbox_inches='tight')
    plt.close(fig)


for bid in test_ids:
    _save_indep_overlay(bid, 'test')

for bid in selected_train_overlay_ids:
    _save_indep_overlay(bid, 'train')

print('Saved modeling figures.')
print('Selected train indep overlays:', selected_train_overlay_ids)


Saved modeling figures.
Selected train indep overlays: [43, 1257]


In [7]:
# Insights markdown + finalize integrity checks

variant_test = model_global_df[model_global_df['split'] == 'test'].copy().sort_values('rmse')
indep_test = model_global_df[(model_global_df['variant'] == 'indep_winsize_5') & (model_global_df['split'] == 'test')].iloc[0]
indep_train = model_global_df[(model_global_df['variant'] == 'indep_winsize_5') & (model_global_df['split'] == 'train')].iloc[0]

pair_global = pd.read_csv(TABLE_DIR / 'variant_pairwise_agreement_global.csv')
label_dist = pd.read_csv(TABLE_DIR / 'label_distribution_by_variant.csv')
indep_dist = label_dist[(label_dist['scope'] == 'global') & (label_dist['variant'] == 'indep_winsize_5')].sort_values('label')

indep_per_test = model_per_book_df[(model_per_book_df['variant'] == 'indep_winsize_5') & (model_per_book_df['split'] == 'test')].sort_values('rmse')

# Assemble the markdown insights report and write it to OUT_ROOT / 'insights.md'.
#
# Every moving-average mention is rendered from MA_WINDOW (as the overlay figure labels
# are), so the report cannot drift from the window actually used. With MA_WINDOW=5 the
# generated text is identical to the previously hard-coded "MA(5)" wording.
_ma_tag = f'MA({MA_WINDOW})'


def _metrics_text(row):
    """Return the shared metrics snippet for one row.

    `row` must support key access for 'rmse', 'mae', 'r2', 'corr' and 'mae_ma';
    each value is rendered with three decimals.
    """
    return (
        f"RMSE={row['rmse']:.3f}, MAE={row['mae']:.3f}, R2={row['r2']:.3f}, "
        f"corr={row['corr']:.3f}, {_ma_tag} MAE={row['mae_ma']:.3f}"
    )


insights_lines = [
    f'# Excitement Variant Analysis Insights (MA(W), current run W={MA_WINDOW})',
    '',
    '## 1) Dataset and Integrity Summary',
    f'- Books analyzed: **{len(payloads)}** (reused split: {len(train_ids)} train / {len(test_ids)} test novels).',
    # NOTE(review): the two bullets below are static text; this cell does not derive
    # them from integrity_rows, so confirm they still hold when inputs change.
    '- All three label variants passed shape, range, and alignment checks (`len(label)==T`, labels in `[0,4]`, integer-like).',
    '- `winsize_5` block-constancy check passed (no within-block variation for 5-chunk blocks).',
    '',
    '## 2) Variant Comparison Findings',
    '- Test split global metrics by variant (lower RMSE/MAE better; higher R2/corr better):',
]

# One indented sub-bullet per variant, in the order of variant_test.
insights_lines.extend(
    f"  - `{row['variant']}`: {_metrics_text(row)}"
    for _, row in variant_test.iterrows()
)

insights_lines += [
    '',
    '- Pairwise label agreement indicates the variants are materially different sources, not trivial rewrites.',
    '- See: `tables/variant_pairwise_agreement_global.csv` and `tables/variant_pairwise_agreement_per_book.csv`.',
    '',
    '## 3) indep_winsize_5 Verdict (Primary Focus)',
    '- Chunk-level reliability: **limited/moderate**. Raw pointwise error remains substantial.',
    f'- Trend-level utility ({_ma_tag}): **useful as a coarse proxy** for trajectory/pacing interpretation.',
    f'- indep global train: {_metrics_text(indep_train)}.',
    f'- indep global test: {_metrics_text(indep_test)}.',
    '',
    '## 4) Book-Level Highlights (indep_winsize_5, test novels)',
]

# One bullet per test novel, in the order of indep_per_test.
insights_lines.extend(
    f"- {int(row['book_id'])} | {row['title']}: {_metrics_text(row)}."
    for _, row in indep_per_test.iterrows()
)

insights_lines += [
    '',
    '## 5) Use Now vs Avoid Now (indep_winsize_5)',
    '- Use now: relative trend profiling, chapter-level pacing summaries, cross-book smoothed trajectory comparison.',
    '- Avoid now: chunk-level absolute scoring, spike-triggered threshold decisions, fine-grained event detection from raw predictions.',
    '',
    '## 6) Next Experiments + Acceptance Criteria',
    '1. Compare Ridge/ElasticNet against current linear GD head on the same split.',
    # Plain (non-f) string on purpose: the braces in `x_{t-1}` must stay literal.
    '2. Add temporal features (`x_t - x_{t-1}`, short rolling context) while keeping linear head.',
    '3. Try imbalance-aware objectives for rare labels.',
    '4. Accept a new model only if both improve: test RMSE and worst-case test-novel RMSE.',
    '',
    '## Provenance',
    '- Figures: `outputs/excitement_variant_analysis/figures/*.png`',
    '- Tables: `outputs/excitement_variant_analysis/tables/*.csv`',
    '- Models: `outputs/excitement_variant_analysis/model/*.npz`',
]

# Newline-joined, with a single trailing newline.
insights_text = '\n'.join(insights_lines) + '\n'
(OUT_ROOT / 'insights.md').write_text(insights_text, encoding='utf-8')

# Final integrity checks: append output-completeness rows to integrity_rows, persist
# the table to TABLE_DIR / 'integrity_checks.csv', and print a short summary.
integrity_path = TABLE_DIR / 'integrity_checks.csv'

# Write current integrity rows first so file-existence completeness can include integrity file itself
pd.DataFrame(integrity_rows).to_csv(integrity_path, index=False)

# Final integrity checks: output completeness
expected_files = required_output_files(test_ids=test_ids, train_ids=selected_train_overlay_ids)
missing = [str(path) for path in expected_files if not path.exists()]

# Scan each overlay pattern exactly once so that `actual` and `pass` in a row are
# always derived from the same directory listing.
n_test_overlays = sum(1 for _ in FIG_DIR.glob('indep_novel_overlay_test_*.png'))
n_train_overlays = sum(1 for _ in FIG_DIR.glob('indep_novel_overlay_train_*.png'))
ma_windows = model_global_df['ma_window']

# NOTE(review): the expected overlay counts (4 test, 2 train) are fixed literals;
# presumably they mirror len(test_ids) and len(selected_train_overlay_ids) -- confirm
# before reusing this cell with a different split or selection.
integrity_rows.extend([
    {
        'check': 'output_files_complete',
        'expected': len(expected_files),
        'actual': len(expected_files) - len(missing),
        'pass': len(missing) == 0,
    },
    {
        'check': 'indep_overlay_test_count',
        'expected': 4,
        'actual': n_test_overlays,
        'pass': n_test_overlays == 4,
    },
    {
        'check': 'indep_overlay_train_count',
        'expected': 2,
        'actual': n_train_overlays,
        'pass': n_train_overlays == 2,
    },
    {
        # `actual` reports the first row's value; `pass` requires every row to equal 5.
        'check': 'ma_window_is_5',
        'expected': 5,
        'actual': int(ma_windows.iloc[0]),
        'pass': bool((ma_windows == 5).all()),
    },
])

# Second write replaces the earlier CSV with the table that includes the new rows.
integrity_df = pd.DataFrame(integrity_rows)
integrity_df.to_csv(integrity_path, index=False)

print('Saved insights markdown:', OUT_ROOT / 'insights.md')
print('Saved integrity checks:', integrity_path)
print('Integrity all-pass:', bool(integrity_df['pass'].all()))
if missing:
    print('Missing files:')
    for m in missing:
        print('-', m)


Saved insights markdown: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/outputs/excitement_variant_analysis/insights.md
Saved integrity checks: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/outputs/excitement_variant_analysis/tables/integrity_checks.csv
Integrity all-pass: True
