# 07 - LLM Excitement Signal + Linear Projection from Embeddings

This notebook:
1. Visualizes `label.npy` excitement signals for all 20 novels.
2. Trains a single-layer linear model (NumPy mini-batch gradient descent, MSE loss with L2 weight decay) from chunk embeddings to excitement.
3. Uses a novel-level deterministic 80:20 split (16 train novels, 4 test novels).
4. Produces train/test diagnostics and per-novel overlay plots.

## Outputs
- `outputs/excitement_linear/figures/labels_all_20_novels_grid.png`
- `outputs/excitement_linear/figures/labels_all_20_novels_overlay_normpos.png`
- `outputs/excitement_linear/figures/train_loss_curve.png`
- `outputs/excitement_linear/figures/prediction_scatter_train_test.png`
- `outputs/excitement_linear/figures/residual_hist_train_test.png`
- `outputs/excitement_linear/figures/mae_raw_vs_moving_average.png`
- `outputs/excitement_linear/figures/novel_overlay_test_{book_id}.png` (4 files)
- `outputs/excitement_linear/figures/novel_overlay_train_{book_id}.png` (2 files)
- `outputs/excitement_linear/tables/split_manifest.csv`
- `outputs/excitement_linear/tables/global_metrics.csv`
- `outputs/excitement_linear/tables/per_novel_metrics.csv`
- `outputs/excitement_linear/tables/presentation_mae.csv`
- `outputs/excitement_linear/tables/integrity_checks.csv`
- `outputs/excitement_linear/model/linear_weights.npz`


In [43]:
from __future__ import annotations

from pathlib import Path
import json

import numpy as np
import pandas as pd

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# --- Reproducibility ---
# A single seed feeds both NumPy's legacy global RNG and the Generator API.
SEED = 42
np.random.seed(SEED)
rng = np.random.default_rng(SEED)

# --- Input locations (resolved against the current working directory) ---
PROJECT_ROOT = Path('.').resolve()
DATA_DIR = PROJECT_ROOT / 'data'
METADATA_PATH = DATA_DIR / 'metadata.csv'
PROCESSED_DIR = DATA_DIR / 'processed'

# --- Output locations (created up front so later cells can write freely) ---
OUT_ROOT = PROJECT_ROOT / 'outputs' / 'excitement_linear'
FIG_DIR, TABLE_DIR, MODEL_DIR = (OUT_ROOT / sub for sub in ('figures', 'tables', 'model'))
for d in (OUT_ROOT, FIG_DIR, TABLE_DIR, MODEL_DIR):
    d.mkdir(parents=True, exist_ok=True)

# --- Optimization defaults ---
EPOCHS, BATCH_SIZE = 200, 4096
LR, WEIGHT_DECAY = 1e-2, 1e-4
EPS = 1e-8  # features whose train std falls below this are left unscaled

# --- Presentation smoothing (centered moving-average window, must be odd) ---
MA_WINDOW = 9

# Echo the effective configuration so it is captured in the cell output.
print(
    f'Project root: {PROJECT_ROOT}',
    f'Processed dir: {PROCESSED_DIR}',
    f'Output root: {OUT_ROOT}',
    f'SEED={SEED}, EPOCHS={EPOCHS}, BATCH_SIZE={BATCH_SIZE}, LR={LR}, WEIGHT_DECAY={WEIGHT_DECAY}, MA_WINDOW={MA_WINDOW}',
    sep='\n',
)


Project root: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis
Processed dir: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/data/processed
Output root: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/outputs/excitement_linear
SEED=42, EPOCHS=200, BATCH_SIZE=4096, LR=0.01, WEIGHT_DECAY=0.0001, MA_WINDOW=9


In [44]:
def normalize_label_shape(y: np.ndarray, label_path: Path) -> np.ndarray:
    """Return the label array as a flat 1-D vector.

    Accepts an already 1-D array, or a 2-D array in which at least one axis
    has length 1 (row or column vector), which is flattened.

    Args:
        y: Array-like loaded from a label file.
        label_path: Source path; used only to build the error message.

    Returns:
        A 1-D array holding the same values as ``y``.

    Raises:
        ValueError: If ``y`` has any other shape.
    """
    labels = np.asarray(y)
    if labels.ndim == 1:
        return labels
    if labels.ndim == 2 and 1 in labels.shape:
        return labels.reshape(-1)
    raise ValueError(f'Unsupported label shape {labels.shape} at {label_path}')


def moving_average_1d(x: np.ndarray, window: int) -> np.ndarray:
    """Centered moving average of a flattened signal.

    Edge positions are averaged over whatever part of the window is
    available (``min_periods=1``), so the output has the same length as the
    flattened input.

    Args:
        x: Array-like signal; converted to float64 and flattened.
        window: Window length. Values ``<= 1`` disable smoothing and return
            a copy of the flattened input.

    Returns:
        1-D float64 array of smoothed values.

    Raises:
        ValueError: If ``window`` is even (and greater than 1), since a
            centered window needs an odd length.
    """
    signal = np.asarray(x, dtype=np.float64).reshape(-1)

    # No smoothing requested: hand back an independent copy.
    if window <= 1:
        return signal.copy()

    if window % 2 == 0:
        raise ValueError(f'MA window must be odd for centered smoothing, got {window}')

    roller = pd.Series(signal).rolling(window=window, center=True, min_periods=1)
    smoothed = roller.mean()
    return smoothed.to_numpy()


def metric_dict(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute regression metrics between targets and predictions.

    Both inputs are converted to float64 and flattened before comparison.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same number of elements as ``y_true``.

    Returns:
        Dict with keys ``'mse'``, ``'rmse'``, ``'mae'`` and ``'r2'`` (in that
        order). ``'r2'`` is NaN when the targets have no variance.
    """
    truth = np.asarray(y_true, dtype=np.float64).reshape(-1)
    guess = np.asarray(y_pred, dtype=np.float64).reshape(-1)
    residual = guess - truth

    mse = float(np.mean(residual ** 2))

    # Total sum of squares around the target mean; zero means R^2 is undefined.
    total_ss = float(np.sum((truth - np.mean(truth)) ** 2))
    r2 = np.nan if total_ss <= 0 else float(1 - np.sum(residual ** 2) / total_ss)

    metrics = {}
    metrics['mse'] = mse
    metrics['rmse'] = float(np.sqrt(mse))
    metrics['mae'] = float(np.mean(np.abs(residual)))
    metrics['r2'] = r2
    return metrics


def iter_minibatches(n: int, batch_size: int, rng: np.random.Generator):
    """Yield index arrays that visit ``range(n)`` once in shuffled order.

    Args:
        n: Number of samples to index.
        batch_size: Maximum number of indices per yielded batch; the last
            batch may be shorter.
        rng: Generator used to draw the permutation (its state advances).

    Yields:
        1-D integer arrays of sample indices.
    """
    order = rng.permutation(n)
    yield from (order[lo:lo + batch_size] for lo in range(0, n, batch_size))


def required_output_files(test_ids: list[int], train_ids: list[int]) -> list[Path]:
    """List every artifact path the notebook is expected to have written.

    Args:
        test_ids: Book ids that should each have a test overlay figure.
        train_ids: Book ids that should each have a train overlay figure.

    Returns:
        Paths in this order: fixed figures, tables, the model file, then the
        per-novel test overlays followed by the train overlays.
    """
    figure_names = (
        'labels_all_20_novels_grid.png',
        'labels_all_20_novels_overlay_normpos.png',
        'train_loss_curve.png',
        'prediction_scatter_train_test.png',
        'residual_hist_train_test.png',
        'mae_raw_vs_moving_average.png',
    )
    table_names = (
        'split_manifest.csv',
        'global_metrics.csv',
        'per_novel_metrics.csv',
        'presentation_mae.csv',
    )

    files = [FIG_DIR / name for name in figure_names]
    files += [TABLE_DIR / name for name in table_names]
    files.append(MODEL_DIR / 'linear_weights.npz')

    # One overlay figure per requested novel, test ids first.
    for bid in test_ids:
        files.append(FIG_DIR / f'novel_overlay_test_{bid}.png')
    for bid in train_ids:
        files.append(FIG_DIR / f'novel_overlay_train_{bid}.png')
    return files


In [45]:
# 1) Load data + fail-fast validation
# Reads metadata.csv, then loads embeddings.npy / label.npy for every book and
# validates them so that downstream cells never see malformed inputs.
meta = pd.read_csv(METADATA_PATH)
if 'id' not in meta.columns and 'pg_id' in meta.columns:
    # Fall back to the pg_id column when there is no explicit id column.
    meta['id'] = meta['pg_id']
required_cols = ['id', 'title', 'processed_dir']
missing_cols = [c for c in required_cols if c not in meta.columns]
if missing_cols:
    raise ValueError(f'metadata.csv missing required columns: {missing_cols}')

meta = meta.sort_values('id').reset_index(drop=True)
# nunique() alone would accept 20 distinct ids plus extra duplicated rows,
# and the duplicated book would then be loaded twice; reject that explicitly.
if not meta['id'].is_unique:
    duplicate_ids = sorted(set(meta.loc[meta['id'].duplicated(), 'id'].tolist()))
    raise RuntimeError(f'metadata.csv contains duplicate book ids: {duplicate_ids}')
if meta['id'].nunique() != 20:
    raise RuntimeError(f'Expected 20 unique books, found {meta["id"].nunique()}')

payloads = []        # one dict per book: ids, title, X (T,D), y (T,), T, D
integrity_rows = []  # accumulated check records, written to CSV in the last cell

for row in meta.to_dict(orient='records'):
    book_id = int(row['id'])
    title = str(row['title'])
    processed_dir = str(row['processed_dir'])
    base = PROCESSED_DIR / processed_dir

    emb_path = base / 'embeddings.npy'
    label_path = base / 'label.npy'

    if not emb_path.exists():
        raise FileNotFoundError(f'Missing embeddings.npy for book {book_id}: {emb_path}')
    if not label_path.exists():
        raise FileNotFoundError(f'Missing label.npy for book {book_id}: {label_path}')

    X = np.load(emb_path)
    y_raw = np.load(label_path)
    y = normalize_label_shape(y_raw, label_path)

    if X.ndim != 2:
        raise ValueError(f'Expected embeddings shape (T,D), got {X.shape} at {emb_path}')
    T, D = X.shape

    # An empty book would otherwise surface as an opaque np.min() error below.
    if T == 0:
        raise ValueError(f'Empty embeddings for book {book_id}: shape={X.shape} at {emb_path}')
    if len(y) != T:
        raise ValueError(f'Length mismatch for book {book_id}: label_len={len(y)} vs emb_T={T}')
    if not np.issubdtype(X.dtype, np.number):
        raise ValueError(f'embeddings.npy must be numeric for book {book_id}, got dtype={X.dtype}')
    if not np.issubdtype(y.dtype, np.number):
        raise ValueError(f'label.npy must be numeric for book {book_id}, got dtype={y.dtype}')
    # NaN compares False against both bounds, so the range check alone would
    # let NaN labels through; reject non-finite values before it.
    if not np.all(np.isfinite(X)):
        raise ValueError(f'embeddings.npy contains NaN/inf for book {book_id}: {emb_path}')
    if not np.all(np.isfinite(y)):
        raise ValueError(f'label.npy contains NaN/inf for book {book_id}: {label_path}')
    if np.min(y) < 0 or np.max(y) > 4:
        raise ValueError(f'label out of range [0,4] for book {book_id}: min={np.min(y)} max={np.max(y)}')

    payloads.append({
        'book_id': book_id,
        'title': title,
        'processed_dir': processed_dir,
        'X': X.astype(np.float64),
        'y': y.astype(np.float64),
        'T': int(T),
        'D': int(D),
    })

    # Reaching this point means every check above passed for this book.
    integrity_rows.append({
        'check': f'book_{book_id}_alignment',
        'expected': 'label_len == emb_T and labels in [0,4]',
        'actual': f'label_len={len(y)}, emb_T={T}, min={float(np.min(y))}, max={float(np.max(y))}',
        'pass': True,
    })

# All books must share one embedding width so they can be stacked later.
D_unique = sorted({p['D'] for p in payloads})
if len(D_unique) != 1:
    raise RuntimeError(f'Expected a single embedding dimension across books, got {D_unique}')

print(f'Loaded books: {len(payloads)}')
print(f'Embedding dimension: {D_unique[0]}')
print(f'Total chunks: {sum(p["T"] for p in payloads)}')


Loaded books: 20
Embedding dimension: 768
Total chunks: 21337


In [46]:
# 2) Visualize labels for all 20 novels
payloads_sorted = sorted(payloads, key=lambda p: p['book_id'])

# Figure 1: one panel per novel on a 5x4 grid, sharing the y axis so label
# levels are directly comparable between panels.
fig, axes = plt.subplots(5, 4, figsize=(24, 16), sharex=False, sharey=True)
axes = axes.flatten()
for ax, p in zip(axes, payloads_sorted):
    t = np.arange(p['T'])
    ax.plot(t, p['y'], linewidth=1.0, color='#1f77b4')
    ax.set_title(f"{p['book_id']} | {p['title'][:28]}", fontsize=9)  # truncate long titles
    ax.set_ylim(-0.2, 4.2)
    ax.grid(alpha=0.2)
# Hide any grid cells that did not receive a novel.
for ax in axes[len(payloads_sorted):]:
    ax.axis('off')
fig.suptitle('LLM Excitement Label per Chunk (All 20 Novels)', fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.98])
fig.savefig(FIG_DIR / 'labels_all_20_novels_grid.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# Figure 2: all novels overlaid on a normalized [0, 1] position axis so books
# of different lengths can be compared.
fig, ax = plt.subplots(figsize=(14, 8))
for p in payloads_sorted:
    t_norm = np.linspace(0.0, 1.0, p['T'])
    ax.plot(t_norm, p['y'], alpha=0.35, linewidth=1.1, label=str(p['book_id']))
ax.set_title('Excitement Labels Overlay Across Novels (Normalized Position)')
ax.set_xlabel('Normalized position in novel')
ax.set_ylabel('Excitement label (0-4)')
ax.set_ylim(-0.2, 4.2)
ax.grid(alpha=0.2)
# Each line carries a book-id label, but without a legend the lines cannot be
# told apart. Draw it outside the axes so it does not cover the data.
legend = ax.legend(title='Book ID', ncol=2, fontsize=8, loc='center left', bbox_to_anchor=(1.01, 0.5))
for handle in legend.get_lines():
    handle.set_alpha(1.0)  # legend swatches stay readable despite faint plot lines
fig.tight_layout()
fig.savefig(FIG_DIR / 'labels_all_20_novels_overlay_normpos.png', dpi=180, bbox_inches='tight')
plt.close(fig)

print('Saved label visualizations.')


Saved label visualizations.


In [47]:
# 3) Deterministic novel-level 80:20 split (16/4)
# Whole novels are assigned to one side so no book contributes chunks to both.
book_ids = np.array([p['book_id'] for p in payloads_sorted], dtype=int)
# Dedicated generator: the split depends only on SEED, not on how much any
# other generator has been consumed.
rng_split = np.random.default_rng(SEED)
book_ids_shuffled = book_ids.copy()
rng_split.shuffle(book_ids_shuffled)

train_ids = sorted(book_ids_shuffled[:16].tolist())
test_ids = sorted(book_ids_shuffled[16:].tolist())

# Build the lookup sets once instead of on every membership test.
train_id_lookup = set(train_ids)
test_id_lookup = set(test_ids)
split_overlap = train_id_lookup & test_id_lookup

if len(train_ids) != 16 or len(test_ids) != 4:
    raise RuntimeError(f'Unexpected split sizes: train={len(train_ids)}, test={len(test_ids)}')
if split_overlap:
    raise RuntimeError('Train/Test split overlap detected.')

split_rows = [
    {
        'book_id': p['book_id'],
        'title': p['title'],
        'processed_dir': p['processed_dir'],
        'T': p['T'],
        'split': 'train' if p['book_id'] in train_id_lookup else 'test',
    }
    for p in payloads_sorted
]

split_df = pd.DataFrame(split_rows).sort_values(['split', 'book_id']).reset_index(drop=True)
split_df.to_csv(TABLE_DIR / 'split_manifest.csv', index=False)

n_split_books = len(train_id_lookup | test_id_lookup)
split_checks = [
    {'check': 'split_train_count', 'expected': 16, 'actual': len(train_ids), 'pass': len(train_ids) == 16},
    {'check': 'split_test_count', 'expected': 4, 'actual': len(test_ids), 'pass': len(test_ids) == 4},
    {'check': 'split_no_overlap', 'expected': True, 'actual': not split_overlap, 'pass': not split_overlap},
    {'check': 'split_total_books', 'expected': 20, 'actual': n_split_books, 'pass': n_split_books == 20},
]
# Drop results left by an earlier run of this cell before recording the new
# ones, so re-running the cell does not duplicate rows in integrity_rows.
split_check_names = {row['check'] for row in split_checks}
integrity_rows[:] = [row for row in integrity_rows if row['check'] not in split_check_names]
integrity_rows.extend(split_checks)

print('Train IDs:', train_ids)
print('Test IDs:', test_ids)
print(f'Saved split manifest: {TABLE_DIR / "split_manifest.csv"}')


Train IDs: [11, 35, 36, 43, 55, 84, 103, 120, 175, 345, 521, 1184, 1257, 1260, 1513, 1661]
Test IDs: [16, 113, 768, 1342]
Saved split manifest: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/outputs/excitement_linear/tables/split_manifest.csv


In [48]:
# 4) Assemble datasets and standardize features using train stats only
train_lookup, test_lookup = set(train_ids), set(test_ids)
train_set = list(filter(lambda p: p['book_id'] in train_lookup, payloads_sorted))
test_set = list(filter(lambda p: p['book_id'] in test_lookup, payloads_sorted))

# Stack per-novel arrays: features become (N, D), targets a column vector (N, 1).
X_train = np.concatenate([p['X'] for p in train_set], axis=0)
X_test = np.concatenate([p['X'] for p in test_set], axis=0)
y_train = np.concatenate([p['y'] for p in train_set])[:, np.newaxis]
y_test = np.concatenate([p['y'] for p in test_set])[:, np.newaxis]

# Per-feature statistics come from the training rows only, so nothing about
# the test novels leaks into the scaling.
x_mean = np.mean(X_train, axis=0, keepdims=True)
x_std = np.std(X_train, axis=0, keepdims=True)
# Features with (near-)zero spread keep a divisor of 1 to avoid dividing by ~0.
x_std_safe = x_std.copy()
x_std_safe[x_std < EPS] = 1.0

X_train_z = (X_train - x_mean) / x_std_safe
X_test_z = (X_test - x_mean) / x_std_safe

print('X_train', X_train_z.shape, 'y_train', y_train.shape)
print('X_test ', X_test_z.shape, 'y_test ', y_test.shape)


X_train (17631, 768) y_train (17631, 1)
X_test  (3706, 768) y_test  (3706, 1)


In [49]:
# 5) Train 1-layer linear perceptron with NumPy GD (MSE)
# Mini-batch gradient descent on mean squared error with L2 weight decay on W.
N_train, D = X_train_z.shape

# Use a generator created here from SEED rather than the shared `rng`.
# Re-running this cell then reproduces the same initial weights and batch
# order, instead of continuing from wherever the shared generator was left.
# `rng` is not consumed anywhere else in the notebook before this cell, so a
# fresh top-to-bottom run draws the same numbers as before.
rng_train = np.random.default_rng(SEED)
W = rng_train.normal(loc=0.0, scale=0.01, size=(D, 1))
b = np.zeros((1,), dtype=np.float64)

train_loss_hist = []
test_loss_hist = []

for epoch in range(EPOCHS):
    for idx in iter_minibatches(N_train, BATCH_SIZE, rng_train):
        xb = X_train_z[idx]
        yb = y_train[idx]

        pred = xb @ W + b
        err = pred - yb

        # Gradient of the batch-mean squared error, plus the weight-decay
        # term on W (the bias is not regularized).
        grad_W = (2.0 / len(idx)) * (xb.T @ err) + WEIGHT_DECAY * W
        grad_b = (2.0 / len(idx)) * np.sum(err, axis=0)

        W -= LR * grad_W
        b -= LR * grad_b

    # Full-dataset MSE after each epoch; the test value is only recorded for
    # the loss curve and does not influence training.
    train_pred_epoch = X_train_z @ W + b
    test_pred_epoch = X_test_z @ W + b

    train_mse = float(np.mean((train_pred_epoch - y_train) ** 2))
    test_mse = float(np.mean((test_pred_epoch - y_test) ** 2))

    train_loss_hist.append(train_mse)
    test_loss_hist.append(test_mse)

    if (epoch + 1) % 25 == 0 or epoch == 0:
        print(f"Epoch {epoch+1:03d}/{EPOCHS} | train_mse={train_mse:.6f} | test_mse={test_mse:.6f}")

# Final predictions as flat vectors for the metric and plotting cells.
train_pred = (X_train_z @ W + b).reshape(-1)
test_pred = (X_test_z @ W + b).reshape(-1)

n_train_targets = y_train.reshape(-1).shape[0]
n_test_targets = y_test.reshape(-1).shape[0]
training_checks = [
    {'check': 'model_W_shape', 'expected': '(768,1)', 'actual': str(W.shape), 'pass': tuple(W.shape) == (768, 1)},
    {'check': 'model_b_shape', 'expected': '(1,)', 'actual': str(b.shape), 'pass': tuple(b.shape) == (1,)},
    {'check': 'prediction_shape_train', 'expected': n_train_targets, 'actual': train_pred.shape[0], 'pass': train_pred.shape[0] == n_train_targets},
    {'check': 'prediction_shape_test', 'expected': n_test_targets, 'actual': test_pred.shape[0], 'pass': test_pred.shape[0] == n_test_targets},
    {'check': 'training_net_loss_decrease', 'expected': 'final < first', 'actual': f"first={train_loss_hist[0]:.6f},final={train_loss_hist[-1]:.6f}", 'pass': train_loss_hist[-1] < train_loss_hist[0]},
]
# Replace results from an earlier run of this cell instead of appending, so
# re-running the cell does not duplicate rows in integrity_rows.
training_check_names = {row['check'] for row in training_checks}
integrity_rows[:] = [row for row in integrity_rows if row['check'] not in training_check_names]
integrity_rows.extend(training_checks)

print('Training done.')


Epoch 001/200 | train_mse=3.401970 | test_mse=3.734706
Epoch 025/200 | train_mse=1.501502 | test_mse=1.422093
Epoch 050/200 | train_mse=1.481516 | test_mse=1.431584
Epoch 075/200 | train_mse=1.471347 | test_mse=1.424586
Epoch 100/200 | train_mse=1.460349 | test_mse=1.415930
Epoch 125/200 | train_mse=1.458950 | test_mse=1.426686
Epoch 150/200 | train_mse=1.451695 | test_mse=1.410563
Epoch 175/200 | train_mse=1.450789 | test_mse=1.433173
Epoch 200/200 | train_mse=1.447614 | test_mse=1.424222
Training done.


In [50]:
# 6) Metrics (regression-only): global + per-novel
y_train_flat = y_train.reshape(-1)
y_test_flat = y_test.reshape(-1)
train_metrics = metric_dict(y_train_flat, train_pred)
test_metrics = metric_dict(y_test_flat, test_pred)

# Pooled metrics over every chunk in each split.
global_rows = [
    dict(split='train', n_samples=int(len(train_pred)), n_novels=int(len(train_ids)), **train_metrics),
    dict(split='test', n_samples=int(len(test_pred)), n_novels=int(len(test_ids)), **test_metrics),
]
global_metrics_df = pd.DataFrame(global_rows)
global_metrics_df.to_csv(TABLE_DIR / 'global_metrics.csv', index=False)

# Per-novel metrics, plus the absolute error between the smoothed prediction
# and the smoothed label (collected per split for the pooled MA-MAE below).
per_rows = []
mae_ma_abs_err_by_split = {'train': [], 'test': []}
train_id_set = set(train_ids)
for p in payloads_sorted:
    split = 'test'
    if p['book_id'] in train_id_set:
        split = 'train'

    Xp = (p['X'] - x_mean) / x_std_safe
    yp = p['y'].reshape(-1)
    yhat = (Xp @ W + b).reshape(-1)

    abs_err_ma = np.abs(moving_average_1d(yhat, MA_WINDOW) - moving_average_1d(yp, MA_WINDOW))
    mae_ma_abs_err_by_split[split].append(abs_err_ma)

    row = {field: p[field] for field in ('book_id', 'title', 'processed_dir')}
    row.update(split=split, T=p['T'], **metric_dict(yp, yhat), mae_ma=float(np.mean(abs_err_ma)))
    per_rows.append(row)

per_novel_metrics_df = pd.DataFrame(per_rows).sort_values(['split', 'book_id']).reset_index(drop=True)
per_novel_metrics_df.to_csv(TABLE_DIR / 'per_novel_metrics.csv', index=False)

# Presentation table: raw MAE next to the MAE of the moving-averaged series.
raw_mae_by_split = global_metrics_df.set_index('split')['mae']
presentation_rows = []
for split in ('train', 'test'):
    pieces = mae_ma_abs_err_by_split[split]
    if pieces:
        all_abs = np.concatenate(pieces)
        mae_ma_value = float(np.mean(all_abs))
        n_samples_ma = int(all_abs.shape[0])
    else:
        # No novels in this split: nothing to average.
        mae_ma_value = np.nan
        n_samples_ma = 0

    presentation_rows.append({
        'split': split,
        'ma_window': MA_WINDOW,
        'mae_raw': float(raw_mae_by_split.loc[split]),
        'mae_moving_average': mae_ma_value,
        'n_samples': n_samples_ma,
    })

presentation_mae_df = pd.DataFrame(presentation_rows)
presentation_mae_df.to_csv(TABLE_DIR / 'presentation_mae.csv', index=False)

print('Global metrics:')
print(global_metrics_df)
print('')
print('Presentation MAE (raw vs moving average):')
print(presentation_mae_df)
print('')
print('Saved metrics tables.')


Global metrics:
   split  n_samples  n_novels       mse      rmse       mae        r2
0  train      17631        16  1.447614  1.203168  1.063862  0.173388
1   test       3706         4  1.424222  1.193408  1.040986  0.136427

Presentation MAE (raw vs moving average):
   split  ma_window   mae_raw  mae_moving_average  n_samples
0  train          9  1.063862            0.328034      17631
1   test          9  1.040986            0.348689       3706

Saved metrics tables.


In [51]:
# 7) Diagnostics plots
y_train_flat = y_train.reshape(-1)
y_test_flat = y_test.reshape(-1)

# --- Loss curve: per-epoch MSE on both splits ---
epoch_axis = np.arange(1, EPOCHS + 1)
fig, ax = plt.subplots(figsize=(10, 5))
for history, name, width in ((train_loss_hist, 'train_mse', 1.6), (test_loss_hist, 'test_mse', 1.4)):
    ax.plot(epoch_axis, history, label=name, linewidth=width)
ax.set(xlabel='Epoch', ylabel='MSE', title='Training/Test MSE Across Epochs')
ax.grid(alpha=0.25)
ax.legend()
fig.tight_layout()
fig.savefig(FIG_DIR / 'train_loss_curve.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# --- Scatter: true label vs prediction, with the identity line for reference ---
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_train_flat, train_pred, s=8, alpha=0.15, label='train')
ax.scatter(y_test_flat, test_pred, s=12, alpha=0.35, label='test')
value_pool = (y_train, y_test, train_pred, test_pred)
lo = min(float(values.min()) for values in value_pool)
hi = max(float(values.max()) for values in value_pool)
ax.plot([lo, hi], [lo, hi], linestyle='--', linewidth=1.2, color='black')
ax.set(xlabel='True excitement label', ylabel='Predicted excitement', title='Prediction Scatter: True vs Predicted')
ax.legend()
ax.grid(alpha=0.2)
fig.tight_layout()
fig.savefig(FIG_DIR / 'prediction_scatter_train_test.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# --- Residual histogram (pred - true) for both splits ---
train_res = train_pred - y_train_flat
test_res = test_pred - y_test_flat
fig, ax = plt.subplots(figsize=(10, 5))
for residuals, opacity, name in ((train_res, 0.45, 'train'), (test_res, 0.55, 'test')):
    ax.hist(residuals, bins=60, alpha=opacity, label=name)
ax.set(xlabel='Residual (pred - true)', ylabel='Count', title='Residual Distribution')
ax.legend()
ax.grid(alpha=0.2)
fig.tight_layout()
fig.savefig(FIG_DIR / 'residual_hist_train_test.png', dpi=180, bbox_inches='tight')
plt.close(fig)

# --- Presentation figure: raw MAE vs moving-average MAE, grouped by split ---
fig, ax = plt.subplots(figsize=(8, 5))
plot_df = presentation_mae_df.copy().set_index('split').loc[['train', 'test']].reset_index()
x = np.arange(len(plot_df))
bar_w = 0.34
half_w = bar_w / 2
ax.bar(x - half_w, plot_df['mae_raw'], width=bar_w, label='Raw MAE', alpha=0.9)
ax.bar(x + half_w, plot_df['mae_moving_average'], width=bar_w, label=f'MA({MA_WINDOW}) MAE', alpha=0.9)
ax.set_xticks(x)
ax.set_xticklabels(plot_df['split'])
ax.set(ylabel='MAE', title='Train/Test MAE: Raw vs Moving Average (Presentation)')
ax.grid(axis='y', alpha=0.25)
ax.legend()
# Print each bar's value just above it.
for pos, raw_val, ma_val in zip(x, plot_df['mae_raw'], plot_df['mae_moving_average']):
    ax.text(pos - half_w, raw_val + 0.01, f"{raw_val:.3f}", ha='center', va='bottom', fontsize=9)
    ax.text(pos + half_w, ma_val + 0.01, f"{ma_val:.3f}", ha='center', va='bottom', fontsize=9)
fig.tight_layout()
fig.savefig(FIG_DIR / 'mae_raw_vs_moving_average.png', dpi=180, bbox_inches='tight')
plt.close(fig)

print('Saved diagnostics plots.')


Saved diagnostics plots.


In [52]:
# 8) Per-novel overlay plots: all 4 test + 2 train (best/worst RMSE)
by_id = {p['book_id']: p for p in payloads_sorted}


def plot_novel_overlay(p: dict, split: str) -> Path:
    """Plot label vs prediction for one novel and save the figure.

    Draws the raw label and raw prediction faintly, and their centered
    moving averages (window ``MA_WINDOW``) as bold lines. The raw MAE and the
    MAE between the two smoothed curves are shown in the title.

    Args:
        p: Payload dict of one novel (reads 'book_id', 'title', 'X', 'y', 'T').
        split: Either 'test' or 'train'; selects the title prefix and the
            output file name.

    Returns:
        Path of the PNG that was written.

    Raises:
        ValueError: If ``split`` is not 'test' or 'train'.
    """
    if split not in ('test', 'train'):
        raise ValueError(f"split must be 'test' or 'train', got {split!r}")

    bid = p['book_id']
    # Standardize with the training statistics before applying the model.
    Xp = (p['X'] - x_mean) / x_std_safe
    y_true = p['y'].reshape(-1)
    y_pred = (Xp @ W + b).reshape(-1)
    y_true_ma = moving_average_1d(y_true, MA_WINDOW)
    y_pred_ma = moving_average_1d(y_pred, MA_WINDOW)
    raw_mae = float(np.mean(np.abs(y_pred - y_true)))
    ma_mae = float(np.mean(np.abs(y_pred_ma - y_true_ma)))
    t = np.arange(p['T'])

    fig, ax = plt.subplots(figsize=(12, 4.5))
    ax.plot(t, y_true, label='llm_label_raw', linewidth=1.0, alpha=0.30, color='tab:blue')
    ax.plot(t, y_pred, label='linear_pred_raw', linewidth=1.0, alpha=0.30, color='tab:orange')
    ax.plot(t, y_true_ma, label=f'llm_label_MA{MA_WINDOW}', linewidth=2.0, color='tab:blue')
    ax.plot(t, y_pred_ma, label=f'linear_pred_MA{MA_WINDOW}', linewidth=2.0, color='tab:orange')
    ax.set_title(f"{split.capitalize()} Novel {bid} | {p['title']} | raw MAE={raw_mae:.3f}, MA MAE={ma_mae:.3f}")
    ax.set_xlabel('Chunk index')
    ax.set_ylabel('Excitement')
    ax.set_ylim(-0.5, 4.5)
    ax.grid(alpha=0.25)
    ax.legend(ncol=2)
    fig.tight_layout()
    out_path = FIG_DIR / f'novel_overlay_{split}_{bid}.png'
    fig.savefig(out_path, dpi=180, bbox_inches='tight')
    plt.close(fig)
    return out_path


# Remove overlays left by earlier runs (e.g. a different split or selection).
# The final integrity cell counts these files with a glob, so leftovers would
# make the expected 4 test / 2 train counts fail.
stale_overlays = list(FIG_DIR.glob('novel_overlay_test_*.png')) + list(FIG_DIR.glob('novel_overlay_train_*.png'))
for stale_path in stale_overlays:
    stale_path.unlink()

# test overlays
for bid in test_ids:
    plot_novel_overlay(by_id[bid], 'test')

# choose 2 train novels: lowest and highest train RMSE
train_novel_rmse = per_novel_metrics_df[per_novel_metrics_df['split'] == 'train'][['book_id', 'rmse']].copy()
train_best_id = int(train_novel_rmse.sort_values('rmse', ascending=True).iloc[0]['book_id'])
train_worst_id = int(train_novel_rmse.sort_values('rmse', ascending=False).iloc[0]['book_id'])
selected_train_overlay_ids = sorted(list({train_best_id, train_worst_id}))

# ensure exactly 2 train overlays even if best==worst (unlikely)
if len(selected_train_overlay_ids) == 1:
    alt_id = int(train_novel_rmse.sort_values('rmse', ascending=True).iloc[1]['book_id'])
    selected_train_overlay_ids.append(alt_id)
    selected_train_overlay_ids = sorted(selected_train_overlay_ids)

# train overlays
for bid in selected_train_overlay_ids:
    plot_novel_overlay(by_id[bid], 'train')

print('Saved per-novel overlay plots.')
print('Selected train overlays:', selected_train_overlay_ids)


Saved per-novel overlay plots.
Selected train overlays: [43, 1513]


In [53]:
# 9) Save model and finalize integrity checks
# The archive stores the weights together with the training-set scaling
# statistics and the hyperparameters used, all cast to 32-bit types.
np.savez(
    MODEL_DIR / 'linear_weights.npz',
    W=W.astype(np.float32),
    b=b.astype(np.float32),
    x_mean=x_mean.reshape(-1).astype(np.float32),
    x_std=x_std_safe.reshape(-1).astype(np.float32),  # the zero-guarded std actually used for scaling
    seed=np.array([SEED], dtype=np.int32),
    lr=np.array([LR], dtype=np.float32),
    epochs=np.array([EPOCHS], dtype=np.int32),
    batch_size=np.array([BATCH_SIZE], dtype=np.int32),
    weight_decay=np.array([WEIGHT_DECAY], dtype=np.float32),
)

# output completeness + overlay coverage checks
expected_files = required_output_files(test_ids=test_ids, train_ids=selected_train_overlay_ids)
missing_files = [str(p) for p in expected_files if not p.exists()]

# Count each overlay family once and reuse the result in 'actual' and 'pass'.
n_test_overlays = len(list(FIG_DIR.glob('novel_overlay_test_*.png')))
n_train_overlays = len(list(FIG_DIR.glob('novel_overlay_train_*.png')))

final_checks = [
    {'check': 'output_files_complete', 'expected': len(expected_files), 'actual': len(expected_files) - len(missing_files), 'pass': len(missing_files) == 0},
    {'check': 'overlay_test_count', 'expected': 4, 'actual': n_test_overlays, 'pass': n_test_overlays == 4},
    {'check': 'overlay_train_count', 'expected': 2, 'actual': n_train_overlays, 'pass': n_train_overlays == 2},
]
# Replace results from an earlier run of this cell instead of appending, so
# re-running the cell does not duplicate rows in integrity_checks.csv.
final_check_names = {row['check'] for row in final_checks}
integrity_rows[:] = [row for row in integrity_rows if row['check'] not in final_check_names]
integrity_rows.extend(final_checks)

integrity_df = pd.DataFrame(integrity_rows)
integrity_df.to_csv(TABLE_DIR / 'integrity_checks.csv', index=False)

all_pass = bool(integrity_df['pass'].all())
print(f'Saved model: {MODEL_DIR / "linear_weights.npz"}')
print(f'Saved integrity checks: {TABLE_DIR / "integrity_checks.csv"}')
print(f'Integrity all-pass: {all_pass}')
if missing_files:
    print('Missing files:')
    for m in missing_files:
        print('-', m)

print("\nGlobal metrics preview:")
print(pd.read_csv(TABLE_DIR / 'global_metrics.csv'))


Saved model: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/outputs/excitement_linear/model/linear_weights.npz
Saved integrity checks: /Users/kongfha/Desktop/Time_Series_Mining/story-trajectory-analysis/outputs/excitement_linear/tables/integrity_checks.csv
Integrity all-pass: True

Global metrics preview:
   split  n_samples  n_novels       mse      rmse       mae        r2
0  train      17631        16  1.447614  1.203168  1.063862  0.173388
1   test       3706         4  1.424222  1.193408  1.040986  0.136427
