# 06 - PCA Component Insights (Global + Book)

This notebook builds reproducible PCA component interpretation at corpus and book level, then links PCA dynamics to Twist Signal behavior.

## Required upstream run
Run `02_transform_and_cluster.ipynb` first to generate:
- `outputs/pca/global_pca_fit.npz`
- `outputs/pca/global_pca_fit_meta.json`
- `outputs/pca/global_pca_variance_summary.csv`

## Outputs
- `outputs/pca_analysis/tables/book_component_stats.csv`
- `outputs/pca_analysis/tables/book_component_signal_assoc.csv`
- `outputs/pca_analysis/tables/component_exemplar_chunks.csv`
- `outputs/pca_analysis/tables/component_genre_association.csv`
- `outputs/pca_analysis/tables/temporal_trend_stats.csv`
- `outputs/pca_analysis/tables/corpus_assoc_bootstrap.csv`
- `outputs/pca_analysis/tables/projection_consistency_checks.csv`
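- `outputs/pca_analysis/tables/pca_integrity_checks.csv`
- `outputs/pca_analysis/tables/book_artifact_integrity.csv`
- `outputs/pca_analysis/tables/book_signal_assoc_issues.csv`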
- `outputs/pca_analysis/figures/*.png`
- `outputs/pca_analysis/insights.md`


In [None]:
from __future__ import annotations

from pathlib import Path
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Fixed seed: `rng` is the generator handed to the permutation tests and
# bootstrap resampling in later cells, so their draws repeat across runs
# as long as the cells are executed in the same order.
SEED = 42
rng = np.random.default_rng(SEED)
# Also seed NumPy's legacy global RNG; nothing in this notebook calls it
# directly, presumably this covers libraries that draw from it.
np.random.seed(SEED)

# All paths are resolved relative to the current working directory.
# NOTE(review): assumes the notebook is launched from the project root.
PROJECT_ROOT = Path('.').resolve()
DATA_DIR = PROJECT_ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'
OUTPUTS_DIR = PROJECT_ROOT / 'outputs'
PCA_DIR = OUTPUTS_DIR / 'pca'
ANALYSIS_DIR = OUTPUTS_DIR / 'pca_analysis'
TABLE_DIR = ANALYSIS_DIR / 'tables'
FIG_DIR = ANALYSIS_DIR / 'figures'

# Create the output tree up front so later to_csv / savefig calls cannot fail
# on a missing directory.
for d in [ANALYSIS_DIR, TABLE_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Inputs checked by ensure_required_artifacts() before any analysis runs.
METADATA_PATH = DATA_DIR / 'metadata.csv'
GLOBAL_PCA_PATH = PCA_DIR / 'global_pca_fit.npz'
GLOBAL_PCA_META_PATH = PCA_DIR / 'global_pca_fit_meta.json'
GLOBAL_PCA_VAR_PATH = PCA_DIR / 'global_pca_variance_summary.csv'

# k selects which per-book `signals_k{k}.npz` files are loaded.
k_values = [5, 7, 11]
# k used for the deep-dive figure and the insights narrative.
primary_k = 7
# Exemplar selection constraints passed to select_exemplars():
# minimum chunk-index gap within a book, per-book cap, and list length
# per (component, direction).
min_exemplar_distance = 5
max_exemplars_per_book = 3
top_exemplars_per_direction = 15
# Number of shuffles per permutation test and resamples per bootstrap CI.
n_perm = 1000
n_boot = 2000

print(f'Project root: {PROJECT_ROOT}')
print(f'Processed dir: {PROCESSED_DIR}')
print(f'PCA artifact dir: {PCA_DIR}')
print(f'Analysis output dir: {ANALYSIS_DIR}')


In [None]:
def corr_safe(x: np.ndarray, y: np.ndarray) -> float:
    """Return the Pearson correlation of *x* and *y*, or NaN when undefined.

    NaN is returned when the inputs differ in length, hold fewer than three
    observations, or either one has zero standard deviation.
    """
    first = np.asarray(x, dtype=float)
    second = np.asarray(y, dtype=float)

    n_obs = len(first)
    if n_obs < 3 or n_obs != len(second):
        return np.nan

    # A constant series makes the correlation denominator zero.
    if any(np.std(series) == 0 for series in (first, second)):
        return np.nan

    return float(np.corrcoef(first, second)[0, 1])


def bh_adjust(pvals: np.ndarray) -> np.ndarray:
    """Return Benjamini-Hochberg adjusted q-values for *pvals*.

    Non-finite entries are excluded from the correction (they do not count
    towards the number of tests) and stay NaN in the output, which has the
    same shape and ordering as the input.
    """
    raw = np.asarray(pvals, dtype=float)
    adjusted = np.full_like(raw, np.nan)

    finite = np.isfinite(raw)
    if not finite.any():
        return adjusted

    usable = raw[finite]
    n_tests = usable.size
    ascending = np.argsort(usable)

    # p_(i) * m / i for the i-th smallest p-value.
    scaled = usable[ascending] * n_tests / np.arange(1, n_tests + 1)
    # Running minimum from the largest rank downwards keeps q-values monotone.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
    monotone = np.clip(monotone, 0.0, 1.0)

    # Undo the sort, then scatter back into the positions that were finite.
    input_order = np.empty_like(monotone)
    input_order[ascending] = monotone
    adjusted[finite] = input_order
    return adjusted


def permutation_corr_pvalue(x: np.ndarray, y: np.ndarray, n_perm: int, rng: np.random.Generator) -> tuple[float, float]:
    """Two-sided permutation test for the correlation between *x* and *y*.

    Returns ``(observed_corr, p_value)``. The null distribution is built by
    shuffling *y* ``n_perm`` times with *rng*. Both values are NaN when the
    observed correlation is undefined; the p-value alone is NaN when no
    shuffle produced a finite correlation.
    """
    observed = corr_safe(x, y)
    if not np.isfinite(observed):
        return np.nan, np.nan

    y_values = np.asarray(y, dtype=float)
    # One shuffle of y per draw, consumed from rng in sequence.
    null_corrs = np.array(
        [corr_safe(x, rng.permutation(y_values)) for _ in range(n_perm)],
        dtype=float,
    )
    null_corrs = null_corrs[np.isfinite(null_corrs)]
    if null_corrs.size == 0:
        return observed, np.nan

    # The +1 terms count the observed statistic as one draw from the null.
    n_extreme = np.sum(np.abs(null_corrs) >= abs(observed))
    return observed, float((n_extreme + 1) / (len(null_corrs) + 1))


def bootstrap_median_ci(values: np.ndarray, n_boot: int, rng: np.random.Generator, alpha: float = 0.05) -> tuple[float, float, float]:
    """Median of *values* with a percentile-bootstrap confidence interval.

    Non-finite entries are discarded first. Returns ``(median, lower, upper)``
    where the bounds are the ``alpha / 2`` and ``1 - alpha / 2`` quantiles of
    ``n_boot`` resampled medians, or three NaNs when nothing finite remains.
    """
    finite_vals = np.asarray(values, dtype=float)
    finite_vals = finite_vals[np.isfinite(finite_vals)]
    sample_size = len(finite_vals)
    if sample_size == 0:
        return np.nan, np.nan, np.nan

    point_estimate = float(np.median(finite_vals))

    medians = np.empty(n_boot, dtype=float)
    for draw in range(n_boot):
        # Resample with replacement at the original sample size.
        medians[draw] = np.median(rng.choice(finite_vals, size=sample_size, replace=True))

    lower, upper = (float(np.quantile(medians, q)) for q in (alpha / 2, 1 - alpha / 2))
    return point_estimate, lower, upper


def sign_change_rate(x: np.ndarray) -> float:
    """Return the fraction of consecutive sample pairs whose sign flips.

    Only pairs where both samples have a definite sign are counted: zeros
    and NaNs are excluded from both the numerator and the denominator.

    Returns NaN for fewer than two samples and 0.0 when no pair is usable.
    """
    x = np.asarray(x, dtype=float)
    if len(x) < 2:
        return np.nan
    s = np.sign(x)
    # np.sign(nan) is nan, and both `nan != 0` and `nan != other` are True, so
    # without the explicit NaN mask every pair touching a NaN would be counted
    # as a sign change.
    has_sign = (s != 0) & ~np.isnan(s)
    valid = has_sign[:-1] & has_sign[1:]
    if not np.any(valid):
        return 0.0
    changes = (s[:-1][valid] != s[1:][valid]).sum()
    return float(changes / valid.sum())


def ensure_required_artifacts() -> None:
    """Raise FileNotFoundError listing every required input file that is absent.

    Checks the three global PCA artifacts and the metadata CSV; returns
    silently when all of them exist.
    """
    absent = [
        path
        for path in (GLOBAL_PCA_PATH, GLOBAL_PCA_META_PATH, GLOBAL_PCA_VAR_PATH, METADATA_PATH)
        if not path.exists()
    ]
    if not absent:
        return

    listing = "\n".join(f"- {path}" for path in absent)
    raise FileNotFoundError(
        "Missing required upstream PCA artifacts. Run 02_transform_and_cluster.ipynb first.\n" + listing
    )


In [None]:
# Stop early with a readable error if the upstream notebook has not been run.
ensure_required_artifacts()

# Book metadata, ordered by integer id so every later per-book loop is stable.
meta = (
    pd.read_csv(METADATA_PATH)
    .assign(id=lambda frame: frame['id'].astype(int))
    .sort_values('id')
    .reset_index(drop=True)
)

# Pull every stored array out while the archive is open.
with np.load(GLOBAL_PCA_PATH) as pca_npz:
    components, explained_variance, explained_variance_ratio, singular_values, mean_vector = (
        pca_npz[key]
        for key in ('components', 'explained_variance', 'explained_variance_ratio', 'singular_values', 'mean')
    )

pca_meta = json.loads(GLOBAL_PCA_META_PATH.read_text(encoding='utf-8'))
variance_df = pd.read_csv(GLOBAL_PCA_VAR_PATH)

print('Loaded global PCA artifacts:')
print(f'- components: {components.shape}')
print(f'- explained_variance_ratio: {explained_variance_ratio.shape}')
print(f'- mean vector: {mean_vector.shape}')
print(f'- metadata keys: {sorted(pca_meta.keys())}')

# First batch of integrity checks; a later cell extends this list and writes
# it to pca_integrity_checks.csv.
cumulative_evr = variance_df['cumulative_explained_variance_ratio'].to_numpy()
integrity_rows = [
    {
        'check': 'components_rows_eq_5',
        'passed': bool(components.shape[0] == 5),
        'detail': str(components.shape),
    },
    {
        'check': 'mean_matches_embedding_dim',
        'passed': bool(mean_vector.shape[0] == components.shape[1]),
        'detail': f"mean={mean_vector.shape[0]}, emb_dim={components.shape[1]}",
    },
    {
        'check': 'evr_in_0_1',
        'passed': bool(np.all((explained_variance_ratio >= 0) & (explained_variance_ratio <= 1))),
        'detail': f"min={float(np.min(explained_variance_ratio)):.6f}, max={float(np.max(explained_variance_ratio)):.6f}",
    },
    {
        'check': 'evr_sum_positive',
        'passed': bool(float(np.sum(explained_variance_ratio)) > 0),
        'detail': f"sum={float(np.sum(explained_variance_ratio)):.6f}",
    },
    {
        # Small negative tolerance absorbs floating-point noise in the CSV.
        'check': 'evr_cumulative_monotonic',
        'passed': bool(np.all(np.diff(cumulative_evr) >= -1e-12)),
        'detail': 'checked global_pca_variance_summary.csv',
    },
]


In [None]:
# Load per-book PCA scores, chunk previews and signal arrays.
#
# A book is skipped entirely (severity 'error') when its PCA scores or chunk
# file are missing or inconsistent. A missing or misaligned signal file only
# disables that k for the book (severity 'warning'). Every problem is recorded
# in book_artifact_integrity.csv.
book_records = []
artifact_issue_rows = []

for _, mrow in meta.iterrows():
    book_id = int(mrow['id'])
    processed_dir = str(mrow['processed_dir'])
    bdir = PROCESSED_DIR / processed_dir

    record = {
        'book_id': book_id,
        'processed_dir': processed_dir,
        'title': str(mrow.get('title', '')),
        'genre_primary': str(mrow.get('genre_primary', '')),
        'book_dir': bdir,
    }

    pca_path = bdir / 'pca_d5.npy'
    chunks_path = bdir / 'chunks.jsonl'

    if not pca_path.exists():
        artifact_issue_rows.append({'book_id': book_id, 'processed_dir': processed_dir, 'issue': 'missing_pca_d5', 'severity': 'error'})
        continue
    if not chunks_path.exists():
        artifact_issue_rows.append({'book_id': book_id, 'processed_dir': processed_dir, 'issue': 'missing_chunks_jsonl', 'severity': 'error'})
        continue

    z5 = np.load(pca_path)
    if z5.ndim != 2 or z5.shape[1] != 5:
        artifact_issue_rows.append({'book_id': book_id, 'processed_dir': processed_dir, 'issue': f'invalid_pca_shape_{z5.shape}', 'severity': 'error'})
        continue

    chunk_previews = []
    with open(chunks_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a chunk; without
            # this guard json.loads would raise and abort the whole load.
            if not line.strip():
                continue
            obj = json.loads(line)
            chunk_previews.append(str(obj.get('text_preview', '')))

    # One preview per PCA row is required so exemplar text lines up with scores.
    if len(chunk_previews) != len(z5):
        artifact_issue_rows.append({'book_id': book_id, 'processed_dir': processed_dir, 'issue': f'chunks_vs_pca_length_mismatch_{len(chunk_previews)}_{len(z5)}', 'severity': 'error'})
        continue

    signals = {}
    for k in k_values:
        sp = bdir / f'signals_k{k}.npz'
        if not sp.exists():
            artifact_issue_rows.append({'book_id': book_id, 'processed_dir': processed_dir, 'issue': f'missing_signals_k{k}', 'severity': 'warning'})
            signals[k] = None
            continue

        # np.load on an .npz returns an NpzFile that keeps the file open; the
        # context manager releases the handle instead of leaking one per
        # (book, k). Indexing reads each array fully into memory, so `s` and
        # `a` stay valid after the archive is closed.
        with np.load(sp) as dat:
            s = dat['s']
            a = dat['a']
        if len(s) != len(z5) or len(a) != len(z5):
            artifact_issue_rows.append({'book_id': book_id, 'processed_dir': processed_dir, 'issue': f'signal_length_mismatch_k{k}_T{len(z5)}_s{len(s)}_a{len(a)}', 'severity': 'warning'})
            signals[k] = None
            continue

        signals[k] = {'s': s.astype(np.float64), 'a': a.astype(np.float64)}

    record['z5'] = z5.astype(np.float64)
    record['chunk_previews'] = chunk_previews
    record['signals'] = signals
    book_records.append(record)

artifact_issues_df = pd.DataFrame(artifact_issue_rows)
if artifact_issues_df.empty:
    # Keep the CSV header stable even when nothing went wrong.
    artifact_issues_df = pd.DataFrame(columns=['book_id', 'processed_dir', 'issue', 'severity'])
artifact_issues_path = TABLE_DIR / 'book_artifact_integrity.csv'
artifact_issues_df.to_csv(artifact_issues_path, index=False)

print(f'Books loaded for PCA analysis: {len(book_records)}')
print(f'Artifact issues saved: {artifact_issues_path}')
display(artifact_issues_df.head(20))


In [None]:
# Corpus-level PCA diagnostics
# Produces three figures: explained-variance bars + cumulative curve,
# chunk-level score box plots per component, and a book-level pair plot of
# mean PC1-PC3 scores coloured by genre.
sns.set_theme(style='whitegrid')

# Cumulative ratio is recomputed from the loaded array rather than read from
# the variance summary CSV.
variance_plot_df = pd.DataFrame({
    'pc': [f'PC{i}' for i in range(1, len(explained_variance_ratio) + 1)],
    'explained_variance_ratio': explained_variance_ratio,
    'cumulative_explained_variance_ratio': np.cumsum(explained_variance_ratio),
})

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].bar(variance_plot_df['pc'], variance_plot_df['explained_variance_ratio'], color='#2a9d8f')
ax[0].set_title('PCA Explained Variance Ratio (PC1-PC5)')
ax[0].set_ylabel('Explained Variance Ratio')

ax[1].plot(variance_plot_df['pc'], variance_plot_df['cumulative_explained_variance_ratio'], marker='o', color='#e76f51')
ax[1].set_title('Cumulative Explained Variance (PC1-PC5)')
ax[1].set_ylabel('Cumulative EVR')
ax[1].set_ylim(0, 1)

fig.tight_layout()
fig.savefig(FIG_DIR / 'pca_variance_diagnostics.png', dpi=160, bbox_inches='tight')
plt.show()

# Long-format table: one row per (book, chunk, component), plus one row of
# per-component mean scores per book.
score_rows = []
book_mean_rows = []
for rec in book_records:
    z = rec['z5']
    # Chunk position rescaled to [0, 1] so books of different length compare.
    pos = np.linspace(0.0, 1.0, len(z))
    for i in range(len(z)):
        for pc in range(5):
            score_rows.append({
                'book_id': rec['book_id'],
                'processed_dir': rec['processed_dir'],
                'title': rec['title'],
                'genre_primary': rec['genre_primary'],
                'chunk_index': i,
                'position_norm': float(pos[i]),
                'pc': f'PC{pc+1}',
                'score': float(z[i, pc]),
            })

    book_mean_rows.append({
        'book_id': rec['book_id'],
        'processed_dir': rec['processed_dir'],
        'title': rec['title'],
        'genre_primary': rec['genre_primary'],
        **{f'mean_pc{pc+1}': float(np.mean(z[:, pc])) for pc in range(5)}
    })

score_long_df = pd.DataFrame(score_rows)
book_mean_df = pd.DataFrame(book_mean_rows)

plt.figure(figsize=(12, 5))
sns.boxplot(data=score_long_df, x='pc', y='score', color='#8ecae6', fliersize=1)
plt.title('Chunk-Level PCA Score Distribution by Component')
plt.tight_layout()
plt.savefig(FIG_DIR / 'component_score_distributions.png', dpi=160, bbox_inches='tight')
plt.show()

# Only the first three components are shown in the pair plot.
pair_df = book_mean_df[['genre_primary', 'mean_pc1', 'mean_pc2', 'mean_pc3']].copy()
pair_df = pair_df.rename(columns={'mean_pc1': 'PC1', 'mean_pc2': 'PC2', 'mean_pc3': 'PC3'})
g = sns.pairplot(pair_df, vars=['PC1', 'PC2', 'PC3'], hue='genre_primary', diag_kind='hist', plot_kws={'alpha': 0.7, 's': 40})
g.fig.suptitle('Book-Level Mean PCA Scores by Genre (PC1-PC3)', y=1.02)
g.savefig(FIG_DIR / 'component_pairwise_by_genre.png', dpi=160, bbox_inches='tight')
plt.show()


In [None]:
# Temporal trend significance with permutation tests + BH correction by component family
# For each book and component: correlate the score series with normalised
# chunk position and obtain a permutation p-value. Books are the outer loop
# and components the inner loop, which fixes the order in which `rng` is used.
trend_records = []
for book in book_records:
    scores = book['z5']
    position = np.linspace(0.0, 1.0, len(scores))
    identity = {key: book[key] for key in ('book_id', 'processed_dir', 'title', 'genre_primary')}
    for comp_no, comp_scores in enumerate(scores.T, start=1):
        r_obs, p_val = permutation_corr_pvalue(comp_scores, position, n_perm=n_perm, rng=rng)
        trend_records.append({
            **identity,
            'pc': f'PC{comp_no}',
            'corr_pc_position': r_obs,
            'perm_pvalue': p_val,
        })

trend_df = pd.DataFrame(trend_records)
trend_df['perm_qvalue'] = np.nan
# q-values are adjusted within each component, i.e. across books.
for _, row_labels in trend_df.groupby('pc').groups.items():
    trend_df.loc[row_labels, 'perm_qvalue'] = bh_adjust(trend_df.loc[row_labels, 'perm_pvalue'].to_numpy())

trend_df.to_csv(TABLE_DIR / 'temporal_trend_stats.csv', index=False)

trend_fig, trend_ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=trend_df, x='pc', y='corr_pc_position', color='#f4a261', ax=trend_ax)
# Zero line marks "no temporal trend".
trend_ax.axhline(0.0, color='black', linewidth=1)
trend_ax.set_title('Per-Book Temporal Trend Correlations by PCA Component')
trend_fig.tight_layout()
trend_fig.savefig(FIG_DIR / 'temporal_trend_summary.png', dpi=160, bbox_inches='tight')
plt.show()

display(trend_df.head(15))


In [None]:
# Component semantics via constrained exemplar selection
# Candidate pool: one row per (book, chunk, component) carrying the score and
# the chunk's text preview.
candidate_rows = [
    {
        'book_id': book['book_id'],
        'processed_dir': book['processed_dir'],
        'title': book['title'],
        'genre_primary': book['genre_primary'],
        'chunk_index': chunk_idx,
        'pc': f'PC{comp + 1}',
        'score': float(book['z5'][chunk_idx, comp]),
        'text_preview': preview_text,
    }
    for book in book_records
    for chunk_idx, preview_text in enumerate(book['chunk_previews'])
    for comp in range(5)
]

cand_df = pd.DataFrame(candidate_rows)


def select_exemplars(df: pd.DataFrame, direction: str, top_n: int, min_dist: int, max_per_book: int) -> pd.DataFrame:
    """Greedily pick the most extreme-scoring chunks under diversity limits.

    Rows are visited from highest score down when *direction* is
    ``'positive'`` and from lowest score up for any other value. A row is
    skipped when its book already contributed *max_per_book* rows or when its
    ``chunk_index`` is closer than *min_dist* to a row already taken from the
    same book. Selection stops once *top_n* rows are collected.

    Returns the selected rows with a ``direction`` column added; an empty
    frame (without that column) when nothing qualifies.
    """
    ranked = df.sort_values('score', ascending=(direction != 'positive'))

    picked = []
    # book_id -> chunk indices already taken from that book
    taken_by_book = {}

    for _, candidate in ranked.iterrows():
        book = int(candidate['book_id'])
        chunk = int(candidate['chunk_index'])
        already_taken = taken_by_book.get(book, [])

        if len(already_taken) >= max_per_book:
            continue
        if any(abs(chunk - earlier) < min_dist for earlier in already_taken):
            continue

        picked.append(candidate.to_dict())
        taken_by_book.setdefault(book, []).append(chunk)

        if len(picked) >= top_n:
            break

    result = pd.DataFrame(picked)
    if not result.empty:
        result['direction'] = direction
    return result


# Select positive and negative exemplars for each component and save them.
exemplar_columns = ['book_id', 'processed_dir', 'title', 'genre_primary', 'chunk_index', 'pc', 'direction', 'score', 'text_preview']

exemplar_frames = []
# An empty candidate frame has no 'pc' column, so filtering it would raise.
if not cand_df.empty:
    for pc in [f'PC{i}' for i in range(1, 6)]:
        pc_df = cand_df[cand_df['pc'] == pc]
        pos = select_exemplars(pc_df, 'positive', top_exemplars_per_direction, min_exemplar_distance, max_exemplars_per_book)
        neg = select_exemplars(pc_df, 'negative', top_exemplars_per_direction, min_exemplar_distance, max_exemplars_per_book)
        exemplar_frames.extend([pos, neg])

# pd.concat raises on an empty list; fall back to an empty frame with the
# expected columns, as the other output tables in this notebook do.
non_empty_frames = [f for f in exemplar_frames if not f.empty]
if non_empty_frames:
    exemplar_df = pd.concat(non_empty_frames, ignore_index=True)
    exemplar_df = exemplar_df[exemplar_columns]
else:
    exemplar_df = pd.DataFrame(columns=exemplar_columns)
exemplar_df.to_csv(TABLE_DIR / 'component_exemplar_chunks.csv', index=False)

print(f'Saved: {TABLE_DIR / "component_exemplar_chunks.csv"}')
display(exemplar_df.head(20))


In [None]:
# Book-level PCA metrics and signal associations (k=5,7,11)
# Per book: trajectory speed statistics and per-component summaries; per
# (book, k): correlation between PCA speed and the s / a signal arrays.
book_stats_rows = []
assoc_rows = []
assoc_issue_rows = []

for rec in book_records:
    z = rec['z5']
    # Step-to-step displacement in PCA space; speed has one entry fewer than z.
    dz = np.diff(z, axis=0)
    speed = np.linalg.norm(dz, axis=1)
    pos = np.linspace(0.0, 1.0, len(z))

    # A book with a single chunk has no steps, hence the NaN fallbacks.
    row = {
        'book_id': rec['book_id'],
        'processed_dir': rec['processed_dir'],
        'title': rec['title'],
        'genre_primary': rec['genre_primary'],
        'T': len(z),
        'mean_speed': float(np.mean(speed)) if len(speed) else np.nan,
        'p95_speed': float(np.quantile(speed, 0.95)) if len(speed) else np.nan,
        'speed_std': float(np.std(speed)) if len(speed) else np.nan,
    }

    for pc in range(5):
        x = z[:, pc]
        row[f'mean_pc{pc+1}'] = float(np.mean(x))
        row[f'std_pc{pc+1}'] = float(np.std(x))
        row[f'corr_pc{pc+1}_position'] = corr_safe(x, pos)
        row[f'sign_change_rate_pc{pc+1}'] = sign_change_rate(x)

    row['sign_change_rate_mean'] = float(np.nanmean([row[f'sign_change_rate_pc{i}'] for i in range(1, 6)]))
    book_stats_rows.append(row)

    for k in k_values:
        sig = rec['signals'].get(k)
        if sig is None:
            assoc_issue_rows.append({'book_id': rec['book_id'], 'processed_dir': rec['processed_dir'], 'k': k, 'issue': 'missing_or_invalid_signal', 'severity': 'warning'})
            continue

        s = sig['s']
        a = sig['a']
        if len(speed) != len(s) - 1 or len(speed) != len(a) - 1:
            assoc_issue_rows.append({'book_id': rec['book_id'], 'processed_dir': rec['processed_dir'], 'k': k, 'issue': 'length_mismatch_speed_signal', 'severity': 'warning'})
            continue

        assoc_rows.append({
            'book_id': rec['book_id'],
            'processed_dir': rec['processed_dir'],
            'title': rec['title'],
            'genre_primary': rec['genre_primary'],
            'k': k,
            'T': len(z),
            # speed[i] is the step into chunk i + 1, so the first signal
            # sample is dropped to align the two series.
            'corr_speed_s': corr_safe(speed, s[1:]),
            'corr_speed_a': corr_safe(speed, a[1:]),
            # Reuse the guarded values from the stats row: a single-chunk book
            # has an empty `speed`, and calling np.mean / np.quantile on it
            # here would warn or raise depending on the NumPy version.
            'mean_speed': row['mean_speed'],
            'p95_speed': row['p95_speed'],
        })

book_stats_df = pd.DataFrame(book_stats_rows)
book_stats_df.to_csv(TABLE_DIR / 'book_component_stats.csv', index=False)

assoc_df = pd.DataFrame(assoc_rows)
if assoc_df.empty:
    # Keep the expected columns so later cells can still filter on 'k'.
    assoc_df = pd.DataFrame(columns=['book_id', 'processed_dir', 'title', 'genre_primary', 'k', 'T', 'corr_speed_s', 'corr_speed_a', 'mean_speed', 'p95_speed'])
assoc_df.to_csv(TABLE_DIR / 'book_component_signal_assoc.csv', index=False)

assoc_issues_df = pd.DataFrame(assoc_issue_rows)
if assoc_issues_df.empty:
    assoc_issues_df = pd.DataFrame(columns=['book_id', 'processed_dir', 'k', 'issue', 'severity'])
assoc_issues_df.to_csv(TABLE_DIR / 'book_signal_assoc_issues.csv', index=False)

# Genre-level component association with effect-size style normalization
long_rows = []
for _, r in book_stats_df.iterrows():
    for pc in range(1, 6):
        long_rows.append({
            'book_id': int(r['book_id']),
            'processed_dir': r['processed_dir'],
            'genre_primary': r['genre_primary'],
            'pc': f'PC{pc}',
            'mean_score': float(r[f'mean_pc{pc}']),
        })

long_component_df = pd.DataFrame(long_rows)
# Corpus mean/std are taken across books (one mean score per book), not chunks.
corpus_stats = long_component_df.groupby('pc')['mean_score'].agg(['mean', 'std']).reset_index().rename(columns={'mean': 'corpus_mean', 'std': 'corpus_std'})
genre_stats = long_component_df.groupby(['genre_primary', 'pc'])['mean_score'].agg(['mean', 'count']).reset_index().rename(columns={'mean': 'genre_mean', 'count': 'genre_book_count'})
component_genre_assoc = genre_stats.merge(corpus_stats, on='pc', how='left')
component_genre_assoc['delta_vs_corpus_mean'] = component_genre_assoc['genre_mean'] - component_genre_assoc['corpus_mean']
# A zero corpus std becomes NaN so the effect size is NaN rather than +/-inf.
component_genre_assoc['effect_size_vs_corpus'] = component_genre_assoc['delta_vs_corpus_mean'] / component_genre_assoc['corpus_std'].replace(0, np.nan)
component_genre_assoc.to_csv(TABLE_DIR / 'component_genre_association.csv', index=False)

print(f'Saved: {TABLE_DIR / "book_component_stats.csv"}')
print(f'Saved: {TABLE_DIR / "book_component_signal_assoc.csv"}')
print(f'Saved: {TABLE_DIR / "component_genre_association.csv"}')
display(book_stats_df.head(10))
display(assoc_df.head(10))


In [None]:
# Moderate rigor checks: bootstrap CIs for corpus-level median signal associations
boot_columns = ['k', 'metric', 'median', 'ci_lower', 'ci_upper', 'n_books']
boot_rows = []
for k in sorted(assoc_df['k'].unique()):
    sub = assoc_df[assoc_df['k'] == k]
    n_books = int(sub['book_id'].nunique())

    # corr_speed_s is resampled before corr_speed_a for each k, which fixes
    # the order in which the shared `rng` is consumed.
    for metric in ('corr_speed_s', 'corr_speed_a'):
        med, lo, hi = bootstrap_median_ci(sub[metric].to_numpy(), n_boot=n_boot, rng=rng)
        boot_rows.append({
            'k': int(k),
            'metric': metric,
            'median': med,
            'ci_lower': lo,
            'ci_upper': hi,
            'n_books': n_books,
        })

# Passing the columns explicitly keeps the table well-formed when no book had
# usable signals: otherwise the frame has no columns and both the plot below
# and the later ci_lower/ci_upper integrity check fail with KeyError.
boot_df = pd.DataFrame(boot_rows, columns=boot_columns)
boot_df.to_csv(TABLE_DIR / 'corpus_assoc_bootstrap.csv', index=False)

if boot_df.empty:
    print('No signal association rows available; skipping bootstrap summary figure.')
else:
    plt.figure(figsize=(10, 5))
    plot_df = boot_df.copy()
    plot_df['label'] = plot_df['metric'] + '_k' + plot_df['k'].astype(str)
    plot_df = plot_df.sort_values(['metric', 'k'])
    # errorbar expects distances from the point, not absolute bounds.
    plt.errorbar(
        x=np.arange(len(plot_df)),
        y=plot_df['median'],
        yerr=[plot_df['median'] - plot_df['ci_lower'], plot_df['ci_upper'] - plot_df['median']],
        fmt='o',
        capsize=4,
    )
    plt.xticks(np.arange(len(plot_df)), plot_df['label'], rotation=45, ha='right')
    plt.title('Bootstrap 95% CI: Corpus Median PCA-Speed / Signal Associations')
    plt.tight_layout()
    plt.savefig(FIG_DIR / 'bootstrap_assoc_summary.png', dpi=160, bbox_inches='tight')
    plt.show()

# Book deep-dive figure (primary k=7)
# Shows the (up to) four books with the highest speed / a_t correlation.
k7 = assoc_df[assoc_df['k'] == primary_k].copy()
if not k7.empty:
    top_books = k7.sort_values('corr_speed_a', ascending=False).head(4)['book_id'].tolist()
    fig, axes = plt.subplots(len(top_books), 1, figsize=(12, 3.5 * len(top_books)), sharex=False)
    # plt.subplots returns a bare Axes (not an array) for a single row.
    if len(top_books) == 1:
        axes = [axes]

    lookup = {r['book_id']: r for r in book_records}
    for ax, bid in zip(axes, top_books):
        rec = lookup[int(bid)]
        z = rec['z5']
        speed = np.linalg.norm(np.diff(z, axis=0), axis=1)
        sig = rec['signals'][primary_k]

        # Speed starts at x=1 because speed[i] is the step into chunk i + 1.
        ax.plot(np.arange(1, len(speed) + 1), speed, label='pca_speed', linewidth=1.2)
        ax.plot(sig['s'], label='s_t', linewidth=1.0, alpha=0.8)
        ax.plot(sig['a'], label='a_t', linewidth=1.0, alpha=0.8)
        ax.set_title(f"Book {rec['book_id']} | {rec['title']} (k={primary_k})")
        ax.set_xlabel('Chunk index')
        ax.legend(loc='upper right', ncol=3)

    fig.tight_layout()
    fig.savefig(FIG_DIR / 'book_deep_dive_speed_signal_k7.png', dpi=160, bbox_inches='tight')
    plt.show()


In [None]:
# Projection consistency checks against stored pca_d5.npy
# For the five lowest-id loaded books, re-project the raw embeddings with the
# global PCA mean and components and compare against the stored scores.
sample_books = [r['book_id'] for r in sorted(book_records, key=lambda x: x['book_id'])[:5]]
proj_rows = []

for bid in sample_books:
    rec = next(r for r in book_records if r['book_id'] == bid)
    emb_path = rec['book_dir'] / 'embeddings.npy'
    if not emb_path.exists():
        proj_rows.append({'book_id': bid, 'processed_dir': rec['processed_dir'], 'checked': False, 'max_abs_diff': np.nan, 'mean_abs_diff': np.nan, 'detail': 'missing_embeddings.npy'})
        continue

    emb = np.load(emb_path).astype(np.float64)
    # Centre, then project onto the component rows.
    # NOTE(review): assumes the upstream PCA was fit without whitening or any
    # other scaling of the scores — confirm against 02_transform_and_cluster.
    z_recomputed = (emb - mean_vector.astype(np.float64)) @ components.astype(np.float64).T
    z_saved = rec['z5']

    if z_recomputed.shape != z_saved.shape:
        proj_rows.append({'book_id': bid, 'processed_dir': rec['processed_dir'], 'checked': False, 'max_abs_diff': np.nan, 'mean_abs_diff': np.nan, 'detail': f'shape_mismatch_recomputed_{z_recomputed.shape}_saved_{z_saved.shape}'})
        continue

    diff = np.abs(z_recomputed - z_saved)
    proj_rows.append({
        'book_id': bid,
        'processed_dir': rec['processed_dir'],
        'checked': True,
        'max_abs_diff': float(np.max(diff)),
        'mean_abs_diff': float(np.mean(diff)),
        'detail': 'ok',
    })

projection_df = pd.DataFrame(proj_rows)
projection_df.to_csv(TABLE_DIR / 'projection_consistency_checks.csv', index=False)

# Integrity checks required by plan
# Full coverage means one association row for every (loaded book, k) pair.
required_assoc_rows = len(book_records) * len(k_values)
assoc_rows_actual = len(assoc_df)
assoc_no_nan = bool(assoc_df[['corr_speed_s', 'corr_speed_a', 'mean_speed', 'p95_speed']].isna().sum().sum() == 0) if not assoc_df.empty else False

# NaN p/q-values are dropped before the bounds check, so these pass vacuously
# when every value is NaN.
pvals_ok = bool(((trend_df['perm_pvalue'].dropna() >= 0) & (trend_df['perm_pvalue'].dropna() <= 1)).all())
qvals_ok = bool(((trend_df['perm_qvalue'].dropna() >= 0) & (trend_df['perm_qvalue'].dropna() <= 1)).all())
ci_order_ok = bool((boot_df['ci_lower'] <= boot_df['ci_upper']).all()) if not boot_df.empty else False

# Every component must have at least one exemplar in each direction.
ex_counts = exemplar_df.groupby(['pc', 'direction']).size().reset_index(name='n')
expected_pairs = {(f'PC{i}', d) for i in range(1, 6) for d in ['positive', 'negative']}
actual_pairs = set(zip(ex_counts['pc'], ex_counts['direction']))
all_pairs_present = expected_pairs.issubset(actual_pairs)

# De-dup enforcement checks
# Re-verify the per-book cap and minimum chunk spacing on the saved exemplars.
rule_violations = []
for (pc, direction, book_id), grp in exemplar_df.groupby(['pc', 'direction', 'book_id']):
    if len(grp) > max_exemplars_per_book:
        rule_violations.append(f'{pc}-{direction}-book{book_id}:too_many')
    # Comparing neighbours in sorted order is enough: the closest pair of
    # indices is always adjacent once sorted.
    idx_sorted = sorted(grp['chunk_index'].astype(int).tolist())
    for i in range(1, len(idx_sorted)):
        if abs(idx_sorted[i] - idx_sorted[i - 1]) < min_exemplar_distance:
            rule_violations.append(f'{pc}-{direction}-book{book_id}:distance_violation')

# Tables that must exist on disk by this point in the run.
end_to_end_files = [
    TABLE_DIR / 'book_component_stats.csv',
    TABLE_DIR / 'book_component_signal_assoc.csv',
    TABLE_DIR / 'component_exemplar_chunks.csv',
    TABLE_DIR / 'component_genre_association.csv',
    TABLE_DIR / 'temporal_trend_stats.csv',
    TABLE_DIR / 'corpus_assoc_bootstrap.csv',
    TABLE_DIR / 'projection_consistency_checks.csv',
]

# Appended to the integrity_rows list started in the artifact-loading cell.
integrity_rows.extend([
    {'check': 'projection_consistency_checked_books', 'passed': bool(projection_df['checked'].all()), 'detail': f"rows={len(projection_df)}"},
    {'check': 'projection_consistency_max_abs_diff_lt_1e-4', 'passed': bool((projection_df['max_abs_diff'].dropna() < 1e-4).all()), 'detail': f"max={projection_df['max_abs_diff'].dropna().max() if not projection_df['max_abs_diff'].dropna().empty else np.nan}"},
    {'check': 'association_row_count_full_coverage', 'passed': bool(assoc_rows_actual == required_assoc_rows), 'detail': f"actual={assoc_rows_actual}, expected={required_assoc_rows}"},
    {'check': 'association_no_nan_core_numeric', 'passed': assoc_no_nan, 'detail': 'corr_speed_s/corr_speed_a/mean_speed/p95_speed'},
    {'check': 'perm_pvalues_in_0_1', 'passed': pvals_ok, 'detail': 'trend_df perm_pvalue bounds'},
    {'check': 'perm_qvalues_in_0_1', 'passed': qvals_ok, 'detail': 'trend_df perm_qvalue bounds'},
    {'check': 'bootstrap_ci_ordered', 'passed': ci_order_ok, 'detail': 'ci_lower <= ci_upper'},
    {'check': 'exemplar_pairs_present_pc1to5_posneg', 'passed': all_pairs_present, 'detail': f"pairs={len(actual_pairs)}"},
    {'check': 'exemplar_dedup_rules_enforced', 'passed': bool(len(rule_violations) == 0), 'detail': ';'.join(rule_violations[:5]) if rule_violations else 'ok'},
    {'check': 'end_to_end_required_files_exist', 'passed': bool(all(p.exists() for p in end_to_end_files)), 'detail': 'verified table outputs'},
])

integrity_df = pd.DataFrame(integrity_rows)
integrity_df.to_csv(TABLE_DIR / 'pca_integrity_checks.csv', index=False)

display(projection_df)
display(integrity_df)
print(f'Saved: {TABLE_DIR / "pca_integrity_checks.csv"}')


In [None]:
# Build narrative insights markdown
# Assembles insights.md from the tables computed in earlier cells.
k7 = assoc_df[assoc_df['k'] == primary_k].copy()
if k7.empty:
    raise RuntimeError('No k=7 association rows available; cannot build insights narrative.')

# Top-3 rankings used in the "Book-Level Highlights" section.
high_vol = book_stats_df.sort_values('mean_speed', ascending=False).head(3)
high_coupling = k7.sort_values('corr_speed_a', ascending=False).head(3)

trend_abs = trend_df.assign(abs_corr=trend_df['corr_pc_position'].abs())
atypical = trend_abs.sort_values('abs_corr', ascending=False).head(3)

# Median association per k for the sensitivity section.
sens = assoc_df.groupby('k')[['corr_speed_s', 'corr_speed_a']].median().reset_index()

lines = [
    '# PCA Component Insights (Global + Book)',
    '',
    '## Corpus-Level Component Diagnostics',
]
lines.extend(
    f'- PC{i} explained_variance_ratio={evr:.4f} | cumulative={np.sum(explained_variance_ratio[:i]):.4f}'
    for i, evr in enumerate(explained_variance_ratio, start=1)
)
lines.append('')

lines.append('## Component Semantics (Exemplar-Based)')
for pc in [f'PC{i}' for i in range(1, 6)]:
    of_component = exemplar_df[exemplar_df['pc'] == pc]
    pos = of_component[of_component['direction'] == 'positive'].head(2)
    neg = of_component[of_component['direction'] == 'negative'].head(2)
    lines.append(f'- {pc}:')
    for _, r in pos.iterrows():
        # Collapse whitespace runs and cap the preview at 140 characters.
        preview = ' '.join(str(r['text_preview']).split())[:140]
        lines.append(f"  + positive exemplar: {r['book_id']} | {r['title']} | score={r['score']:.3f} | {preview}")
    for _, r in neg.iterrows():
        preview = ' '.join(str(r['text_preview']).split())[:140]
        lines.append(f"  - negative exemplar: {r['book_id']} | {r['title']} | score={r['score']:.3f} | {preview}")
lines.append('')

lines.extend(['## Book-Level Highlights', 'Highest PCA trajectory volatility (mean_speed):'])
lines.extend(
    f"- {int(r['book_id'])} | {r['title']} | mean_speed={r['mean_speed']:.4f} | p95_speed={r['p95_speed']:.4f}"
    for _, r in high_vol.iterrows()
)
lines.append('')

lines.append(f'Strongest PCA-speed / acceleration coupling (k={primary_k}):')
lines.extend(
    f"- {int(r['book_id'])} | {r['title']} | corr_speed_a={r['corr_speed_a']:.4f} | corr_speed_s={r['corr_speed_s']:.4f}"
    for _, r in high_coupling.iterrows()
)
lines.append('')

lines.append('Most atypical temporal component trends (|corr(PC, position)|):')
lines.extend(
    f"- {int(r['book_id'])} | {r['title']} | {r['pc']} corr={r['corr_pc_position']:.4f} | q={r['perm_qvalue']:.4f}"
    for _, r in atypical.iterrows()
)
lines.append('')

lines.append('## Sensitivity Across k = 5, 7, 11')
lines.extend(
    f"- k={int(r['k'])}: median corr_speed_s={r['corr_speed_s']:.4f}, median corr_speed_a={r['corr_speed_a']:.4f}"
    for _, r in sens.iterrows()
)
lines.append('')

lines.extend([
    '## Caveats',
    '- PCA components are derived from embedding geometry and require semantic triangulation with text exemplars.',
    '- Association metrics are correlational and do not establish causal narrative mechanisms.',
    '- Missing/invalid signal artifacts are skipped and logged in integrity tables.',
    '',
])

insights_path = ANALYSIS_DIR / 'insights.md'
insights_path.write_text("\n".join(lines) + "\n", encoding='utf-8')

print(f'Saved: {insights_path}')
# Echo only the first 40 lines as a preview.
print("\n".join(lines[:40]))
