# Hypothesis testing sketches

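Quick, non-rigorous checks for H1 (Kid A coldness) and H4 (In Rainbows outlier), plus a type-token-ratio comparison used as a fragmentation proxy. Each check is a two-sided Mann-Whitney U test on per-track metrics.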

In [None]:
import json
from pathlib import Path
import numpy as np
from scipy import stats

def find_root():
    """Locate the project root by searching upward from the working directory.

    The current working directory is checked first, then each of its
    ancestors in turn. The first directory that contains
    ``data/exports/radiohead_complete.json`` is returned.

    Raises:
        FileNotFoundError: if no directory on that path contains the export.
    """
    export_rel = Path('data') / 'exports' / 'radiohead_complete.json'
    start = Path.cwd()
    search_path = (start, *start.parents)
    root_dir = next((d for d in search_path if (d / export_rel).exists()), None)
    if root_dir is None:
        raise FileNotFoundError('radiohead_complete.json not found; run src/processing/ingest_csv.py')
    return root_dir

# Resolve the project root and load the exported JSON into `data`.
root = find_root()
data_path = root / 'data' / 'exports' / 'radiohead_complete.json'
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of the
# platform default, so the load does not depend on the machine's locale.
data = json.loads(data_path.read_text(encoding="utf-8"))
print(f"Loaded {len(data)} tracks from {data_path}")


In [None]:
# Group the loaded rows by album name so metrics can be pulled per album.
# Keys appear in order of first occurrence; rows keep their original order.
by_album = {}
for track in data:
    album_name = track["album_name"]
    if album_name not in by_album:
        by_album[album_name] = []
    by_album[album_name].append(track)

def album_metric(album, key):
    """Return the ``key`` value of every row grouped under ``album``.

    Rows are read from the module-level ``by_album`` mapping and the values
    are returned as a NumPy array in the rows' stored order.

    Raises:
        KeyError: if ``album`` is not in ``by_album`` or a row lacks ``key``.
    """
    album_rows = by_album[album]
    values = [entry[key] for entry in album_rows]
    return np.array(values)

# Sorted album names; the bare expression displays the list in the notebook.
albums = sorted(by_album)
albums

In [None]:
# H1: compare Kid A sentiment with the pooled pre-2000 albums
# (Pablo Honey, The Bends, OK Computer) using a two-sided Mann-Whitney U test.
pre2000_albums = ["Pablo Honey", "The Bends", "OK Computer"]
pre2000 = np.concatenate(
    [album_metric(title, "sentiment_score") for title in pre2000_albums]
)
kid_a = album_metric("Kid A", "sentiment_score")
h1_result = stats.mannwhitneyu(kid_a, pre2000, alternative="two-sided")
stat, p = h1_result.statistic, h1_result.pvalue
print("Kid A vs pre-2000 sentiment:")
print(
    "kid_a mean", kid_a.mean(),
    "pre mean", pre2000.mean(),
    "U", stat,
    "p", p,
)

In [None]:
# H4: compare In Rainbows sentiment with Kid A (two-sided Mann-Whitney U).
# Reuses the `kid_a` sentiment array computed in the H1 cell.
in_rainbows = album_metric("In Rainbows", "sentiment_score")
h4_result = stats.mannwhitneyu(in_rainbows, kid_a, alternative="two-sided")
stat2, p2 = h4_result.statistic, h4_result.pvalue
print("In Rainbows vs Kid A sentiment:")
print(
    "in_rainbows mean", in_rainbows.mean(),
    "kid_a mean", kid_a.mean(),
    "U", stat2,
    "p", p2,
)

In [None]:
# Fragmentation proxy: type-token ratio (higher = more diverse tokens per track).
# Kid A is compared with the pooled late-era albums via a two-sided
# Mann-Whitney U test.
late_albums = ["In Rainbows", "The King of Limbs", "A Moon Shaped Pool"]
kid_a_ttr = album_metric("Kid A", "type_token_ratio")
late_ttr = np.concatenate(
    [album_metric(title, "type_token_ratio") for title in late_albums]
)
ttr_result = stats.mannwhitneyu(kid_a_ttr, late_ttr, alternative="two-sided")
stat3, p3 = ttr_result.statistic, ttr_result.pvalue
print("Kid A vs late-era type-token ratio:")
print(
    "kid_a mean", kid_a_ttr.mean(),
    "late mean", late_ttr.mean(),
    "U", stat3,
    "p", p3,
)