# Exploratory analysis: word counts and sentiment placeholders

Quick pass using the exported JSON. Uses only the standard library and matplotlib.

In [None]:
import json
from collections import defaultdict
from pathlib import Path

def find_root():
    """Return the nearest directory that contains the exported dataset.

    The search starts at the current working directory and then walks up
    through each of its parents, returning the first directory under which
    ``data/exports/radiohead_complete.json`` exists.

    Raises:
        FileNotFoundError: if neither the working directory nor any of its
            ancestors contains the export file.
    """
    marker = Path('data', 'exports', 'radiohead_complete.json')
    start = Path.cwd()
    # Closest directory first, so a nested checkout wins over an outer one.
    search_order = (start, *start.parents)
    found = next(
        (folder for folder in search_order if (folder / marker).exists()),
        None,
    )
    if found is None:
        raise FileNotFoundError('radiohead_complete.json not found; run src/processing/ingest_csv.py')
    return found

# Locate the project root, then load the exported JSON into memory.
root = find_root()
data_path = root / 'data' / 'exports' / 'radiohead_complete.json'
# Decode explicitly as UTF-8: without an encoding argument read_text() uses the
# platform's locale encoding, which can fail or mis-decode non-ASCII text
# on systems whose default is not UTF-8.
data = json.loads(data_path.read_text(encoding="utf-8"))
# len(data) is reported as the track count; this presumes the export is a
# JSON array with one entry per track.
print(f"Loaded {len(data)} tracks from {data_path}")


In [None]:
# Aggregate per-album totals in a single pass over the loaded tracks.
# defaultdict creates a zeroed accumulator the first time an album is seen.
album_stats = defaultdict(lambda: {"word_sum": 0, "tracks": 0, "sentiment_sum": 0.0})
for row in data:
    album = row["album_name"]  # required field: a row without it raises KeyError
    # `.get(key) or default` covers both a missing key and an explicit JSON
    # null; `.get(key, default)` would hand back None for a null and make the
    # += below raise TypeError.
    album_stats[album]["word_sum"] += row.get("word_count") or 0
    album_stats[album]["sentiment_sum"] += row.get("sentiment_score") or 0.0
    album_stats[album]["tracks"] += 1

# Turn the running sums into one summary record per album.
records = []
for album, stats in album_stats.items():
    # An album only gets an entry when at least one row mentions it, so
    # tracks >= 1 and the divisions below cannot divide by zero.
    tracks = stats["tracks"]
    records.append(
        {
            "album": album,
            "avg_words": stats["word_sum"] / tracks,
            "avg_sentiment": stats["sentiment_sum"] / tracks,
            "tracks": tracks,
        }
    )

records = sorted(records, key=lambda r: r["album"])  # alphabetical by album name
records  # bare expression: displayed as the notebook cell's output

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Bar chart of the average number of words per track, one bar per album.
# Labels and bar heights are pulled from `records` in its existing order.
albums = [r["album"] for r in records]
avg_words = [r["avg_words"] for r in records]

# Start a new 10x4 inch figure; the pyplot calls below all act on it.
plt.figure(figsize=(10, 4))
plt.bar(albums, avg_words, color="#3c6e71")
# Rotate the album names 45 degrees and right-align them under their bars.
plt.xticks(rotation=45, ha="right")
plt.ylabel("Average words per track")
# Only avg_words is plotted here; the avg_sentiment values are not drawn.
plt.title("Radiohead lyric length by album (placeholder sentiment ready)")
# Re-fit the layout before rendering (presumably so the rotated tick labels
# are not clipped at the figure edge).
plt.tight_layout()
plt.show()