# Lexical evolution

Album-level averages of track-level lexical diversity (type-token ratio) and average sentence length, compared across albums in order of release year.

In [None]:
import json
from pathlib import Path
import matplotlib.pyplot as plt

def find_root():
    """Locate the project root by searching upward from the working directory.

    Returns:
        The nearest directory (the current working directory itself, or the
        closest of its ancestors) that contains
        ``data/exports/radiohead_complete.json``.

    Raises:
        FileNotFoundError: if neither the working directory nor any ancestor
            contains that export file.
    """
    export_rel = Path('data') / 'exports' / 'radiohead_complete.json'
    start = Path.cwd()
    # Nearest-first search: the working directory, then each parent in turn.
    match = next(
        (folder for folder in (start, *start.parents) if (folder / export_rel).exists()),
        None,
    )
    if match is None:
        raise FileNotFoundError('radiohead_complete.json not found; run src/processing/ingest_csv.py')
    return match

# Locate the project root and load the exported JSON into `data`.
root = find_root()
data_path = root / 'data' / 'exports' / 'radiohead_complete.json'
# Parse from raw bytes rather than read_text(): json.loads detects
# UTF-8/UTF-16/UTF-32 on its own, so decoding no longer depends on the
# platform's default text encoding.
data = json.loads(data_path.read_bytes())
print(f"Loaded {len(data)} tracks from {data_path}")


In [None]:
# Aggregate by album: bucket every row of `data` under its album name.
by_album = {}
for track in data:
    by_album.setdefault(track["album_name"], []).append(track)


def _summarize_album(name, tracks):
    """Build the summary record for one album from its list of rows.

    Both metrics are plain means over the album's rows; a row that lacks a
    metric key contributes 0 to that mean. The year is read from the first row.
    """
    count = len(tracks)
    mean_ttr = sum(t.get("type_token_ratio", 0) for t in tracks) / count
    mean_sentence_len = sum(t.get("avg_sentence_length", 0) for t in tracks) / count
    return {
        "album": name,
        "year": tracks[0]["album_year"],
        "ttr": mean_ttr,
        "avg_sentence_length": mean_sentence_len,
    }


# One record per album, ordered by year (the sort is stable, so albums sharing
# a year keep their first-seen order).
records = sorted(
    [_summarize_album(name, tracks) for name, tracks in by_album.items()],
    key=lambda rec: rec["year"],
)
records

In [None]:
%matplotlib inline
years = [r["year"] for r in records]
ttr = [r["ttr"] for r in records]
sent_len = [r["avg_sentence_length"] for r in records]
labels = [r["album"] for r in records]

fig, ax1 = plt.subplots(figsize=(10, 4))
ax1.plot(years, ttr, marker="o", color="#10b981", label="Type-token ratio")
ax1.set_xlabel("Year")
ax1.set_ylabel("Type-token ratio", color="#10b981")
ax1.tick_params(axis='y', labelcolor="#10b981")

ax2 = ax1.twinx()
ax2.plot(years, sent_len, marker="s", color="#f59e0b", label="Avg sentence length")
ax2.set_ylabel("Tokens per sentence", color="#f59e0b")
ax2.tick_params(axis='y', labelcolor="#f59e0b")

for x, y, label in zip(years, ttr, labels):
    ax1.text(x, y + 0.01, label, fontsize=8, ha='center')

plt.title("Lexical diversity and sentence length over albums")
fig.tight_layout()
plt.show()