# Topic modeling (quick LDA sketch)

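Fit a rough LDA model on the Kaggle lyrics export (`data/exports/radiohead_complete.json`) to surface the dominant terms in each topic, then summarize the topic mix per album.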

In [None]:
import json
from pathlib import Path
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def find_root():
    """Locate the directory that holds ``data/exports/radiohead_complete.json``.

    The search starts at the current working directory and walks up through
    each of its parents, returning the first directory that contains the
    export.

    Raises:
        FileNotFoundError: If no directory on that path contains the export.
    """
    export_rel = Path('data') / 'exports' / 'radiohead_complete.json'
    start = Path.cwd()
    # Lazy scan, nearest directory first; stops at the first hit.
    hits = (folder for folder in (start, *start.parents) if (folder / export_rel).exists())
    found = next(hits, None)
    if found is None:
        raise FileNotFoundError('radiohead_complete.json not found; run src/processing/ingest_csv.py')
    return found

# Resolve the project root, then load the exported track list.
root = find_root()
data_path = root / 'data' / 'exports' / 'radiohead_complete.json'
# Decode explicitly as UTF-8: without an encoding argument read_text() uses the
# platform's locale encoding, which can fail on or garble non-ASCII lyrics.
data = json.loads(data_path.read_text(encoding="utf-8"))
print(f"Loaded {len(data)} tracks from {data_path}")


In [None]:
# Vectorize
# NOTE(review): no visible cell defines `corpus`, so it is built here. This
# assumes each track dict keeps its text under "lyrics" — confirm against the
# output of src/processing/ingest_csv.py.
# Every track contributes exactly one document (an empty string when the text
# is missing or not a string) so that row i of X corresponds to data[i].
corpus = []
for row in data:
    text = row.get("lyrics")
    corpus.append(text if isinstance(text, str) else "")

# Bag-of-words counts with English stop words removed; terms must occur in at
# least 2 documents and the vocabulary is capped at 2000 terms.
vectorizer = CountVectorizer(stop_words="english", max_features=2000, min_df=2)
X = vectorizer.fit_transform(corpus)

# 5-topic LDA; the fixed random_state keeps the fit reproducible between runs.
lda = LatentDirichletAllocation(n_components=5, random_state=42, learning_method="online")
lda.fit(X)
# Vocabulary terms, indexed by the column positions of X.
feature_names = vectorizer.get_feature_names_out()

def top_words(component, n=10, names=None):
    """Return the ``n`` highest-weighted vocabulary terms of one topic.

    Args:
        component: 1-D array of per-term weights (e.g. one row of
            ``lda.components_``).
        n: Non-negative number of terms to return; ``0`` yields an empty list.
        names: Sequence mapping a term index to its term. Defaults to the
            notebook-level ``feature_names``.

    Returns:
        A list of terms ordered from highest to lowest weight.
    """
    if names is None:
        names = feature_names
    # Reverse first, then slice: `argsort()[-n:]` would return *every* term
    # for n == 0 because `[-0:]` is the whole array.
    indices = component.argsort()[::-1][:n]
    return [names[i] for i in indices]

# One list of top terms (default length) per fitted topic, in topic order.
topics = list(map(top_words, lda.components_))
topics

In [None]:
# Album-topic mix
import numpy as np

# Per-document topic weights, one row per document in X.
doc_topic = lda.transform(X)
albums = [row["album_name"] for row in data]

# Row i of doc_topic must describe data[i]; fail loudly instead of crediting a
# track's topics to the wrong album (or silently ignoring trailing documents).
if doc_topic.shape[0] != len(albums):
    raise ValueError(
        f"doc_topic has {doc_topic.shape[0]} rows but data has {len(albums)} tracks"
    )

# Sort the album names once; the id of an album is its position in this list.
album_names_sorted = sorted(set(albums))
album_ids = {a: i for i, a in enumerate(album_names_sorted)}
album_topic = np.zeros((len(album_ids), doc_topic.shape[1]))

# Accumulate each track's topic weights into its album's row.
for doc_idx, album in enumerate(albums):
    album_topic[album_ids[album]] += doc_topic[doc_idx]

# Rescale every album row so its topic weights sum to 1.
album_topic = album_topic / album_topic.sum(axis=1, keepdims=True)

summary = []
for i, album in enumerate(album_names_sorted):
    top_topic = album_topic[i].argmax()
    # Cast both NumPy scalars to builtins so the displayed summary shows plain
    # numbers rather than NumPy scalar reprs.
    summary.append({"album": album, "top_topic": int(top_topic), "weight": round(float(album_topic[i][top_topic]), 3)})
summary