# Taxonomy Derivation

Discovers natural feedback categories from open-ended teacher descriptions via
sentence-transformer embedding and HDBSCAN clustering, then scores, selects,
and organizes them into a hierarchy.

In [None]:
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

import sys
sys.path.insert(0, 'src')

import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Input/output locations, relative to the directory the notebook runs from.
DATA_DIR = Path("data/masterclass_pipeline")  # JSONL inputs are read from here
CACHE_DIR = Path("data/masterclass_cache")
OUTPUT_DIR = Path("data/composite_labels")  # plots and JSON artifacts are written here
# parents=True: create data/ as well when it does not exist yet, instead of
# raising FileNotFoundError on the first run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

## Load open-ended descriptions

In [None]:
from masterclass_experiments.clustering import load_open_descriptions

# Read the open-ended moments file; the loader's result is unpacked into two
# objects. moment_ids is later zipped against the cluster labels derived from
# descriptions, so the two are presumably index-aligned — verify against
# load_open_descriptions.
moment_ids, descriptions = load_open_descriptions(DATA_DIR / "open_moments.jsonl")
print(f"Loaded {len(descriptions)} open descriptions")

## Embed and cluster

In [None]:
from masterclass_experiments.clustering import (
    embed_descriptions,
    cluster_descriptions,
    summarize_clusters,
)

# Turn each description into a vector, then cluster the vectors.
embeddings = embed_descriptions(descriptions)
print(f"Embeddings shape: {embeddings.shape}")

labels, clusterer = cluster_descriptions(embeddings, min_cluster_size=15)

# Label -1 is treated as noise: it is counted separately and excluded from
# the cluster count.
n_noise = (labels == -1).sum()
n_clusters = len(set(labels)) - int(-1 in labels)
print(f"Found {n_clusters} clusters, {n_noise} noise points ({n_noise/len(labels)*100:.1f}%)")

# Print up to five example descriptions per cluster for manual review.
summary = summarize_clusters(descriptions, labels)
for cluster in summary:
    print(f"\nCluster {cluster['cluster_id']} (n={cluster['size']}, freq={cluster['frequency']:.1%}):")
    shown_examples = cluster['examples'][:5]
    for example in shown_examples:
        print(f"  - {example}")

## UMAP visualization

In [None]:
import umap

# Reduce the embeddings with a fixed random_state so the layout is repeatable;
# the first two output columns become the x/y coordinates of the plot.
reducer = umap.UMAP(random_state=42)
coords = reducer.fit_transform(embeddings)

# One point per description, coloured by its cluster label.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    coords[:, 0],
    coords[:, 1],
    c=labels,
    cmap='tab20',
    s=10,
    alpha=0.6,
)
ax.set_title("HDBSCAN Clusters of Teaching Moment Descriptions")
fig.colorbar(scatter, ax=ax, label="Cluster ID")
fig.tight_layout()

# Save before show() so the written image matches what is displayed.
fig.savefig(str(OUTPUT_DIR / "cluster_umap.png"), dpi=150)
plt.show()

## Multi-signal scoring

In [None]:
from masterclass_experiments.scoring import (
    PERCEPIANO_MUQ_R2,
    compute_muq_predictability,
    compute_teacher_frequency,
    select_dimensions,
)

# Signal 1: frequency of each cluster, listed in ascending cluster-id order.
freq = compute_teacher_frequency(labels)
print("Teacher frequency per cluster:")
for cluster_id in sorted(freq):
    print(f"  Cluster {cluster_id}: {freq[cluster_id]:.1%}")

# Manual step: map each cluster_id to a list of PercePiano dimension names.
# Fill this in after reviewing the cluster examples printed by the
# "Embed and cluster" cell. Example entries:
#     0: ["dynamic_range"],
#     1: ["pedal_amount", "pedal_clarity"],
#     ...
PP_MAPPING = {}

# Signal 2: MuQ predictability derived from the mapping above. While
# PP_MAPPING is empty, the selection cell falls back to 0.0 for any cluster
# missing from muq_scores.
muq_scores = compute_muq_predictability(PP_MAPPING, PERCEPIANO_MUQ_R2)

## STOP contribution (requires masterclass feature pipeline)

In [None]:
# Compute STOP AUC delta per cluster.
# For each cluster, compare STOP AUC with vs without that cluster's
# PercePiano proxy dimensions in the feature vector.
#
# NOTE(review): this cell is still a stub. It only loads the moments and
# identifies segments; no AUC is computed here, and the selection cell
# hard-codes "stop_delta_auc" to 0.0 until this is implemented.

from masterclass_experiments.data import load_moments, identify_segments
# Imported for the planned steps below; not called anywhere in the visible cells yet.
from masterclass_experiments.features import extract_muq_features, extract_quality_scores
from masterclass_experiments.evaluation import leave_one_video_out_cv

moments = load_moments(DATA_DIR / "all_moments.jsonl")
segments = identify_segments(moments)

# Remaining steps (TODO):
# Load pre-computed features
# ... (use existing muq_embeddings from masterclass_cache)
# Compute STOP AUC with full 19-dim quality scores as baseline
# Then for each cluster, zero out its proxy dims and re-compute

## Selection and hierarchy

In [None]:
# Collect the three per-cluster signals into one record per cluster id.
candidates = {
    cid: {
        "frequency": cluster_freq,
        "muq_r2": muq_scores.get(cid, 0.0),  # 0.0 when the cluster has no MuQ score
        "stop_delta_auc": 0.0,  # Placeholder: fill from the STOP contribution cell
    }
    for cid, cluster_freq in freq.items()
}

# Split the candidates into kept and dropped dimensions.
kept, dropped = select_dimensions(candidates)
print(f"\nKept {len(kept)} dimensions, dropped {len(dropped)}")
for kept_id, kept_scores in kept.items():
    print(f"  Cluster {kept_id}: freq={kept_scores['frequency']:.1%}, muq_r2={kept_scores['muq_r2']:.3f}")

# Build hierarchy from kept clusters
from masterclass_experiments.scoring import build_hierarchy

# Each kept cluster is represented by the mean embedding of its members.
# Names are provisional ("cluster_<id>"); rename after manual review.
kept_dims = [
    {"name": f"cluster_{cid}", "centroid": embeddings[labels == cid].mean(axis=0)}
    for cid in kept
]

hierarchy = build_hierarchy(kept_dims, n_groups=4)
print("\nHierarchy:")
for grp in hierarchy:
    print(f"  {grp['group_name']}: {grp['dimensions']}")

## Name dimensions and build quote bank

In [None]:
from masterclass_experiments.quote_bank import build_quote_bank

# Manual step: name each kept cluster based on review
# DIMENSION_NAMES = {cluster_id: "human_readable_name", ...}
# Fill in after reviewing clusters

# Load the raw moment records (one JSON object per non-blank line).
# The encoding is pinned to UTF-8 so that non-ASCII characters in the records
# decode the same way on every platform instead of following the locale default.
with open(DATA_DIR / "open_moments.jsonl", encoding="utf-8") as f:
    open_moments = [json.loads(line) for line in f if line.strip()]

# Build moment -> dimension assignments.
# Noise points (label < 0) and clusters that were not kept are skipped.
# Placeholder names "dim_<label>" are used; replace the value with
# DIMENSION_NAMES[label] once real names exist.
assignments = {
    mid: f"dim_{label}"
    for mid, label in zip(moment_ids, labels)
    if label >= 0 and label in kept
}

# Group quotes by dimension and preview the first three of each.
bank = build_quote_bank(open_moments, assignments)
for dim, quotes in bank.items():
    print(f"\n{dim}: {len(quotes)} quotes")
    for q in quotes[:3]:
        print(f"  [{q['severity']}] {q['teacher']}: {q['feedback_summary']}")

## Save taxonomy artifacts

In [None]:
# Bundle everything derived above into one JSON-serialisable record.
# Cluster ids are stringified for the selection-score keys.
taxonomy_output = dict(
    dimensions={},  # Fill with final named dimensions + descriptions
    hierarchy=hierarchy,
    cluster_summary=summary,
    selection_scores={str(cid): scores for cid, scores in candidates.items()},
)

# Write the two artifacts into OUTPUT_DIR, definitions first, then the quote bank.
for filename, payload in (
    ("dimension_definitions.json", taxonomy_output),
    ("quote_bank.json", bank),
):
    with (OUTPUT_DIR / filename).open("w") as sink:
        json.dump(payload, sink, indent=2)

print(f"Saved to {OUTPUT_DIR}")