# Entropy-Complexity Planes for FMA Classical Subgenres

This notebook computes normalized permutation entropy (PE, stored as `H`) and statistical complexity (CE, stored as `C`) for the tracks in `fma_small` that carry a Classical genre tag, using four different signal representations:
1. **Amplitude** — Hilbert envelope
2. **Flux** — Spectral flux
3. **Harmony** — CQT chroma → circle-of-fifths angle
4. **Spectral Entropy** — frame-wise spectral entropy

Each entropy-complexity plane shows tracks colored by their Classical subgenre.

In [None]:
import sys, os, ast, warnings, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import ordpy
from tqdm import tqdm

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# The parent of the current working directory is treated as the project root.
# It is placed first on sys.path (once) so the `src` package can be imported.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

from src.chaos_methods import (
    ordpy_amplitude, ordpy_flux, ordpy_harmony, ordpy_specentropy
)

# Locations of the FMA metadata tables and of the fma_small audio tree,
# all resolved relative to PROJECT_ROOT.
TRACKS_CSV, GENRES_CSV = (
    os.path.join(PROJECT_ROOT, 'data', 'fma_metadata', csv_name)
    for csv_name in ('tracks.csv', 'genres.csv')
)
FMA_SMALL = os.path.join(PROJECT_ROOT, 'data', 'fma_small')

# Quick sanity output so a wrong working directory is noticed immediately.
print(f'Project root: {PROJECT_ROOT}')
print(f'FMA small dir exists: {os.path.isdir(FMA_SMALL)}')

## 1. Load Metadata

In [None]:
# ── Load genres ───────────────────────────────────────────────────────
# The first CSV column becomes the index (used later as the genre id).
genres_df = pd.read_csv(GENRES_CSV, index_col=0)
print(f'Total genres: {len(genres_df)}')

# ── Load tracks (complex multi-row header) ────────────────────────────
# Reading with a two-row header yields MultiIndex columns.
tracks_raw = pd.read_csv(TRACKS_CSV, header=[0, 1], low_memory=False)

# Flatten the MultiIndex: keep the second header level, falling back to the
# first level when the second is an empty string.
# NOTE(review): groups that share a second-level name collapse to the same
# flat label, so some flattened column names may be duplicated.
flat_cols = [col[1] if col[1] != '' else col[0] for col in tracks_raw.columns]
tracks_raw.columns = flat_cols

# The first column holds the track ids.
first_col = tracks_raw.columns[0]
tracks_raw = tracks_raw.rename(columns={first_col: 'track_id'})

# Non-data rows (e.g. a leftover 'track_id' label row) are removed by value:
# anything that is not numeric is coerced to NaN and dropped. Filtering by
# value instead of skipping a fixed number of rows cannot discard a real
# track if the label row is absent from the parsed data.
tracks_raw['track_id'] = pd.to_numeric(tracks_raw['track_id'], errors='coerce')
tracks_raw = tracks_raw.dropna(subset=['track_id'])
tracks_raw['track_id'] = tracks_raw['track_id'].astype(int)
tracks_raw = tracks_raw.set_index('track_id')

print(f'Total tracks: {len(tracks_raw)}')
print(f'Columns sample: {list(tracks_raw.columns[:10])}')

## 2. Identify Classical Subgenres and Filter Tracks

In [None]:
CLASSICAL_TOP_LEVEL_ID = 5

# Every genre whose `top_level` equals the Classical id belongs to the family.
classical_genres = genres_df.loc[
    genres_df['top_level'].eq(CLASSICAL_TOP_LEVEL_ID)
].copy()
classical_genre_ids = set(classical_genres.index.tolist())
classical_genre_names = classical_genres['title'].to_dict()  # {genre_id: genre_title}

# List the family alphabetically by title.
print(f'Classical subgenres found: {len(classical_genre_ids)}')
for gid in sorted(classical_genre_names, key=classical_genre_names.get):
    name = classical_genre_names[gid]
    print(f'  {gid:>4d}: {name}')

In [None]:
def parse_genre_list(val):
    """Parse a `genres_all` cell into a list of genre ids.

    The cell normally holds a Python-style list literal such as '[21, 12]'.

    Parameters
    ----------
    val : object
        Raw cell value: a string literal, an already-parsed list/tuple,
        or a missing value (None / NaN / empty string).

    Returns
    -------
    list
        The parsed ids. A bare integer literal is wrapped in a one-element
        list. Missing, malformed or non-list values yield an empty list.
    """
    # Already-parsed containers are passed through. This check comes first
    # because pd.isna on a list returns an array, whose truth value is
    # ambiguous inside an `or` expression.
    if isinstance(val, (list, tuple)):
        return list(val)
    if pd.isna(val) or val == '':
        return []
    try:
        parsed = ast.literal_eval(str(val))
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Malformed literal: best-effort parsing treats it as "no genres".
        # Only the errors literal_eval is documented to raise are caught, so
        # KeyboardInterrupt and SystemExit still propagate.
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, int) and not isinstance(parsed, bool):
        # A single id written without brackets.
        return [parsed]
    # Any other literal (dict, str, float, ...) is not a genre list.
    return []

def get_track_audio_path(track_id):
    """Return the mp3 path for an FMA track id.

    The id is zero-padded to six digits; the first three digits name the
    sub-folder under FMA_SMALL and the full padded id names the file.
    """
    stem = '{:06d}'.format(track_id)
    return os.path.join(FMA_SMALL, stem[:3], stem + '.mp3')

def get_most_specific_subgenre(genre_ids, genre_id_set, genre_names, top_level_id):
    """
    Pick one genre of the family described by `genre_id_set` for a track.

    Only ids from `genre_ids` that are members of `genre_id_set` are
    considered. The first such id that differs from `top_level_id` wins.
    If the only member present is the top-level genre itself, that one is
    returned.

    Returns a `(genre_id, genre_title)` tuple, or `(None, None)` when the
    track has no genre from the family. Titles missing from `genre_names`
    are reported as 'Unknown'.
    """
    in_family = [candidate for candidate in genre_ids if candidate in genre_id_set]
    if not in_family:
        return None, None
    # Fall back to the generic top-level id when nothing more specific exists.
    chosen = next(
        (candidate for candidate in in_family if candidate != top_level_id),
        top_level_id,
    )
    return chosen, genre_names.get(chosen, 'Unknown')

# ── Find genres_all column ────────────────────────────────────────────
# Substring match on the flattened column labels; falls back to the plain
# name when nothing matches.
genres_all_cols = [c for c in tracks_raw.columns if 'genres_all' in str(c)]
print(f'Columns containing genres_all: {genres_all_cols}')
genres_all_col = genres_all_cols[0] if genres_all_cols else 'genres_all'

# ── Filter tracks that are in fma_small and belong to Classical ──────
classical_tracks = []

for track_id in tqdm(tracks_raw.index, desc='Filtering Classical tracks'):
    # Skip tracks whose audio file is not present on disk.
    fpath = get_track_audio_path(track_id)
    if not os.path.isfile(fpath):
        continue

    genre_ids = parse_genre_list(tracks_raw.loc[track_id, genres_all_col])

    sub_id, sub_name = get_most_specific_subgenre(
        genre_ids, classical_genre_ids, classical_genre_names, CLASSICAL_TOP_LEVEL_ID
    )
    if sub_id is not None:
        classical_tracks.append({
            'track_id': track_id,
            'subgenre_id': sub_id,
            'subgenre': sub_name,
            'file_path': fpath
        })

# Explicit columns keep the schema intact even when no track matched.
# Without them an empty record list yields a frame with no columns, and the
# 'subgenre' lookup just below would raise KeyError.
classical_df = pd.DataFrame(
    classical_tracks,
    columns=['track_id', 'subgenre_id', 'subgenre', 'file_path'],
)
print(f'\nTotal Classical tracks in fma_small: {len(classical_df)}')
print(f'\nTracks per subgenre:')
print(classical_df['subgenre'].value_counts().to_string())

## 3. Compute Entropy-Complexity Values

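We compute PE and CE for each selected Classical track with each of the four methods. Every method is called once per track, so this cell may take some time; tracks for which a method raises an error are counted and skipped.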

In [None]:
# Signal representation name -> function returning (H, C) for an audio file.
METHODS = {
    'amplitude':    ordpy_amplitude,
    'flux':         ordpy_flux,
    'harmony':      ordpy_harmony,
    'specentropy':  ordpy_specentropy,
}

# Passed to every method as `dim_size` / `hop_size`.
DIM = 6
HOP = 1

# Fixed schema so that a method with zero successful tracks still produces a
# frame with the expected columns (dropna(subset=...) needs them to exist).
result_columns = ['track_id', 'H', 'C', 'subgenre', 'subgenre_id']
# At most this many failure messages are printed per method.
max_reported_failures = 5

# Results dict: method_name -> DataFrame with columns [track_id, H, C, subgenre]
results = {}

for method_idx, (method_name, method_func) in enumerate(METHODS.items(), 1):
    print(f'\n{"="*60}')
    print(f'[{method_idx}/{len(METHODS)}] Computing: {method_name} (dim={DIM}, hop={HOP})')
    print(f'{"="*60}')

    rows = []
    failures = []  # (track_id, 'ExceptionType: message') for every failed track
    t0 = time.time()

    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(total=len(classical_df), desc=f'{method_name}',
              bar_format='{l_bar}{bar:30}{r_bar}',
              file=sys.stdout) as pbar:
        for _, row in classical_df.iterrows():
            try:
                H, C = method_func(row['file_path'], dim_size=DIM, hop_size=HOP)
            except Exception as exc:
                # Best effort: one bad file must not abort the run, but keep
                # the reason so it can be reported afterwards.
                failures.append((row['track_id'], f'{type(exc).__name__}: {exc}'))
            else:
                rows.append({
                    'track_id': row['track_id'],
                    'H': H,
                    'C': C,
                    'subgenre': row['subgenre'],
                    'subgenre_id': row['subgenre_id']
                })

            pbar.update(1)
            pbar.set_postfix(ok=len(rows), err=len(failures))

    elapsed = time.time() - t0
    errors = len(failures)

    # Tracks whose H or C came back as NaN are discarded as well.
    df = pd.DataFrame(rows, columns=result_columns).dropna(subset=['H', 'C'])
    results[method_name] = df
    print(f'  ✓ {len(df)} tracks computed, {errors} errors, {elapsed:.1f}s elapsed')

    # Surface a sample of the failures instead of silently discarding them.
    for failed_id, message in failures[:max_reported_failures]:
        print(f'    ! track {failed_id}: {message}')
    if errors > max_reported_failures:
        print(f'    ! ... and {errors - max_reported_failures} more')

## 4. Plot Entropy-Complexity Planes

In [None]:
# Get ordpy boundary curves
# NOTE(review): the second positional argument (1) is presumably ordpy's
# vertical embedding dimension `dy` — confirm against the ordpy documentation.
max_HC = ordpy.maximum_complexity_entropy(DIM, 1)
min_HC = ordpy.minimum_complexity_entropy(DIM, 1)

# Build a consistent colormap across all methods
all_subgenres = sorted(classical_df['subgenre'].unique())
n_subgenres = len(all_subgenres)

# `plt.get_cmap` is used because `matplotlib.cm.get_cmap` is no longer
# available in recent Matplotlib releases. The palette is built from the raw
# colour lists of the two qualitative maps, so nothing is resampled: the
# first 20 subgenres get the 20 distinct tab20 colours, the next 20 get
# tab20b, and the palette cycles beyond that.
cmap = plt.get_cmap('tab20')
palette = list(cmap.colors) + list(plt.get_cmap('tab20b').colors)
colors = [palette[i % len(palette)] for i in range(n_subgenres)]

subgenre_colors = {sg: colors[i] for i, sg in enumerate(all_subgenres)}

print(f'Color mapping for {n_subgenres} subgenres:')
for sg in subgenre_colors:
    print(f'  {sg}')

In [None]:
# One 2×2 figure: a panel per method, all sharing one legend.
fig, axes = plt.subplots(2, 2, figsize=(18, 16))
fig.suptitle('Entropy-Complexity Planes for FMA Classical Subgenres', fontsize=16, fontweight='bold')

# Human-readable panel titles, keyed by method name.
method_titles = {
    'amplitude':   'Amplitude Envelope (Hilbert)',
    'flux':        'Spectral Flux',
    'harmony':     'Harmonic (CQT Chroma)',
    'specentropy': 'Spectral Entropy',
}

for method_name, ax in zip(METHODS, axes.flat):
    df = results[method_name]

    # Theoretical limits of the plane, drawn as dashed / dash-dotted curves.
    ax.plot(max_HC[:, 0], max_HC[:, 1], 'k--', linewidth=1.5, alpha=0.7, label='Max complexity')
    ax.plot(min_HC[:, 0], min_HC[:, 1], 'k-.', linewidth=1.5, alpha=0.7, label='Min complexity')

    # One scatter series per subgenre that actually has points for this method.
    for sg in all_subgenres:
        mask = df['subgenre'] == sg
        if not mask.any():
            continue
        members = df.loc[mask]
        ax.scatter(
            members['H'], members['C'],
            c=[subgenre_colors[sg]],
            label=f'{sg} ({mask.sum()})',
            s=30, alpha=0.6, edgecolors='none'
        )

    ax.set_xlabel('Normalized Permutation Entropy', fontsize=11)
    ax.set_ylabel('Statistical Complexity', fontsize=11)
    ax.set_title(method_titles.get(method_name, method_name), fontsize=13, fontweight='bold')
    ax.set(xlim=(0, 1), ylim=(0, None))
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal', 'box')

# The shared legend is taken from the first panel and placed below the grid.
handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='lower center', ncol=min(6, len(handles)),
           fontsize=9, bbox_to_anchor=(0.5, -0.02), frameon=True)

plt.tight_layout(rect=[0, 0.05, 1, 0.96])

# Write the figure to disk, creating the output folder on first use.
out_dir = os.path.join(PROJECT_ROOT, 'plots', 'fma_classical_subgenres')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, f'entropy_complexity_classical_subgenres_d{DIM}_h{HOP}.png')
plt.savefig(out_path, dpi=150, bbox_inches='tight')
print(f'Saved combined plot → {out_path}')
plt.show()

## 5. Individual Plots per Method (larger, more readable)

In [None]:
# One stand-alone, larger figure per method, each with its own legend.
for method_name in METHODS:
    df = results[method_name]

    fig, ax = plt.subplots(figsize=(12, 10))

    # Theoretical limits of the plane.
    ax.plot(max_HC[:, 0], max_HC[:, 1], 'k--', linewidth=1.5, alpha=0.7, label='Max complexity')
    ax.plot(min_HC[:, 0], min_HC[:, 1], 'k-.', linewidth=1.5, alpha=0.7, label='Min complexity')

    # One scatter series per subgenre that has points for this method.
    for sg in all_subgenres:
        mask = df['subgenre'] == sg
        if not mask.any():
            continue
        members = df.loc[mask]
        ax.scatter(
            members['H'], members['C'],
            c=[subgenre_colors[sg]],
            label=f'{sg} ({mask.sum()})',
            s=50, alpha=0.65, edgecolors='white', linewidths=0.3
        )

    title = method_titles.get(method_name, method_name)
    ax.set_xlabel('Normalized Permutation Entropy', fontsize=13)
    ax.set_ylabel('Statistical Complexity', fontsize=13)
    ax.set_title(f'Entropy-Complexity Plane: {title}\n(FMA Classical subgenres, dim={DIM}, hop={HOP})',
                 fontsize=14, fontweight='bold')
    ax.set(xlim=(0, 1), ylim=(0, None))
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal', 'box')
    ax.legend(loc='upper left', fontsize=8, ncol=2, framealpha=0.9)

    plt.tight_layout()

    # Saved into the output folder created by the combined-plot cell.
    out_path = os.path.join(out_dir, f'EC_{method_name}_classical_d{DIM}_h{HOP}.png')
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    print(f'Saved → {out_path}')
    plt.show()

## 6. Summary Statistics

In [None]:
# Per-method table: track count plus mean / standard deviation of H and C
# for every subgenre, largest subgenres first.
for method_name in results:
    df = results[method_name]

    print(f'\n{"="*50}')
    print(f'Method: {method_name}')
    print(f'{"="*50}')

    named_aggregations = {
        'n_tracks': ('H', 'count'),
        'H_mean': ('H', 'mean'),
        'H_std': ('H', 'std'),
        'C_mean': ('C', 'mean'),
        'C_std': ('C', 'std'),
    }
    per_subgenre = df.groupby('subgenre').agg(**named_aggregations)
    stats = per_subgenre.sort_values('n_tracks', ascending=False)

    print(stats.to_string(float_format='%.4f'))