# Entropy-Complexity Planes — All FMA Genres Combined

This notebook computes the normalized permutation entropy (PE) and the statistical complexity (CE) for **all** tracks in `fma_small`, labeling each track by its **top-level genre** (e.g. Post-Rock → Rock, Contemporary Classical → Classical).

Methods (the signal each measure is computed from): amplitude envelope, spectral flux, harmony (CQT chroma), and spectral entropy.

In [1]:
import sys, os, ast, warnings, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import ordpy
from tqdm import tqdm

# Suppress every warning for the rest of the kernel session so that library
# warnings do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# The project root is taken to be the parent of the current working directory
# (presumably the notebook is launched from a sub-folder of the repository —
# verify if the notebook is moved).
_cwd = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(_cwd, '..'))

# The root must be on sys.path *before* the project-local import below.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.chaos_methods import ordpy_amplitude, ordpy_flux
from src.chaos_methods import ordpy_harmony, ordpy_specentropy

# Dataset locations, all relative to the project root.
_data_dir = os.path.join(PROJECT_ROOT, 'data')
_metadata_dir = os.path.join(_data_dir, 'fma_metadata')
TRACKS_CSV = os.path.join(_metadata_dir, 'tracks.csv')
GENRES_CSV = os.path.join(_metadata_dir, 'genres.csv')
FMA_SMALL = os.path.join(_data_dir, 'fma_small')

print(f'Project root: {PROJECT_ROOT}')
print(f'FMA small dir exists: {os.path.isdir(FMA_SMALL)}')

Project root: /Users/mverzhbitskiy/Documents/GitHub/3rdCourseWork
FMA small dir exists: True


## 1. Load Metadata

In [2]:
# ── Genre taxonomy ────────────────────────────────────────────────────
genres_df = pd.read_csv(GENRES_CSV, index_col=0)
print(f'Total genres: {len(genres_df)}')

# A root genre is a row whose own id equals its 'top_level' id. Map the root
# ids to their titles first, then point every genre id at its root's title
# (or at an 'Unknown(<id>)' placeholder when the root id has no title).
is_root = genres_df.index == genres_df['top_level']
top_level_names = genres_df.loc[is_root, 'title'].to_dict()
genre_to_toplevel_name = {
    gid: top_level_names.get(tl_id, f'Unknown({tl_id})')
    for gid, tl_id in genres_df['top_level'].items()
}

print(f'\nTop-level genres ({len(top_level_names)}):')
roots_by_title = sorted(top_level_names.items(), key=lambda item: item[1])
for gid, name in roots_by_title:
    print(f'  {gid:>4d}: {name}')

# ── Track metadata ────────────────────────────────────────────────────
# The file is read with a two-row header; the first remaining row is dropped
# (presumably a leftover header line — the numeric filter below would also
# discard any row whose id is not a number).
tracks_raw = pd.read_csv(TRACKS_CSV, header=[0, 1], low_memory=False).iloc[1:]

# Collapse the two header levels into one: prefer the second-level label and
# fall back to the first-level label when the second is empty.
flat_cols = [lower if lower != '' else upper for upper, lower in tracks_raw.columns]
tracks_raw.columns = flat_cols

# The first column carries the track ids; keep only rows where it parses as a
# number, store it as int, and use it as the index.
first_col = tracks_raw.columns[0]
tracks_raw = tracks_raw.rename(columns={first_col: 'track_id'})
numeric_ids = pd.to_numeric(tracks_raw['track_id'], errors='coerce')
has_id = numeric_ids.notna()
tracks_raw = tracks_raw.loc[has_id].copy()
tracks_raw['track_id'] = numeric_ids.loc[has_id].astype(int)
tracks_raw = tracks_raw.set_index('track_id')

print(f'\nTotal tracks in CSV: {len(tracks_raw)}')

Total genres: 163

Top-level genres (16):
     3: Blues
     5: Classical
     9: Country
    13: Easy Listening
    15: Electronic
    38: Experimental
    17: Folk
    21: Hip-Hop
  1235: Instrumental
     2: International
     4: Jazz
     8: Old-Time / Historic
    10: Pop
    12: Rock
    14: Soul-RnB
    20: Spoken

Total tracks in CSV: 106574


## 2. Map All Tracks to Top-Level Genres

In [3]:
def parse_genre_list(val):
    """Parse a genre-id list stored as text (e.g. ``'[21, 811]'``) into a list.

    Parameters
    ----------
    val : str, list, tuple, float or None
        Raw cell value. Lists and tuples are accepted as-is; missing values
        (NaN / None) and the empty string mean "no genres".

    Returns
    -------
    list
        The parsed ids. An empty list is returned for missing values, for
        text that is not a valid Python literal, and for literals that are
        not a list, tuple or set (e.g. a bare number).
    """
    # Already-parsed sequences pass straight through. This check must come
    # before pd.isna, which returns an array for list input and would make the
    # `if` below raise on an ambiguous truth value.
    if isinstance(val, (list, tuple)):
        return list(val)
    if pd.isna(val) or val == '':
        return []
    try:
        parsed = ast.literal_eval(str(val))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval documents for malformed input are
        # swallowed; KeyboardInterrupt and friends propagate.
        return []
    # Callers iterate over the result, so a scalar literal is treated as
    # "no usable genre list" rather than returned as-is.
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []

def get_track_audio_path(track_id, root_dir=None):
    """Build the path of a track's MP3 file inside an FMA audio directory.

    The id is zero-padded to six digits; the first three digits name the
    sub-folder and the full six digits name the file, e.g. id 2 →
    ``<root>/000/000002.mp3``.

    Parameters
    ----------
    track_id : int
        Track id (must be formattable with the ``d`` format code).
    root_dir : str, optional
        Audio root directory. Defaults to the notebook-level ``FMA_SMALL``,
        looked up at call time.

    Returns
    -------
    str
        The file path; its existence is not checked here.
    """
    if root_dir is None:
        root_dir = FMA_SMALL
    tid_str = f'{track_id:06d}'
    return os.path.join(root_dir, tid_str[:3], f'{tid_str}.mp3')

def get_top_level_genre(genre_ids, genre_to_toplevel_name):
    """Resolve a track's genre ids to a single top-level genre name.

    The ids are scanned in order; the first id that has a non-empty entry in
    ``genre_to_toplevel_name`` decides the result. ``None`` is returned when
    no id maps to a name (including when ``genre_ids`` is empty).
    """
    candidates = (genre_to_toplevel_name.get(gid) for gid in genre_ids)
    return next((name for name in candidates if name), None)

# Locate the column that holds each track's full genre-id list; fall back to
# the plain name when no column label contains 'genres_all'.
genres_all_cols = [col for col in tracks_raw.columns if 'genres_all' in str(col)]
genres_all_col = 'genres_all' if not genres_all_cols else genres_all_cols[0]

# ── Collect every track that has an audio file and a resolvable genre ──
all_tracks = []

progress = tqdm(tracks_raw.index, desc='Mapping tracks to top-level genres')
for track_id in progress:
    fpath = get_track_audio_path(track_id)
    # Only tracks whose MP3 is present in the audio directory are kept.
    if os.path.isfile(fpath):
        raw_genres = tracks_raw.loc[track_id, genres_all_col]
        top_genre = get_top_level_genre(parse_genre_list(raw_genres),
                                        genre_to_toplevel_name)
        if top_genre is not None:
            all_tracks.append(
                dict(track_id=track_id, genre=top_genre, file_path=fpath)
            )

all_df = pd.DataFrame(all_tracks)
print(f'\nTotal tracks in fma_small with genre labels: {len(all_df)}')
print('\nTracks per top-level genre:')
genre_counts = all_df['genre'].value_counts()
print(genre_counts.to_string())

Mapping tracks to top-level genres: 100%|██████████| 106574/106574 [00:00<00:00, 143704.73it/s]


Total tracks in fma_small with genre labels: 8000

Tracks per top-level genre:
genre
Hip-Hop          1000
Pop              1000
Folk             1000
Experimental     1000
Rock             1000
International    1000
Electronic       1000
Instrumental     1000





## 3. Compute Entropy-Complexity Values

In [4]:
# Feature extractors keyed by the short method name used for results, plot
# titles and file names. Each is called as f(path, dim_size=..., hop_size=...)
# and is expected to return an (H, C) pair.
METHODS = {
    'amplitude':    ordpy_amplitude,
    'flux':         ordpy_flux,
    'harmony':      ordpy_harmony,
    'specentropy':  ordpy_specentropy,
}

DIM = 6   # passed to every method as dim_size
HOP = 1   # passed to every method as hop_size

# A full pass over all tracks takes from ~10 minutes to over an hour per
# method, so each method's finished table is cached on disk and reused on the
# next run. The cache key is only (method, DIM, HOP): delete the files or set
# USE_CACHE = False after changing the track list or the method code.
USE_CACHE = True
CACHE_DIR = os.path.join(PROJECT_ROOT, 'data', 'cache', 'fma_all_genres')
os.makedirs(CACHE_DIR, exist_ok=True)

RESULT_COLUMNS = ['track_id', 'H', 'C', 'genre']

results = {}    # method name → DataFrame with RESULT_COLUMNS
failures = {}   # method name → DataFrame(track_id, error) for tracks that raised

for method_idx, (method_name, method_func) in enumerate(METHODS.items(), 1):
    print(f'\n{"="*60}')
    print(f'[{method_idx}/{len(METHODS)}] Computing: {method_name} (dim={DIM}, hop={HOP})')
    print(f'{"="*60}')

    cache_path = os.path.join(CACHE_DIR, f'{method_name}_d{DIM}_h{HOP}.csv')
    if USE_CACHE and os.path.isfile(cache_path):
        df = pd.read_csv(cache_path)
        results[method_name] = df
        print(f'  ✓ {len(df)} tracks loaded from cache → {cache_path}')
        continue

    rows = []
    failed = []
    t0 = time.time()

    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(total=len(all_df), desc=f'{method_name}',
              bar_format='{l_bar}{bar:30}{r_bar}',
              file=sys.stdout) as pbar:
        for track in all_df.itertuples(index=False):
            try:
                H, C = method_func(track.file_path, dim_size=DIM, hop_size=HOP)
            except Exception as exc:
                # Best effort: one undecodable file must not abort the run,
                # but keep the reason so the failures can be inspected later.
                failed.append({
                    'track_id': track.track_id,
                    'error': f'{type(exc).__name__}: {exc}',
                })
            else:
                rows.append({
                    'track_id': track.track_id,
                    'H': H,
                    'C': C,
                    'genre': track.genre,
                })

            pbar.update(1)
            pbar.set_postfix(ok=len(rows), err=len(failed))

    elapsed = time.time() - t0

    # Explicit columns keep dropna(subset=...) valid even when every track failed.
    df = pd.DataFrame(rows, columns=RESULT_COLUMNS).dropna(subset=['H', 'C'])
    results[method_name] = df
    failures[method_name] = pd.DataFrame(failed, columns=['track_id', 'error'])

    # Only a method that ran to completion is cached; an interrupted method
    # never reaches this line and is recomputed next time.
    if USE_CACHE:
        df.to_csv(cache_path, index=False)

    print(f'  ✓ {len(df)} tracks computed, {len(failed)} errors, {elapsed:.1f}s elapsed')


[1/4] Computing: amplitude (dim=6, hop=1)
amplitude:   6%|█▊                            | 491/8000 [01:19<19:49,  6.31it/s, err=0, ok=491]

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


amplitude:  11%|███▍                          | 902/8000 [02:26<19:39,  6.02it/s, err=0, ok=902]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


amplitude:  15%|████▍                         | 1182/8000 [03:11<18:29,  6.14it/s, err=0, ok=1182]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


amplitude:  28%|████████▍                     | 2266/8000 [06:08<15:54,  6.01it/s, err=0, ok=2266]

[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3360) too large for available bit count (3240)


amplitude:  28%|████████▌                     | 2269/8000 [06:09<11:09,  8.56it/s, err=0, ok=2269]

[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3328) too large for available bit count (3240)


amplitude:  55%|████████████████▌             | 4427/8000 [11:54<05:02, 11.81it/s, err=3, ok=4424]

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


amplitude:  56%|████████████████▊             | 4472/8000 [12:01<07:12,  8.15it/s, err=4, ok=4468]



amplitude:  61%|██████████████████▍           | 4905/8000 [13:10<06:18,  8.17it/s, err=5, ok=4900]



amplitude:  87%|██████████████████████████▏   | 6967/8000 [18:36<02:05,  8.20it/s, err=6, ok=6961]



amplitude: 100%|██████████████████████████████| 8000/8000 [21:20<00:00,  6.25it/s, err=6, ok=7994]
  ✓ 7994 tracks computed, 6 errors, 1280.1s elapsed

[2/4] Computing: flux (dim=6, hop=1)
flux:   6%|█▊                            | 492/8000 [00:36<08:51, 14.13it/s, err=0, ok=492]

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


flux:  11%|███▍                          | 903/8000 [01:06<08:42, 13.59it/s, err=0, ok=903]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


flux:  15%|████▍                         | 1183/8000 [01:26<08:22, 13.57it/s, err=0, ok=1183]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


flux:  28%|████████▌                     | 2269/8000 [02:48<06:27, 14.79it/s, err=0, ok=2269]

[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3360) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3328) too large for available bit count (3240)


flux:  55%|████████████████▌             | 4428/8000 [05:36<03:14, 18.38it/s, err=3, ok=4425]

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


flux:  56%|████████████████▊             | 4473/8000 [05:39<04:08, 14.21it/s, err=4, ok=4469]



flux:  61%|██████████████████▍           | 4906/8000 [06:14<03:32, 14.58it/s, err=5, ok=4901]



flux:  87%|██████████████████████████▏   | 6968/8000 [08:56<01:10, 14.64it/s, err=6, ok=6962]



flux: 100%|██████████████████████████████| 8000/8000 [10:14<00:00, 13.01it/s, err=6, ok=7994]
  ✓ 7994 tracks computed, 6 errors, 614.8s elapsed

[3/4] Computing: harmony (dim=6, hop=1)
harmony:   6%|█▊                            | 492/8000 [00:42<10:25, 12.00it/s, err=0, ok=492]

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


harmony:  11%|███▍                          | 903/8000 [01:17<10:02, 11.78it/s, err=0, ok=903]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


harmony:  15%|████▍                         | 1183/8000 [01:41<09:26, 12.04it/s, err=0, ok=1183]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


harmony:  28%|████████▌                     | 2268/8000 [03:14<07:36, 12.56it/s, err=0, ok=2268]

[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3360) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3328) too large for available bit count (3240)


harmony:  55%|████████████████▌             | 4428/8000 [06:19<03:29, 17.09it/s, err=3, ok=4425]

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


harmony:  56%|████████████████▊             | 4473/8000 [06:23<04:31, 13.00it/s, err=4, ok=4469]



harmony:  61%|██████████████████▍           | 4906/8000 [07:00<03:51, 13.36it/s, err=5, ok=4901]



harmony:  87%|██████████████████████████▏   | 6968/8000 [09:59<01:18, 13.20it/s, err=6, ok=6962]



harmony: 100%|██████████████████████████████| 8000/8000 [11:27<00:00, 11.63it/s, err=6, ok=7994]
  ✓ 7994 tracks computed, 6 errors, 687.8s elapsed

[4/4] Computing: specentropy (dim=6, hop=1)
specentropy:   6%|█▊                            | 490/8000 [04:20<1:12:36,  1.72it/s, err=0, ok=490]

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


specentropy:  11%|███▍                          | 901/8000 [08:00<1:06:26,  1.78it/s, err=0, ok=901]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


specentropy:  15%|████▍                         | 1181/8000 [10:33<1:07:58,  1.67it/s, err=0, ok=1181]

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


specentropy:  24%|███████▏                      | 1918/8000 [19:01<1:01:25,  1.65it/s, err=0, ok=1918]

KeyboardInterrupt: 

## 4. Plot Entropy-Complexity Planes (All Genres)

In [None]:
# Boundary curves of the entropy-complexity plane for embedding dimension DIM.
# NOTE(review): the second positional argument (1) is presumably ordpy's
# vertical embedding dimension `dy` for 1-D series — confirm against the
# installed ordpy version.
max_HC = ordpy.maximum_complexity_entropy(DIM, 1)
min_HC = ordpy.minimum_complexity_entropy(DIM, 1)

# Color mapping for top-level genres
all_genres = sorted(all_df['genre'].unique())
n_genres = len(all_genres)


def _palette(name, size):
    """Return the first `size` colours of the named colormap as RGBA tuples."""
    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9.
    cmap = plt.get_cmap(name, size)
    return [cmap(i) for i in range(size)]


# Pick the smallest qualitative palette that gives every genre its own colour.
if n_genres <= 10:
    palette = _palette('tab10', 10)
elif n_genres <= 20:
    palette = _palette('tab20', 20)
else:
    palette = _palette('tab20', 20) + _palette('tab20b', 20)

# Beyond 40 genres the palette is cycled instead of indexing past its end.
colors = [palette[i % len(palette)] for i in range(n_genres)]

genre_colors = {g: colors[i] for i, g in enumerate(all_genres)}

print(f'{n_genres} top-level genres:')
for g in all_genres:
    print(f'  {g}')

In [None]:
# ── Combined 2×2 plot ─────────────────────────────────────────────────
# One panel per method: the max/min complexity boundary curves plus one
# scatter series per genre. Methods without results (e.g. the computation was
# interrupted) get a placeholder panel instead of raising KeyError.
fig, axes = plt.subplots(2, 2, figsize=(20, 18))
fig.suptitle('Entropy-Complexity Planes — All FMA Genres (top-level labels)',
             fontsize=16, fontweight='bold')

method_titles = {
    'amplitude':   'Amplitude Envelope (Hilbert)',
    'flux':        'Spectral Flux',
    'harmony':     'Harmonic (CQT Chroma)',
    'specentropy': 'Spectral Entropy',
}

legend_ax = None  # first panel that actually contains data

for method_name, ax in zip(METHODS.keys(), axes.flat):
    ax.set_title(method_titles.get(method_name, method_name), fontsize=13, fontweight='bold')

    df = results.get(method_name)
    if df is None or df.empty:
        ax.text(0.5, 0.5, 'No results computed for this method',
                ha='center', va='center', transform=ax.transAxes, fontsize=11)
        ax.set_xticks([])
        ax.set_yticks([])
        continue

    ax.plot(max_HC[:, 0], max_HC[:, 1], 'k--', linewidth=1.5, alpha=0.7, label='Max complexity')
    ax.plot(min_HC[:, 0], min_HC[:, 1], 'k-.', linewidth=1.5, alpha=0.7, label='Min complexity')

    for g in all_genres:
        mask = df['genre'] == g
        if mask.sum() == 0:
            continue
        ax.scatter(
            df.loc[mask, 'H'], df.loc[mask, 'C'],
            c=[genre_colors[g]],
            label=f'{g} ({mask.sum()})',
            s=20, alpha=0.5, edgecolors='none'
        )

    ax.set_xlabel('Normalized Permutation Entropy', fontsize=11)
    ax.set_ylabel('Statistical Complexity', fontsize=11)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, None)
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal', 'box')

    if legend_ax is None:
        legend_ax = ax

# A single shared legend, taken from the first populated panel; the per-genre
# counts in its labels therefore refer to that panel's method.
if legend_ax is not None:
    handles, labels = legend_ax.get_legend_handles_labels()
    if handles:
        fig.legend(handles, labels, loc='lower center', ncol=min(6, len(handles)),
                   fontsize=9, bbox_to_anchor=(0.5, -0.02), frameon=True)

# Leave room at the bottom for the legend and at the top for the suptitle.
fig.tight_layout(rect=[0, 0.06, 1, 0.96])

out_dir = os.path.join(PROJECT_ROOT, 'plots', 'fma_all_genres')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, f'entropy_complexity_all_genres_d{DIM}_h{HOP}.png')
fig.savefig(out_path, dpi=150, bbox_inches='tight')
print(f'Saved combined plot → {out_path}')
plt.show()

## 5. Individual Plots per Method

In [None]:
# One stand-alone figure per method, saved next to the combined plot.
# The output directory is (re)created here so this cell does not depend on the
# combined-plot cell having defined it.
out_dir = os.path.join(PROJECT_ROOT, 'plots', 'fma_all_genres')
os.makedirs(out_dir, exist_ok=True)

for method_name in METHODS.keys():
    df = results.get(method_name)
    if df is None or df.empty:
        # E.g. the computation was interrupted before this method finished.
        print(f'Skipping {method_name}: no results computed')
        continue

    fig, ax = plt.subplots(figsize=(14, 11))

    ax.plot(max_HC[:, 0], max_HC[:, 1], 'k--', linewidth=1.5, alpha=0.7, label='Max complexity')
    ax.plot(min_HC[:, 0], min_HC[:, 1], 'k-.', linewidth=1.5, alpha=0.7, label='Min complexity')

    for g in all_genres:
        mask = df['genre'] == g
        if mask.sum() == 0:
            continue
        ax.scatter(
            df.loc[mask, 'H'], df.loc[mask, 'C'],
            c=[genre_colors[g]],
            label=f'{g} ({mask.sum()})',
            s=40, alpha=0.55, edgecolors='white', linewidths=0.2
        )

    title = method_titles.get(method_name, method_name)
    ax.set_xlabel('Normalized Permutation Entropy', fontsize=13)
    ax.set_ylabel('Statistical Complexity', fontsize=13)
    ax.set_title(f'Entropy-Complexity Plane: {title}\n(All FMA genres, dim={DIM}, hop={HOP})',
                 fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, None)
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal', 'box')
    ax.legend(loc='upper left', fontsize=8, ncol=2, framealpha=0.9)
    fig.tight_layout()

    out_path = os.path.join(out_dir, f'EC_{method_name}_all_genres_d{DIM}_h{HOP}.png')
    fig.savefig(out_path, dpi=150, bbox_inches='tight')
    print(f'Saved → {out_path}')
    plt.show()
    # Release the figure explicitly so figures do not accumulate over the loop.
    plt.close(fig)

## 6. Summary Statistics

In [None]:
# Per-genre summary for every computed method: number of tracks and the
# mean / standard deviation of H and C, largest genres first.
_banner = '=' * 60
_summary_spec = {
    'n_tracks': pd.NamedAgg(column='H', aggfunc='count'),
    'H_mean':   pd.NamedAgg(column='H', aggfunc='mean'),
    'H_std':    pd.NamedAgg(column='H', aggfunc='std'),
    'C_mean':   pd.NamedAgg(column='C', aggfunc='mean'),
    'C_std':    pd.NamedAgg(column='C', aggfunc='std'),
}

for method_name, df in results.items():
    print(f'\n{_banner}')
    print(f'Method: {method_name}')
    print(f'{_banner}')

    per_genre = df.groupby('genre').agg(**_summary_spec)
    stats = per_genre.sort_values('n_tracks', ascending=False)

    print(stats.to_string(float_format='%.4f'))