In [1]:
import re
import pandas as pd
from pathlib import Path
from typing import Tuple, Dict, List

In [2]:
# Aggregation suffixes that may follow the last underscore of a column name.
# Kept as a list, in the same order as before.
DESCRIPTORS = (
    "mean std min max q25 q75 skew kurtosis "
    "range iqr cv median mad variation trend"
).split()

In [3]:
def base_of(col: str) -> str:
    """Return the base feature name of an aggregated column.

    The text after the last underscore is treated as a candidate aggregation
    suffix. When it is one of ``DESCRIPTORS`` the part before that underscore
    is returned; any other column name is returned unchanged.
    """
    head, underscore, suffix = col.rpartition("_")
    # `underscore` is empty when the name contains no "_" at all.
    if underscore and suffix in DESCRIPTORS:
        return head
    return col

In [4]:
# Ordered mapping rules: (compiled regex, level1, level2).
# Every pattern is anchored at the start of the feature name.
# NOTE(review): no rule matches the audSpec_Rfilt_sma[...] features, so they
# end up without a category — confirm whether that is intended.
RULES: List[Tuple[re.Pattern, str, str]] = [
    (re.compile(regex), level1, level2)
    for regex, level1, level2 in (
        # Harmony: pitch and harmonicity (voicing is filed under Prosodic/Voice)
        (r"^F0final_sma", "Harmony", "Pitch"),
        (r"^voicingFinalUnclipped_sma", "Prosodic/Voice", "Voicing"),
        (r"^logHNR_sma", "Harmony", "Harmonicity"),
        (r"^pcm_fftMag_spectralHarmonicity_sma", "Harmony", "Harmonicity"),
        # Prosodic / voice quality
        (r"^jitter(Local|DDP)_sma", "Prosodic/Voice", "Jitter"),
        (r"^shimmerLocal_sma", "Prosodic/Voice", "Shimmer"),
        # Timbre: spectral shape, sharpness, complexity, texture
        (r"^pcm_fftMag_spectralCentroid_sma", "Timbre", "SpectralShape"),
        (r"^pcm_fftMag_spectral(RollOff25\.0|RollOff50\.0|RollOff75\.0|RollOff90\.0)_sma", "Timbre", "SpectralShape"),
        (r"^pcm_fftMag_psySharpness_sma", "Timbre", "Sharpness"),
        (r"^pcm_fftMag_spectral(Entropy|Variance)_sma", "Timbre", "Complexity"),
        (r"^pcm_fftMag_spectral(Skewness|Kurtosis)_sma", "Timbre", "Texture"),
        # Timbre: MFCC coefficients 1-4, 5-8 and 9-14 as three sub-groups
        (r"^pcm_fftMag_mfcc_sma\[(?:1|2|3|4)\]", "Timbre", "MFCC_Formant"),
        (r"^pcm_fftMag_mfcc_sma\[(?:5|6|7|8)\]", "Timbre", "MFCC_Spectral"),
        (r"^pcm_fftMag_mfcc_sma\[(?:9|1[0-4])\]", "Timbre", "MFCC_Texture"),
        # Dynamics: energy and loudness
        (r"^pcm_RMSenergy_sma", "Dynamics", "Energy"),
        (r"^audspec_lengthL1norm_sma", "Dynamics", "Loudness"),
        (r"^audspecRasta_lengthL1norm_sma", "Dynamics", "Loudness"),
        # Rhythm: temporal change
        (r"^pcm_zcr_sma", "Rhythm", "Temporal"),
        (r"^pcm_fftMag_spectralFlux_sma", "Rhythm", "Temporal"),
        # Timbre: tonal-balance frequency bands
        (r"^pcm_fftMag_fband250-650_sma", "Timbre", "Band_LowMid"),
        (r"^pcm_fftMag_fband1000-4000_sma", "Timbre", "Band_MidHigh"),
    )
]

In [5]:
def apply_rules(base_name: str) -> Tuple[str, str]:
    """Classify a base feature name with the first matching entry of ``RULES``.

    Rules are tried in list order and the (level1, level2) pair of the first
    pattern found in ``base_name`` is returned. When no pattern matches,
    ("Other", "Unmapped") is returned.
    """
    # Lazy generator: evaluation stops at the first rule that matches.
    hits = (
        (level1, level2)
        for regex, level1, level2 in RULES
        if regex.search(base_name)
    )
    return next(hits, ("Other", "Unmapped"))

In [6]:
def build_hierarchy_map(
    agg_df: pd.DataFrame,
    out_csv: Path = Path("config/hierarchy_map.csv"),
    report_csv: Path = Path("config/hierarchy_unmapped.csv"),
) -> pd.DataFrame:
    """Build, save and return the base-feature hierarchy map.

    Each column name of ``agg_df`` is reduced to its base feature name with
    ``base_of``; every unique base is then classified with ``apply_rules``.

    Args:
        agg_df: Frame whose column names are the aggregated feature names.
            Only the column names are read.
        out_csv: Destination of the map (columns ``feature_name``,
            ``level1``, ``level2``). Parent directories are created.
        report_csv: Destination of the list of bases classified as
            ("Other", "Unmapped"). Written only when at least one base is
            unmapped; parent directories are created in that case.

    Returns:
        The map as a DataFrame with one row per unique base feature name,
        sorted by name.
    """
    # Accept plain strings as well as Path objects.
    out_csv = Path(out_csv)
    report_csv = Path(report_csv)
    out_csv.parent.mkdir(parents=True, exist_ok=True)

    # Recover the unique base feature names from the aggregated columns.
    bases = sorted({base_of(c) for c in agg_df.columns})

    rows = []
    unmapped = []
    for base in bases:
        lvl1, lvl2 = apply_rules(base)
        # The map is keyed by the base feature name, not the aggregated column.
        rows.append({"feature_name": base, "level1": lvl1, "level2": lvl2})
        if (lvl1, lvl2) == ("Other", "Unmapped"):
            unmapped.append(base)

    # Explicit columns keep the schema (and the CSV header) stable even when
    # there are no rows at all.
    df_map = pd.DataFrame(rows, columns=["feature_name", "level1", "level2"])
    df_map.to_csv(out_csv, index=False)

    if unmapped:
        # The report may live in a different directory than the map.
        report_csv.parent.mkdir(parents=True, exist_ok=True)
        # `unmapped` is already sorted because it is filled while iterating
        # over the sorted `bases`.
        pd.DataFrame({"base_feature_unmapped": unmapped}).to_csv(report_csv, index=False)

    # Console summary
    total = len(bases)
    n_unmapped = len(unmapped)
    # Guard against a frame without columns (total == 0).
    share = n_unmapped / total if total else 0.0
    print(f"[hierarchy_map] bases: {total} | unmapped: {n_unmapped} ({share:.1%})")
    print(f"Saved: {out_csv}")
    if n_unmapped:
        print(f"Review unmapped bases in: {report_csv}")

    return df_map

In [7]:
# Load the aggregated DEAM feature table. The path is relative, so it is
# resolved against the notebook's current working directory.
agg_path = Path("../data/processed_features/DEAM/raw_aggregated_features.csv") 
agg = pd.read_csv(agg_path)
# Build the map with the default output paths. As the last expression of the
# cell, the returned DataFrame is also shown as the cell output.
build_hierarchy_map(agg)

[hierarchy_map] bases: 261 | unmapped: 137 (52.5%)
Saved: config/hierarchy_map.csv
Review unmapped bases in: config/hierarchy_unmapped.csv


Unnamed: 0,feature_name,level1,level2
0,F0final_sma_amean,Harmony,Pitch
1,F0final_sma_de_amean,Harmony,Pitch
2,F0final_sma_de_stddev,Harmony,Pitch
3,F0final_sma_stddev,Harmony,Pitch
4,audSpec_Rfilt_sma[0]_amean,Other,Unmapped
...,...,...,...
256,song_id,Other,Unmapped
257,voicingFinalUnclipped_sma_amean,Prosodic/Voice,Voicing
258,voicingFinalUnclipped_sma_de_amean,Prosodic/Voice,Voicing
259,voicingFinalUnclipped_sma_de_stddev,Prosodic/Voice,Voicing
