In [120]:
import sys
import os
import pandas as pd
from pathlib import Path

sys.path.append('..')
# import toolkits from src
from src.make_dataset.deam_loader import * 
from src.make_dataset.lyric_utils import LyricsFetcher

from pathlib import Path

# Song-level static annotations (averaged per song); passed to create_deam_base_dataset
ANNOTATIONS_PATH = Path("../data/raw/DEAM_Annotations/annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_1_2000.csv")
# Directory with the DEAM metadata files; passed to create_deam_base_dataset
METADATA_DIR = Path("../data/raw/metadata_DEAM/metadata")
# Text file whose stripped contents are used as the Genius API token
TOKEN_FILE = Path("../secrets.txt")
# JSON lyrics cache handed to LyricsFetcher and re-read later to refresh the lyrics column
CACHE_FILE = Path("../data/processed/DEAM/lyrics_cache.json")
# Parquet destination of the base dataset
OUTPUT_CORE = Path("../data/processed/core_dataset.parquet")

In [276]:
from importlib import reload
from src import analysis_utils
from src.make_dataset import deam_loader
from src import aggregate
from src.make_dataset import split_data

reload(analysis_utils)
reload(deam_loader)
reload(aggregate)
reload(split_data)
from src.analysis_utils import *
from src.make_dataset.deam_loader import *

### Create the base dataset (song id, artist name, track name, V-A mean and std columns)

In [2]:
# Step 1: combine the annotation CSVs with the metadata, joined on the shared song_id column
print("Step 1: Creating base dataset ")
base_df = create_deam_base_dataset(annotations_path=ANNOTATIONS_PATH, metadata_dir=METADATA_DIR)

Step 1: Creating base dataset 
Loaded 58 test songs
Created dataset with 1802 songs
  Train/val (≤2000): 1744
  Test (>2000): 58


In [3]:
# Persist the base dataset as parquet plus a CSV twin stored next to it.
# Create the output directory first so a fresh checkout does not fail on the write.
OUTPUT_CORE.parent.mkdir(parents=True, exist_ok=True)
base_df.to_parquet(OUTPUT_CORE)
# Same location as before ("../data/processed/core_dataset.csv"), now derived from OUTPUT_CORE
base_df.to_csv(OUTPUT_CORE.with_suffix(".csv"))

### Add lyrics

In [4]:
# Initialize the lyrics fetcher and enrich the base dataset
print("\nStep 2: Enriching with lyrics")
# The Genius API token is read from a local file so it never appears in the notebook itself
token = TOKEN_FILE.read_text().strip()
# The fetcher is given the JSON cache path so earlier lookups can be reused
fetcher = LyricsFetcher(genius_api_token=token, cache_path=CACHE_FILE)
# NOTE(review): batch_save_size presumably sets how often the cache is written to disk — confirm in LyricsFetcher
final_df = fetcher.enrich_dataframe(base_df, batch_save_size=50)

2025-08-28 05:26:20,530 - INFO - LyricsFetcher initialized with fuzzy matching threshold of 85.
2025-08-28 05:26:20,531 - INFO - Found 1802 songs needing lyrics.



Step 2: Enriching with lyrics


Fetching Lyrics:   0%|          | 0/1802 [00:00<?, ?it/s]

2025-08-28 05:26:20,588 - INFO - Lyrics fetching complete. Final cache saved.
2025-08-28 05:26:20,589 - INFO - Found lyrics for 241 out of 1802 songs.


In [5]:
# Save the dataset with lyrics
print("\nStep 3: Saving final dataset")
OUTPUT_LYRIC = Path("../data/processed/core_dataset_lyrics.parquet")
OUTPUT_LYRIC.parent.mkdir(parents=True, exist_ok=True)
final_df.to_parquet(OUTPUT_LYRIC, index=False)
print(f"Successfully saved final dataset to {OUTPUT_LYRIC}")


Step 3: Saving final dataset
Successfully saved final dataset to ../data/processed/core_dataset_lyrics.parquet


#### Lyrics fetched with the Genius API via the lyricsgenius library
#### Each (artist, track) was normalized (lowercasing, removing punctuation, trimming "feat." and parentheses)
#### Multiple name variants were tried for robust matching (e.g. "and" vs &, shortened artist names)
#### Fuzzy string matching (rapidfuzz) was used when exact matches were not reliable (Threshold = 85)
#### two-pass search strategy: first with artist and title, then title only with post-validation
#### Retrieved lyrics were cleaned: removed ads, "embed", contributor info, or section headers. Short/empty results were discarded
#### Cache file (lyrics_cache.json) ensured songs were not refetched and progress could resume if interrupted and avoid repeated API calls

In [6]:
print("\nFinal DataFrame Info:")
final_df.info()
display(final_df.head())


Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1802 entries, 0 to 1801
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   song_id       1802 non-null   int64  
 1   track_name    1802 non-null   object 
 2   artist_name   1802 non-null   object 
 3   valence_mean  1802 non-null   float64
 4   arousal_mean  1802 non-null   float64
 5   valence_std   1802 non-null   float64
 6   arousal_std   1802 non-null   float64
 7   lyrics        241 non-null    object 
dtypes: float64(4), int64(1), object(3)
memory usage: 112.8+ KB


Unnamed: 0,song_id,track_name,artist_name,valence_mean,arousal_mean,valence_std,arousal_std,lyrics
0,2,Tonight A Lonely Century,The New Mystikal Troubadours,3.1,3.0,0.94,0.63,
1,3,DD Groove,Kevin MacLeod,3.5,3.3,1.75,1.62,
2,4,Slow Burn,Kevin MacLeod,5.7,5.5,1.42,1.63,
3,5,Nothing Much,My Bubba & Mi,4.4,5.3,2.01,1.85,
4,7,Hustle,Kevin MacLeod,5.8,6.4,1.47,1.69,


### Save the final lyric dataset
#### manual changes: addition of lyrics to niche songs from bandcamp and other less known lyric sources
#### listening to each song that has lyrics to check for matching lyrics (deleting ones that didn't match)
#### translating non-english songs with chatGPT4o with the prompt to preserve emotional conveyance

In [7]:
import json

# Reload the saved lyric dataset together with the hand-curated cache
final_df = pd.read_parquet(OUTPUT_LYRIC)
cache = json.loads(CACHE_FILE.read_text(encoding="utf-8"))

# Cache keys have the form "artist|track", lowercased; rebuild them from the frame
keys = final_df["artist_name"].str.lower().str.cat(final_df["track_name"].str.lower(), sep="|")

# Replace the lyrics column with the cached (manually corrected) text; unmatched keys become NaN
final_df["lyrics"] = keys.map(cache)

# Parquet for the pipeline, CSV with a BOM ("utf-8-sig") so Excel shows the text correctly
final_df.to_parquet(OUTPUT_LYRIC, index=False)
final_df.to_csv(OUTPUT_LYRIC.with_suffix(".csv"), index=False, encoding="utf-8-sig")

print(f"Updated lyrics saved to: {OUTPUT_LYRIC} and {OUTPUT_LYRIC.with_suffix('.csv')}")

Updated lyrics saved to: ../data/processed/core_dataset_lyrics.parquet and ../data/processed/core_dataset_lyrics.csv


In [8]:
print("\nFinal DataFrame Info:")
final_df.info()
display(final_df.head())


Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1802 entries, 0 to 1801
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   song_id       1802 non-null   int64  
 1   track_name    1802 non-null   object 
 2   artist_name   1802 non-null   object 
 3   valence_mean  1802 non-null   float64
 4   arousal_mean  1802 non-null   float64
 5   valence_std   1802 non-null   float64
 6   arousal_std   1802 non-null   float64
 7   lyrics        241 non-null    object 
dtypes: float64(4), int64(1), object(3)
memory usage: 112.8+ KB


Unnamed: 0,song_id,track_name,artist_name,valence_mean,arousal_mean,valence_std,arousal_std,lyrics
0,2,Tonight A Lonely Century,The New Mystikal Troubadours,3.1,3.0,0.94,0.63,
1,3,DD Groove,Kevin MacLeod,3.5,3.3,1.75,1.62,
2,4,Slow Burn,Kevin MacLeod,5.7,5.5,1.42,1.63,
3,5,Nothing Much,My Bubba & Mi,4.4,5.3,2.01,1.85,
4,7,Hustle,Kevin MacLeod,5.8,6.4,1.47,1.69,


### Train/Val/Test splits

In [201]:
from src.make_dataset.split_data import *

# Reload the base dataset from disk and key it by song_id for the split helpers
core_df = pd.read_parquet(OUTPUT_CORE).set_index('song_id')

In [58]:
# original split creation
tr, va, te = create_original_split(core_df, test_song_id_start=2001, val_size=0.15, random_state=42)
analyze_split(core_df, tr, va, te, "Original")


--- Original ---
Total: 1802
Train: 1488 (82.6%) | Val: 256 (14.2%) | Test: 58 (3.2%)
valence_mean: train μ=4.891, val μ=4.975, test μ=4.924
arousal_mean: train μ=4.806, val μ=4.850, test μ=4.857
Artist overlap | T∩V: 0, T∩Te: 0, V∩Te: 0


In [59]:
save_splits_triplet(tr, va, te, out_dir="../data/splits", name="original",
                    meta={"test_song_id_start": 2001, "val_size": 0.15, "random_state": 42})

In [60]:
# Custom split to ~70/15/15
tr2, va2, te2 = create_augmented_split(core_df, target_train=0.70, target_val=0.15, target_test=0.15,
                                       test_song_id_start=2001, random_state=42)
analyze_split(core_df, tr2, va2, te2, "Augmented 70/15/15")


--- Augmented 70/15/15 ---
Total: 1802
Train: 1257 (69.8%) | Val: 276 (15.3%) | Test: 269 (14.9%)
valence_mean: train μ=4.901, val μ=4.958, test μ=4.863
arousal_mean: train μ=4.818, val μ=4.838, test μ=4.769
Artist overlap | T∩V: 0, T∩Te: 0, V∩Te: 0


In [277]:
# NOTE(review): called without arguments, so which saved split is loaded depends on
# load_splits_triplet's defaults — presumably the custom 70/15/15 triplet; confirm.
tr2, va2, te2 = load_splits_triplet()

In [278]:
save_splits_triplet(tr2, va2, te2, out_dir="../data/splits", name="custom",
                    meta={"targets": [0.70, 0.15, 0.15], "test_song_id_start": 2001, "random_state": 42})

In [288]:
# Reusable K-folds
# Single source of truth for the fold count, so the saved metadata cannot drift from the
# value actually used (the metadata previously recorded 3 while 5 folds were built).
N_SPLITS = 5

dev_ids = tr2.union(va2) # train+val = development set (custom split)
core_dev = core_df.loc[dev_ids].copy()
folds_dev = create_kfold_splits(core_dev, n_splits=N_SPLITS, random_state=42) # don't leak test set

# NOTE(review): "q" is not passed to create_kfold_splits here; presumably it records that
# function's default number of quantile bins — confirm against split_data.
save_kfold_splits(
    folds_dev, out_dir="../data/splits", name="cv5",
    meta={"built_on": "custom_train+val", "n_splits": N_SPLITS, "q": 3, "random_state": 42}
)

In [289]:
from src.make_dataset.split_data import load_kfold_splits
# 1) New folds don’t touch test
cv5_splits = load_kfold_splits()
# Fold-specific loop names are used so the original-split `tr` / `va` variables are not overwritten.
for i, (fold_tr, fold_va) in enumerate(cv5_splits):
    assert fold_tr.isin(te2).sum() == 0 and fold_va.isin(te2).sum() == 0, f"Fold {i} leaks test!"

# 2) Group leakage (artists) across splits
def _artists(ids):
    """Return the set of non-null artist names of the songs in `ids` (looked up in core_df)."""
    return set(core_df.loc[ids, 'artist_name'].dropna())

art_dev = _artists(dev_ids)
art_te = _artists(te2)
print("Artist overlap from dev to test:", len(art_dev & art_te))

Artist overlap from dev to test: 0


In [290]:
print(cv5_splits[0])
print(cv5_splits[1])
print(cv5_splits[2])

(Index([   2,    3,    4,    7,    8,   10,   12,   18,   19,   20,
       ...
       1987, 1988, 1991, 1992, 1993, 1994, 1995, 1997, 1998, 2000],
      dtype='int64', name='song_id', length=1231), Index([  35,   44,   45,   62,   76,   79,   87,  102,  103,  105,
       ...
       1936, 1940, 1942, 1952, 1958, 1962, 1963, 1971, 1972, 1996],
      dtype='int64', name='song_id', length=302))
(Index([   2,    3,    4,    7,    8,   10,   12,   18,   19,   21,
       ...
       1983, 1984, 1985, 1986, 1987, 1991, 1993, 1994, 1995, 1996],
      dtype='int64', name='song_id', length=1209), Index([  20,   39,   47,   50,   69,   72,   78,   82,   83,   85,
       ...
       1946, 1947, 1948, 1951, 1964, 1988, 1992, 1997, 1998, 2000],
      dtype='int64', name='song_id', length=324))
(Index([   3,    4,    7,    8,   10,   12,   19,   20,   21,   24,
       ...
       1985, 1986, 1988, 1992, 1994, 1995, 1996, 1997, 1998, 2000],
      dtype='int64', name='song_id', length=1206), Index([   2,  

In [291]:
def check_fold_bins(core_dev, splits, q=3, label_cols=("valence_mean","arousal_mean")):
    """Report which joint label bins are absent from each fold's validation part.

    The first two columns named in ``label_cols`` are each cut into ``q`` quantile
    bins (``pd.qcut`` with ``duplicates='drop'``); a song's joint bin is the string
    ``"<bin of first label>_<bin of second label>"``.

    Parameters
    ----------
    core_dev : DataFrame holding the label columns, indexed by the ids used in ``splits``.
    splits : iterable of ``(train_ids, val_ids)`` pairs; only ``val_ids`` is inspected.
    q : number of quantile bins per label.
    label_cols : label column names; the first two define the joint bins.

    Returns
    -------
    list of ``(fold_id, missing_bins)`` for every fold whose validation part lacks at
    least one joint bin that occurs in ``core_dev``. Empty when every fold covers all bins.
    """
    df = core_dev[[*label_cols]].copy()
    first_code = pd.qcut(df[label_cols[0]], q=q, labels=False, duplicates='drop')
    second_code = pd.qcut(df[label_cols[1]], q=q, labels=False, duplicates='drop')

    # Rows with a missing label have no bin. Without this mask the string conversion
    # turns the missing code into the text "<NA>", dropna() no longer removes it, and
    # bogus bins such as "<NA>_1" are reported as missing from folds.
    labelled = first_code.notna() & second_code.notna()
    df["bins"] = (first_code.astype('Int64').astype(str)
                  + "_" +
                  second_code.astype('Int64').astype(str)).where(labelled)

    all_bins = sorted(df["bins"].dropna().unique())
    bad = []
    for i, (_, va) in enumerate(splits):
        present = set(df.loc[va, "bins"].dropna())
        missing = [b for b in all_bins if b not in present]
        if missing:
            bad.append((i, missing))
    return bad  # list of (fold_id, missing_bins)

In [292]:
check_fold_bins(core_dev, cv5_splits, q=3)

[]

In [293]:
check_fold_bins(core_dev, cv5_splits, q=5) # stricter check: 5 quantile bins per label (5x5 V-A grid) instead of q=3

[(0, ['4_0']), (1, ['3_0']), (3, ['0_4'])]

### Add features to different dataset configurations and create hierarchy map

Making 8 different possible dataset configurations:
1. A dataset that contains the full 260 features and their 8 core stat descriptors (min, max, q25, q75, mean, std, kurtosis, skew) - 2080 total features (260*8)
2. A dataset that contains the full 260 features and a rich 15 descriptor set that also includes trend, range, variation, median, etc. - 3900 total features (260*15)
3. The 2nd dataset, but with decorrelation applied within each perceptual group - 2029
4. The 1st dataset, but with decorrelation applied within each perceptual group - 1226
5. 2nd dataset with global PCA applied (95% variance) - 2 PCAs
6. 1st dataset with global PCA applied (95% variance) - 3 PCAs
7. 2nd dataset, with PCA inside each perceptual group (95% variance) - 15 PCAs
8. 1st dataset, with PCA inside each perceptual group (95% variance) - 48 PCAs
- Because PCA is scale-sensitive, the scalers and the PCA for datasets 5-8 are fitted on the train part of the custom split only and then applied to all songs (train/val/test). Only the custom split is used here to limit the branching complexity of later experiments: fitting on both the original and the custom split would double the number of datasets to consider. The original split will still be evaluated, on 20 different models, so that it can be compared with the custom split.

In [124]:
FEATURES_PATH = Path("../data/raw/features")
OUTPUT_PATH = Path("../data/processed")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [125]:
features_2080, failed_ids = create_features_2080(FEATURES_PATH)
print(f"Successfully processed: {len(features_2080)} songs")
print(f"Failed: {len(failed_ids)} songs")

Aggregating features:   0%|          | 0/1802 [00:00<?, ?it/s]

Successfully processed: 1802 songs
Failed: 0 songs


In [126]:
#Save a big set with all aggregated features and their rows(1802 rows x 260*8 columns (all features with suffix their descriptors))
features_2080.to_parquet(OUTPUT_PATH / "features_2080.parquet")

In [127]:
features_2080 = pd.read_parquet(OUTPUT_PATH / "features_2080.parquet")

In [128]:
from src.aggregate import *
print(f"Base dataset shape: {features_2080.shape}")
print(f"First 5 columns: {list(features_2080.columns[:5])}")
print(f"Descriptor distribution: {descriptor_distribution(features_2080.iloc[0])}")

Base dataset shape: (1802, 2080)
First 5 columns: ['F0final_sma_stddev_mean', 'F0final_sma_stddev_std', 'F0final_sma_stddev_min', 'F0final_sma_stddev_max', 'F0final_sma_stddev_q25']
Descriptor distribution: {'mean': 260, 'std': 260, 'min': 260, 'max': 260, 'q25': 260, 'q75': 260, 'skew': 260, 'kurtosis': 260}


In [129]:
# Sanity check: the aggregated features and the labels should describe the same songs
labels = pd.read_parquet("../data/processed/core_dataset.parquet").set_index('song_id')

# song ids present on both sides
common_ids = features_2080.index.intersection(labels.index)

print(f"Songs in features: {len(features_2080)}")
print(f"Songs in labels: {len(labels)}")
print(f"Common songs: {len(common_ids)}")
# ids above 2000 belong to the held-out test songs
print(f"Test songs in common: {(common_ids > 2000).sum()}")

Songs in features: 1802
Songs in labels: 1802
Common songs: 1802
Test songs in common: 58


In [149]:
# Build hierarchy map from aggregated features
hierarchy = build_hierarchy_map(features_2080)

32 unmapped features saved to ../results/unmapped_features.csv

=== Feature Hierarchy Mapping Summary ===
Total features: 260
Mapped features: 228 (87.7%)
Unmapped features: 32 (12.3%)

=== Perceptual Dimension Distribution ===
  melodiousness       :  16 cores
  articulation        :  60 cores
  rhythmic_stability  :  18 cores
  rhythmic_complexity :  34 cores
  dissonance          :  48 cores
  tonal_stability     :  52 cores
  unmapped            :  32 cores


In [136]:
hierarchy.to_csv("../data/hierarchy_map.csv", index=False)

In [137]:
# Dataset 1: Keep 8 core descriptors and all features (baseline)
# Written to the processed-data folder; index is kept so song ids survive the round trip
features_2080.to_parquet("../data/processed/features_2080.parquet")

print(f"Dataset 1 - Full with 8 core descriptors")
print(f"Shape: {features_2080.shape}")

Dataset 1 - Full with 8 core descriptors
Shape: (1802, 2080)


In [138]:
# Dataset 2: save one that includes all of the available descriptors
from src.aggregate import FULL
features_3900, failed_ids = create_features_2080(FEATURES_PATH, descriptor_set=FULL)
print(f"Successfully processed: {len(features_3900)} songs")
print(f"Failed: {len(failed_ids)} songs")

Aggregating features:   0%|          | 0/1802 [00:00<?, ?it/s]

Successfully processed: 1802 songs
Failed: 0 songs


In [139]:
features_3900.to_parquet(OUTPUT_PATH / "features_3900.parquet")
print(f"Dataset 2 - Full with 15 descriptors")
print(f"Shape: {features_3900.shape}")

Dataset 2 - Full with 15 descriptors
Shape: (1802, 3900)


In [140]:
features_3900 = pd.read_parquet(OUTPUT_PATH / "features_3900.parquet")

In [150]:
# The six perceptual groups that are used for grouping in the reduced datasets
VALID_CATS = ['melodiousness','articulation','rhythmic_stability',
              'rhythmic_complexity','dissonance','tonal_stability']
hierarchy = hierarchy[hierarchy['perceptual'].isin(VALID_CATS)].copy() # keep only hierarchy rows that belong to one of these groups (drops the 'unmapped' rows from the map)

In [184]:
# 1) ensure no 'unmapped' rows remain in the hierarchy map after the filter
assert not (hierarchy['perceptual'] == 'unmapped').any()

# 2) show how many features each remaining perceptual group holds
print(hierarchy['perceptual'].value_counts())

# 3) count map keys that end in "_amean" or "_stddev", i.e. look like core feature names;
# if this prints zero, the map keys and the lookup function use different naming (base vs core names)
keys_are_core = sum(hierarchy['feature'].str.contains(r'_(?:amean|stddev)$'))
print("core-like keys in map:", keys_are_core)

perceptual
articulation           60
tonal_stability        52
dissonance             48
rhythmic_complexity    34
rhythmic_stability     18
melodiousness          16
Name: count, dtype: int64
core-like keys in map: 228


In [186]:
# Dataset 3: drop highly correlated features (threshold 0.95) inside each perceptual group of the core 2080 set
features_decorr_2080 = remove_correlated_within_groups(features_2080, hierarchy, threshold=0.95)
features_decorr_2080.to_parquet("../data/processed/features_decorrelated_2080.parquet")

print(f"Dataset 3 - Decorrelated within groups(from 2080 set)")
print(f"Shape: {features_decorr_2080.shape}")
print(f"Removed {features_2080.shape[1] - features_decorr_2080.shape[1]} correlated features")

# Show remaining features per group: resolve every surviving column to its perceptual group once
mapping = dict(zip(hierarchy['feature'], hierarchy['perceptual']))
column_groups = [(col, mapping.get(core_of(col))) for col in features_decorr_2080.columns]
for group in ('melodiousness', 'articulation', 'rhythmic_stability',
              'rhythmic_complexity', 'dissonance', 'tonal_stability'):
    group_cols = [col for col, owner in column_groups if owner == group]
    print(f"  {group:20s}: {len(group_cols):4d} features remain")

Removing 823 correlated features
Dataset 3 - Decorrelated within groups(from 2080 set)
Shape: (1802, 1257)
Removed 823 correlated features
  melodiousness       :   52 features remain
  articulation        :  243 features remain
  rhythmic_stability  :   82 features remain
  rhythmic_complexity :  129 features remain
  dissonance          :  184 features remain
  tonal_stability     :  311 features remain


In [188]:
# Dataset 4: drop highly correlated features (threshold 0.95) inside each perceptual group of the full 3900 set
features_decorr_3900 = remove_correlated_within_groups(features_3900, hierarchy, threshold=0.95)
features_decorr_3900.to_parquet("../data/processed/features_decorrelated_3900.parquet")

print(f"Dataset 4 - Decorrelated within groups (from 3900 set)")
print(f"Shape: {features_decorr_3900.shape}")
print(f"Removed {features_3900.shape[1] - features_decorr_3900.shape[1]} correlated features")

# Show remaining features per group: resolve every surviving column to its perceptual group once
mapping = dict(zip(hierarchy['feature'], hierarchy['perceptual']))
column_groups = [(col, mapping.get(core_of_full(col))) for col in features_decorr_3900.columns]
for group in ('melodiousness', 'articulation', 'rhythmic_stability',
              'rhythmic_complexity', 'dissonance', 'tonal_stability'):
    group_cols = [col for col, owner in column_groups if owner == group]
    print(f"  {group:20s}: {len(group_cols):4d} features remain")

Removing 1762 correlated features
Dataset 4 - Decorrelated within groups (from 3900 set)
Shape: (1802, 2138)
Removed 1762 correlated features
  melodiousness       :   87 features remain
  articulation        :  396 features remain
  rhythmic_stability  :  138 features remain
  rhythmic_complexity :  223 features remain
  dissonance          :  314 features remain
  tonal_stability     :  500 features remain


In [189]:
# Imported here because this cell is the first one (top to bottom) that uses pickle
import pickle

# Dataset 5: PCA within each perceptual group, computed on the decorrelated 2080 set (preserves interpretability)
# train_ids=tr2 hands the custom-split training ids to pca_per_group
# NOTE(review): presumably the PCA is fitted on those ids only — confirm in pca_per_group
features_pca_grouped_2080, pca_models = pca_per_group(features_decorr_2080, hierarchy, train_ids=tr2, variance_explained=0.95)
features_pca_grouped_2080.to_parquet("../data/processed/features_pca_per_group_2080.parquet")

print(f"Dataset 5 - PCA per group on the 2080 set")
print(f"Shape: {features_pca_grouped_2080.shape}")

# Show PCs per group (component columns are recognised by their group-name prefix)
print("\nPrincipal components per group:")
for group in ['melodiousness', 'articulation', 'rhythmic_stability',
              'rhythmic_complexity', 'dissonance', 'tonal_stability']:
    group_pcs = [col for col in features_pca_grouped_2080.columns if col.startswith(group)]
    if group_pcs:
        print(f"  {group:20s}: {len(group_pcs):2d} PCs")

# Save PCA models for interpretation; create the folder first so a fresh checkout does not fail
pca_models_path = Path("../data/pca_pickles/pca_per_group_models_2080.pkl")
pca_models_path.parent.mkdir(parents=True, exist_ok=True)
with open(pca_models_path, 'wb') as f:
    pickle.dump(pca_models, f)

Dataset 5 - PCA per group on the 2080 set
Shape: (1802, 347)

Principal components per group:
  melodiousness       : 24 PCs
  articulation        : 83 PCs
  rhythmic_stability  : 31 PCs
  rhythmic_complexity : 41 PCs
  dissonance          : 56 PCs
  tonal_stability     : 112 PCs


In [190]:
# Imported in this cell as well, so it does not rely on an import made by a later cell
import pickle

# Dataset 6: PCA within each perceptual group, computed on the decorrelated 3900 set (preserves interpretability)
# train_ids=tr2 hands the custom-split training ids to pca_per_group
# NOTE(review): presumably the PCA is fitted on those ids only — confirm in pca_per_group
features_pca_grouped_3900, pca_models = pca_per_group(features_decorr_3900, hierarchy, train_ids=tr2, variance_explained=0.95)
features_pca_grouped_3900.to_parquet("../data/processed/features_pca_per_group_3900.parquet")

print(f"Dataset 6 - PCA per group on the 3900 set")
print(f"Shape: {features_pca_grouped_3900.shape}")

# Show PCs per group (component columns are recognised by their group-name prefix)
print("\nPrincipal components per group:")
for group in ['melodiousness', 'articulation', 'rhythmic_stability',
              'rhythmic_complexity', 'dissonance', 'tonal_stability']:
    group_pcs = [col for col in features_pca_grouped_3900.columns if col.startswith(group)]
    if group_pcs:
        print(f"  {group:20s}: {len(group_pcs):2d} PCs")

# Save PCA models for interpretation; create the folder first so a fresh checkout does not fail
pca_models_path = Path("../data/pca_pickles/pca_per_group_models_3900.pkl")
pca_models_path.parent.mkdir(parents=True, exist_ok=True)
with open(pca_models_path, 'wb') as f:
    pickle.dump(pca_models, f)

Dataset 6 - PCA per group on the 3900 set
Shape: (1802, 516)

Principal components per group:
  melodiousness       : 32 PCs
  articulation        : 125 PCs
  rhythmic_stability  : 45 PCs
  rhythmic_complexity : 66 PCs
  dissonance          : 87 PCs
  tonal_stability     : 161 PCs


In [194]:
# Explicit imports so the cell does not depend on names leaked by a star import or a later cell
import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Dataset 7: Global PCA on full feature set (2080), fitted only on train_ids (custom split)
train_2080 = features_2080.loc[tr2]
# Fit the scaler on the training songs only, then apply it to every song
scaler_2080 = StandardScaler().fit(train_2080)
X_train_scaled = scaler_2080.transform(train_2080)
X_all_scaled   = scaler_2080.transform(features_2080)
# PCA fitted on the scaled training songs; a float n_components keeps as many
# components as are needed to explain 95% of the variance
pca_global_2080 = PCA(n_components=0.95, random_state=42).fit(X_train_scaled)

# Project all songs (train/val/test) with the train-fitted PCA
X_all_pca_2080 = pca_global_2080.transform(X_all_scaled)
features_pca_global_2080 = pd.DataFrame(
    X_all_pca_2080,
    index=features_2080.index,
    columns=[f"global_PC{i+1}" for i in range(pca_global_2080.n_components_)]
)
features_pca_global_2080.to_parquet("../data/processed/features_pca_global_2080.parquet")

print(f"Dataset 7 - Global PCA on 2080 features with 95% variance retainment")
print(f"Shape: {features_pca_global_2080.shape}")
print(f"Explained variance: {pca_global_2080.explained_variance_ratio_.sum():.3f}")
print(f"First 10 PCs explain: {pca_global_2080.explained_variance_ratio_[:10].sum():.3f}")

# Save PCA model for later use; create the folder first so a fresh checkout does not fail
pca_model_path = Path("../data/pca_pickles/pca_global_model_2080.pkl")
pca_model_path.parent.mkdir(parents=True, exist_ok=True)
with open(pca_model_path, 'wb') as f:
    pickle.dump(pca_global_2080, f)

Dataset 7 - Global PCA on 2080 features with 95% variance retainment
Shape: (1802, 242)
Explained variance: 0.950
First 10 PCs explain: 0.585


In [195]:
# Explicit imports so the cell does not depend on names leaked by a star import
import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Dataset 8: Global PCA on full feature set (3900), fitted only on train_ids (custom split)
train_3900 = features_3900.loc[tr2]
# Fit the scaler on the training songs only, then apply it to every song
scaler_3900 = StandardScaler().fit(train_3900)
X_train_scaled = scaler_3900.transform(train_3900)
X_all_scaled   = scaler_3900.transform(features_3900)
# PCA fitted on the scaled training songs; a float n_components keeps as many
# components as are needed to explain 95% of the variance
pca_global_3900 = PCA(n_components=0.95, random_state=42).fit(X_train_scaled)

# Project all songs (train/val/test) with the train-fitted PCA
X_all_pca_3900 = pca_global_3900.transform(X_all_scaled)
features_pca_global_3900 = pd.DataFrame(
    X_all_pca_3900,
    index=features_3900.index,
    columns=[f"global_PC{i+1}" for i in range(pca_global_3900.n_components_)]
)
features_pca_global_3900.to_parquet("../data/processed/features_pca_global_3900.parquet")

print(f"Dataset 8 - Global PCA on 3900 features with 95% variance retainment")
print(f"Shape: {features_pca_global_3900.shape}")
print(f"Explained variance: {pca_global_3900.explained_variance_ratio_.sum():.3f}")
print(f"First 10 PCs explain: {pca_global_3900.explained_variance_ratio_[:10].sum():.3f}")

# Save PCA model for later use; create the folder first so a fresh checkout does not fail
pca_model_path = Path("../data/pca_pickles/pca_global_model_3900.pkl")
pca_model_path.parent.mkdir(parents=True, exist_ok=True)
with open(pca_model_path, 'wb') as f:
    pickle.dump(pca_global_3900, f)

Dataset 8 - Global PCA on 3900 features with 95% variance retainment
Shape: (1802, 317)
Explained variance: 0.950
First 10 PCs explain: 0.551


In [196]:
# check that all 8 datasets align with labels
# The expected song count comes from the labels themselves rather than a hard-coded number,
# so the check stays valid if the base dataset changes.
expected_songs = len(labels)
for path in ['features_2080.parquet', 'features_3900.parquet', 
             'features_decorrelated_3900.parquet', 'features_decorrelated_2080.parquet',
             'features_pca_global_3900.parquet', 'features_pca_global_2080.parquet',
             'features_pca_per_group_3900.parquet', 'features_pca_per_group_2080.parquet']:
    df = pd.read_parquet(f"../data/processed/{path}")
    common = df.index.intersection(labels.index)
    # every labelled song must be present ...
    assert len(common) == expected_songs, f"Alignment issue with {path}"
    # ... and the dataset must not carry extra or duplicated rows
    assert len(df) == expected_songs, f"Alignment issue with {path}"
    
print("All datasets properly aligned with labels")

All datasets properly aligned with labels


### Create a deep learning version of the dataset (for RNN sequential modelling instead of stat descriptors derived from rows)

In [32]:
from importlib import reload
from src.make_dataset import deam_loader

reload(deam_loader)
from src.make_dataset.deam_loader import *

In [33]:
# sub-sample 90 rows only from each song to have a fair input size across songs
rnn_array, rnn_ids = create_rnn_dataset(FEATURES_PATH)

### calculate key, mode and bpm and create a separate dataframe with them

In [20]:
# Setup Paths
core_df = pd.read_parquet("../data/processed/core_dataset.parquet")
AUDIO_DIR = Path("../data/raw/audio_files_DEAM/MEMD_audio")
OUTPUT_PATH = Path("../data/processed/audio_metadata.parquet")

In [21]:
from importlib import reload
from src.make_dataset import bpm_key_mode

reload(bpm_key_mode)

<module 'src.make_dataset.bpm_key_mode' from '/home/georgios/PGMP/notebooks/../src/make_dataset/bpm_key_mode.py'>

In [22]:
import librosa
from tqdm import tqdm
from src.make_dataset.bpm_key_mode import *

# Get Song IDs (core_df must still carry song_id as a regular column here)
all_song_ids = core_df['song_id'].unique()

# Process Audio Files
records = []
missing_audio = []  # song ids whose mp3 is not in AUDIO_DIR, reported after the loop
for song_id in tqdm(all_song_ids, desc="Estimating BPM, Key, and Mode"):
    audio_file = AUDIO_DIR / f"{song_id}.mp3"
    if not audio_file.exists():
        # Skip the song but remember it, so missing audio does not go unnoticed
        missing_audio.append(song_id)
        continue
        
    key, mode, confidence = estimate_key_with_confidence(audio_file)
    bpm = estimate_bpm(audio_file)
    
    records.append({'song_id': song_id, 'key': key, 'mode': mode, 'key_confidence': confidence, 'bpm': bpm})

if missing_audio:
    print(f"Skipped {len(missing_audio)} songs without an audio file, e.g. {missing_audio[:10]}")

Estimating BPM, Key, and Mode: 100%|████████████████████████████████████████████████| 1802/1802 [46:43<00:00,  1.56s/it]


In [23]:
# Create and Save DataFrame
metadata_df = pd.DataFrame(records)
metadata_df.to_parquet(OUTPUT_PATH, index=False)

print(f"Metadata DataFrame created with {len(metadata_df)} records.")
print(metadata_df.head())

Metadata DataFrame created with 1802 records.
   song_id key   mode  key_confidence     bpm
0        2   A  minor        0.259305  143.55
1        3   E  minor        0.706457   95.70
2        4   A  minor        0.983184   86.13
3        5   B  major        0.997981   99.38
4        7   C  major        0.778527  117.45


## map creation vol_2

Second attempt at mapping the features into 3 categories.

In [35]:
import pandas as pd

# Draft mapping of openSMILE features to perceptual categories
df = pd.read_csv('../data/openSMILE_Perceptual_mapping__draft_.csv')

# Keep only rows rated with High or Medium confidence
filtered_df = df[df['Confidence'].isin(['High', 'Medium'])]

result_df = filtered_df[['Acoustic feature', 'Proposed perceptual category']]

# Header of the side-by-side listing
print("Acoustic Feature                    | Proposed Perceptual Category")
print("--------------------------------------------------------------------")
# Print one example per category: the first feature listed for it, in file order
seen = set()
for feature, category in zip(result_df['Acoustic feature'], result_df['Proposed perceptual category']):
    if category not in seen:
        print(f"{feature:<35} | {category}")
        seen.add(category)

Acoustic Feature                    | Proposed Perceptual Category
--------------------------------------------------------------------
F0final_sma_amean                   | melodiousness
F0final_sma_de_stddev               | tonal_stability
audSpec_Rfilt_sma_de[0]_amean       | rhythmic_complexity
audSpec_Rfilt_sma_de[10]_amean      | articulation
logHNR_sma_de_stddev                | dissonance
pcm_RMSenergy_sma_amean             | rhythmic_stability


In [None]:
for index, row in result_df.iterrows():
    print(f"{row['Acoustic feature']:<35} | {row['Proposed perceptual category']}")

In [36]:
# Count the occurrences of each unique value in the specified column
perceptual_counts = filtered_df['Proposed perceptual category'].value_counts()

print("Sum of each unique perceptual category ('High' & 'Medium' Confidence):\n")
print(perceptual_counts)

Sum of each unique perceptual category ('High' & 'Medium' Confidence):

Proposed perceptual category
articulation           48
rhythmic_complexity    30
dissonance             16
tonal_stability         8
melodiousness           4
rhythmic_stability      2
Name: count, dtype: int64


In [51]:
# build the new hierarchy
hierarchy = build_hierarchy_map(features_2080)
hierarchy.to_csv("../data/hierarchy_map.csv", index=False)

152 unmapped features saved to ../results/unmapped_features.csv

=== Feature Hierarchy Mapping Summary ===
Total features: 260
Mapped features: 108 (41.5%)
Unmapped features: 152 (58.5%)

=== Perceptual Dimension Distribution ===
  articulation        :  48 cores
  rhythmic_complexity :  30 cores
  dissonance          :  16 cores
  tonal_stability     :   8 cores
  melodiousness       :   4 cores
  rhythmic_stability  :   2 cores
  minorness           :   0 cores
  unmapped            : 152 cores


#### add mode as minorness for both feature sets and update the mapping to include it

In [74]:
mode_df = pd.read_parquet("../data/processed/audio_metadata.parquet")
mode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1802 entries, 0 to 1801
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   song_id         1802 non-null   int64  
 1   key             1802 non-null   object 
 2   mode            1802 non-null   object 
 3   key_confidence  1802 non-null   float64
 4   bpm             1802 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 70.5+ KB


In [75]:
# Build the minorness table: one row per song with a confidence-weighted minor-mode score.
# is_minor is 1 when the mode (compared case-insensitively) is 'minor', otherwise 0;
# minorness_score is is_minor * key_confidence, so every non-minor song scores 0.
m = (
    mode_df[['song_id', 'mode', 'key_confidence']]
    .assign(is_minor=lambda t: t['mode'].str.lower().eq('minor').astype(int))
    .assign(minorness_score=lambda t: t['is_minor'] * t['key_confidence'])
)

In [76]:
m.head()

Unnamed: 0,song_id,mode,key_confidence,is_minor,minorness_score
0,2,minor,0.259305,1,0.259305
1,3,minor,0.706457,1,0.706457
2,4,minor,0.983184,1,0.983184
3,5,major,0.997981,0,0.0
4,7,major,0.778527,0,0.0


In [77]:
# Set song_id as index in m before merging.
# Guarded so the cell can be re-run: once song_id is the index it is no longer a column.
if 'song_id' in m.columns:
    m = m.set_index('song_id')
# Left-join the score onto the features by song id (index on both sides).
# Any earlier copy of the column is dropped first; otherwise a re-run would produce
# minorness_score_x / minorness_score_y instead of a single column.
features_2080 = (
    features_2080
    .drop(columns=['minorness_score'], errors='ignore')
    .merge(m[['minorness_score']], left_index=True, right_index=True, how='left')
)

In [79]:
len(features_2080.columns)

2081

In [85]:
# Left-join the minorness score onto the 3900 set by song id (index on both sides).
# Any earlier copy of the column is dropped first; otherwise a re-run would produce
# minorness_score_x / minorness_score_y instead of a single column.
features_3900 = (
    features_3900
    .drop(columns=['minorness_score'], errors='ignore')
    .merge(m[['minorness_score']], left_index=True, right_index=True, how='left')
)

In [86]:
len(features_3900.columns)

3901

In [110]:
# go up and re-run the cells that create the new decorrelated and pca datasets, then re-run all experiment notebooks

In [115]:
print(features_decorr_2080.shape, features_decorr_3900.shape)
print(features_pca_grouped_2080.shape, features_pca_grouped_3900.shape)
print(features_pca_global_2080.shape, features_pca_global_3900.shape)

(1802, 1692) (1802, 3512)
(1802, 187) (1802, 187)
(1802, 243) (1802, 318)


Decorrelation and PCA reduce the two feature sets in very similar ways:
- PCA per group yields the same number of columns for both sets (187 each)
- decorr: 2080 to 1692 (~19% reduction) | 3900 to 3512 (~10% reduction)
- global PCA keeps only ~11.7% of the original feature count for the 2080 set (243 of 2080), and ~8.2% for the 3900 set (318 of 3900)

In [119]:
# add mode to the newly created datasets too.

In [117]:
def _with_minorness(frame):
    """Return `frame` with a single minorness_score column joined on the song-id index.

    An existing minorness_score column is dropped first. A dataset derived from an
    already-enriched feature set, or a second run of this cell, therefore ends up with
    one column rather than minorness_score_x / minorness_score_y.
    """
    return frame.drop(columns=['minorness_score'], errors='ignore').merge(
        m[['minorness_score']], left_index=True, right_index=True, how='left'
    )

features_decorr_2080 = _with_minorness(features_decorr_2080)
features_decorr_3900 = _with_minorness(features_decorr_3900)
features_pca_grouped_2080 = _with_minorness(features_pca_grouped_2080)
features_pca_grouped_3900 = _with_minorness(features_pca_grouped_3900)
features_pca_global_2080 = _with_minorness(features_pca_global_2080)
features_pca_global_3900 = _with_minorness(features_pca_global_3900)

In [118]:
print(features_decorr_2080.shape, features_decorr_3900.shape)
print(features_pca_grouped_2080.shape, features_pca_grouped_3900.shape)
print(features_pca_global_2080.shape, features_pca_global_3900.shape)

(1802, 1693) (1802, 3513)
(1802, 188) (1802, 188)
(1802, 244) (1802, 319)
