## Scale Magnitude Transform for genre/rythmic pattern classification


In [None]:
%load_ext autoreload
%autoreload 2

import mirdata
import librosa
import helpers
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, LeaveOneOut
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay

## Groove Midi

_Jon Gillick, Adam Roberts, Jesse Engel, Douglas Eck, and David Bamman.
"Learning to Groove with Inverse Sequence Transformations."
International Conference on Machine Learning (ICML), 2019._

The Groove MIDI Dataset (GMD) is composed of 13.6 hours of aligned MIDI and (synthesized) audio of human-performed, tempo-aligned expressive drumming. The dataset contains 1,150 MIDI files and over 22,000 measures of drumming.

It could be used to classify fills or beats. Keep in mind that while fills tend to have a short duration (few seconds), beats tend to be longer. Therefore beats should be segmented in shorter chunks?


In [None]:
groove_dataset = mirdata.initialize("groove_midi")
# groove_dataset.download()
groove_dataset.validate()

track.style -> a string style for the performance formatted as “primary/secondary” (e.g. rock/halftime, funk/purdieshuffle). The primary style comes from the Genre List below.

Genre List: afrobeat, afrocuban, blues, country, dance, funk, gospel, highlife, hiphop, jazz, latin, middleeastern, neworleans, pop, punk, reggae, rock, soul

For the following experiment the label will consist of the primary style only.


In [None]:
features = []
labels = []
for _, track in tqdm(groove_dataset.load_tracks().items()):
    if track.beat_type == "fill":
        try:
            y, sr = librosa.load(track.audio_path, sr=8000)
            # stm = helpers.compute_stm(y=y, sr=sr, auto_cor_lag_seconds=2)[:30]
            # features.append(stm)
            features.append(helpers.compute_stm(y=y, sr=sr, win_size=256, hop=128, auto_cor_lag_seconds=5)[:90])
            labels.append(track.style.split("/")[0])
        except Exception as e:
            # exception encoutered with invalid audio_path
            print("Error:", e)
            continue

encoded_labels = LabelEncoder().fit_transform(labels)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    np.array(features), encoded_labels, test_size=0.3, stratify=encoded_labels, random_state=42
)

knn = KNeighborsClassifier(n_neighbors=7, metric="cosine")
# rnn = RadiusNeighborsClassifier(n_neighbors=7, metric="cosine").fit(X=X_train, y=y_train)

k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn, features, encoded_labels, cv=k_fold, scoring="accuracy")

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

knn = KNeighborsClassifier(n_neighbors=7, metric="cosine").fit(X=x_train, y=y_train)
ConfusionMatrixDisplay.from_estimator(knn, x_test, y_test, display_labels=set(labels))
plt.xticks(rotation=90)
plt.show()

## Ballroom


In [7]:
ballroom_dataset = mirdata.initialize("ballroom")
# ballroom_dataset.download()
ballroom_dataset.validate()

100%|██████████| 698/698 [00:08<00:00, 80.14it/s] 
INFO: Success: the dataset is complete and all files are valid.
INFO: --------------------


({'tracks': {}}, {'tracks': {}})

In [None]:
features = []
labels = []
for _, track in tqdm(ballroom_dataset.load_tracks().items()):
    y, sr = librosa.load(track.audio_path, sr=8000)
    # note that for this dataset, the number of coefs has been set to 170, as reported in the paper
    features.append(helpers.compute_stm(y=y, sr=sr, win_size=256, hop=128, auto_cor_lag_seconds=5)[:90])
    labels.append(track.genre)

encoded_labels = LabelEncoder().fit_transform(labels)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    np.array(features), encoded_labels, test_size=0.3, stratify=encoded_labels, random_state=42
)

knn = KNeighborsClassifier(n_neighbors=7, metric="cosine")
# rnc = RadiusNeighborsClassifier(metric="cosine", radius=20)

loo = LeaveOneOut()
# k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn, features, labels, cv=loo, scoring="accuracy")

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

knn = KNeighborsClassifier(metric="cosine").fit(X=x_train, y=y_train)
# rnc = RadiusNeighborsClassifier(metric="cosine", radius=10).fit(X=x_train, y=y_train)

ConfusionMatrixDisplay.from_estimator(knn, x_test, y_test, display_labels=set(labels))
plt.xticks(rotation=90)
plt.show()

## GridSearch CV on Ballroom

In [8]:
# Define a grid of parameters to search over
param_grid = {
    'n_mels': [50, 75, 100],      # Example values for n_mels
    'auto_cor_lag_seconds': [5, 8, 13],  # Example values for auto_cor_lag_seconds
    'stm_coefs' : [i for i in range(30, 400, 30)]
}

best_score = -1
best_params = None

# Iterate over all combinations of parameters
for n_mels in param_grid['n_mels']:
    for auto_cor_lag_seconds in param_grid['auto_cor_lag_seconds']:
        for stm_coefs in param_grid['stm_coefs']:
            print("n_mels:", n_mels, "auto_cor_lag_seconds:", auto_cor_lag_seconds, "stm_coefs:", stm_coefs)
            # Compute features with current parameters
            features = []
            labels = []
            for _, track in tqdm(ballroom_dataset.load_tracks().items()):
                y, sr = librosa.load(track.audio_path, sr=8000)
                features.append(helpers.compute_stm(y=y, sr=sr, win_size=160, hop=80, n_mels=n_mels, auto_cor_lag_seconds=auto_cor_lag_seconds)[:stm_coefs])
                labels.append(track.genre)

            # Encode labels
            encoded_labels = LabelEncoder().fit_transform(labels)

            # Initialize and evaluate the classifier
            knn = KNeighborsClassifier(n_neighbors=7, metric="cosine")
            loo = LeaveOneOut()
            cv_scores = cross_val_score(knn, features, labels, cv=loo, scoring="accuracy")
            mean_cv_score = cv_scores.mean()
            print("Mean CV Accuracy:", mean_cv_score)

            # Update best score and parameters if needed
            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_params = {
                    'n_mels': n_mels,
                    'auto_cor_lag_seconds': auto_cor_lag_seconds
                }

# Output the best parameters and score
print("Best Parameters:", best_params)
print("Best Mean CV Accuracy:", best_score)

n_mels: 50 auto_cor_lag_seconds: 5 stm_coefs: 30


100%|██████████| 698/698 [01:30<00:00,  7.75it/s]


Mean CV Accuracy: 0.7664756446991404
n_mels: 50 auto_cor_lag_seconds: 5 stm_coefs: 60


100%|██████████| 698/698 [01:37<00:00,  7.13it/s]


Mean CV Accuracy: 0.7578796561604585
n_mels: 50 auto_cor_lag_seconds: 5 stm_coefs: 90


100%|██████████| 698/698 [01:28<00:00,  7.86it/s]


Mean CV Accuracy: 0.7607449856733525
n_mels: 50 auto_cor_lag_seconds: 5 stm_coefs: 120


  8%|▊         | 54/698 [00:07<01:23,  7.68it/s]


KeyboardInterrupt: 

## Greek/Cretan Dances


In [None]:
cretan_dances_data_path = Path("../datasets/CretanDances")

features = []
labels = []

for subfolder in cretan_dances_data_path.iterdir():
    if subfolder.is_dir():
        label = subfolder.name
        print(label)
        for audio_file in subfolder.glob("*.wav"):
            y, sr = librosa.load(audio_file, sr=None)
            # note that for this dataset, the number of coefs has been set to 30, as reported in the paper
            features.append(helpers.compute_stm(y=y, sr=sr, win_size=160, hop=160)[:30])
            labels.append(label)

encoded_labels = LabelEncoder().fit_transform(labels)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(
    np.array(features), encoded_labels, test_size=0.3, stratify=encoded_labels, random_state=42
)

knn = KNeighborsClassifier(n_neighbors=7, metric="cosine")
# rnn = RadiusNeighborsClassifier(n_neighbors=7, metric="cosine").fit(X=X_train, y=y_train)

loo = LeaveOneOut()
# k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn, features, labels, cv=loo, scoring="accuracy")
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

knn = KNeighborsClassifier(n_neighbors=7, metric="cosine").fit(X=x_train, y=y_train)
ConfusionMatrixDisplay.from_estimator(knn, x_test, y_test, display_labels=set(labels))
plt.xticks(rotation=90)
plt.show()

## GridSearch CV on GreekDances

In [None]:
cretan_dances_data_path = Path("../datasets/CretanDances")

# Define a grid of parameters to search over
param_grid = {
    'n_mels': [50, 75, 100],      # Example values for n_mels
    'auto_cor_lag_seconds': [5, 8, 13],  # Example values for auto_cor_lag_seconds
    'stm_coefs' : [i for i in range(30, 400, 30)]
}

best_score = -1
best_params = None

# Iterate over all combinations of parameters
for n_mels in param_grid['n_mels']:
    for auto_cor_lag_seconds in param_grid['auto_cor_lag_seconds']:
        for stm_coefs in param_grid['stm_coefs']:
            print("n_mels:", n_mels, "auto_cor_lag_seconds:", auto_cor_lag_seconds, "stm_coefs:", stm_coefs)
            # Compute features with current parameters
            features = []
            labels = []
            for subfolder in cretan_dances_data_path.iterdir():
                if subfolder.is_dir():
                    label = subfolder.name
                    print(label)
                    for audio_file in subfolder.glob("*.wav"):
                        y, sr = librosa.load(audio_file, sr=8000)
                        features.append(helpers.compute_stm(y=y, sr=sr, win_size=160, hop=80, n_mels=n_mels, auto_cor_lag_seconds=auto_cor_lag_seconds)[:stm_coefs])
                        labels.append(label)


            # Encode labels
            encoded_labels = LabelEncoder().fit_transform(labels)

            # Initialize and evaluate the classifier
            knn = KNeighborsClassifier(n_neighbors=7, metric="cosine")
            loo = LeaveOneOut()
            cv_scores = cross_val_score(knn, features, labels, cv=loo, scoring="accuracy")
            mean_cv_score = cv_scores.mean()
            print("Mean CV Accuracy:", mean_cv_score)

            # Update best score and parameters if needed
            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_params = {
                    'n_mels': n_mels,
                    'auto_cor_lag_seconds': auto_cor_lag_seconds
                }

# Output the best parameters and score
print("Best Parameters:", best_params)
print("Best Mean CV Accuracy:", best_score)