In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules (including
# the project's `src` package) are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Setup

In [2]:
import itertools
import os

import datasets
import numpy as np
import pandas as pd

from src.dataset import prepare_ds, add_audio_column
from src.train import get_evaluator, get_model
from src.utils import get_csv_name, get_model_name

In [3]:
# How many classes are kept for the "genre" and "category" dataset variants.
TOP_N_GENRES = 6
TOP_N_FEATURES = 9

# Root folder of every on-disk resource used by this notebook.
RES_DIR_PATH = "res"
NOTEBOOK_ENV = "jupyter"

# Resource locations, all resolved relative to RES_DIR_PATH.
CSV_PATH = os.path.join(RES_DIR_PATH, "samples_clustered.csv")
AUDIOS_DIR_PATH = os.path.join(RES_DIR_PATH, "mp3_data")
DATASETS_DIR_PATH = os.path.join(RES_DIR_PATH, "datasets")
MODELS_DIR_PATH = os.path.join(RES_DIR_PATH, "models")

# One feature configuration per dataset variant. Each maps a target column to
# its selection settings.
# NOTE(review): presumably "top_n" is the number of classes kept and "samples" a
# cap on the rows used (None = no cap) - confirm against get_csv_name / prepare_ds.
FEATURES_CONFIGS = {
    "subset": {
        "genre": {"top_n": 3, "samples": 1000},
    },
    "genre": {
        "genre": {"top_n": TOP_N_GENRES, "samples": None},
    },
    "category": {
        "category": {"top_n": TOP_N_FEATURES, "samples": None},
    },
}

# Evaluation

In [4]:
# Shared training/evaluation settings. The three entries left as None are
# overwritten for each model by the evaluation loop further down.
TRAINING_CONFIG = dict(
    epochs=20,
    learning_rate=5e-5,
    warmup=0.0,
    train_batch_size=8,
    eval_batch_size=16,
    feature_encoder=None,
    freeze_encoder=None,
    classifier_layers=None,
    classifier_dropout=0.0,
)

In [5]:
# For each dataset variant, derive the CSV filename from that variant's feature
# configuration and load the file into a DataFrame, keyed by variant name.
dfs = {
    variant: pd.read_csv(get_csv_name(FEATURES_CONFIGS[variant], CSV_PATH))
    for variant in ("subset", "category", "genre")
}

In [6]:
# Helper that loads and prepares the dataset for the requested model

def load_and_prepare_ds(training_config, feature_config, df, clustered=True):
    """Load the pre-encoded dataset for an encoder and prepare it for a feature config.

    The dataset is read from ``DATASETS_DIR_PATH/ds-<feature_encoder>-full-encoded``,
    passed through ``add_audio_column`` (pointing at ``AUDIOS_DIR_PATH``) and then
    through ``prepare_ds``. Nothing is saved (``save=False``) and no fixed label
    mapping is supplied (``fixed_mapping=None``).

    Args:
        training_config: Mapping; only its ``"feature_encoder"`` entry is read.
        feature_config: Feature configuration forwarded to ``prepare_ds``.
        df: DataFrame forwarded to ``prepare_ds``.
        clustered: Forwarded unchanged to ``prepare_ds``.

    Returns:
        Whatever ``prepare_ds`` returns.
    """
    encoder_name = training_config["feature_encoder"]
    ds_dir_name = f"ds-{encoder_name}-full-encoded"

    encoded_ds = datasets.load_from_disk(os.path.join(DATASETS_DIR_PATH, ds_dir_name))
    encoded_ds = add_audio_column(
        encoded_ds,
        audios_dir_path=AUDIOS_DIR_PATH,
        training_config={"feature_encoder": encoder_name},
    )

    return prepare_ds(
        encoded_ds,
        df,
        feature_config,
        clustered=clustered,
        fixed_mapping=None,
        save=False,
    )

## Metrics

In [8]:
# Grid of settings to evaluate; the evaluation loop takes the Cartesian product
# of these four lists.
combinations = dict(
    feature_encoder=["wav2vec2", "whisper"],
    freeze_encoder=[True, False],
    classifier_layers=[[256], [256, 256]],
    dataset=["subset", "genre", "category"],
)

In [None]:
# Compute metrics for every network in every split of the dataset.
#
# `stats` maps model name -> split name -> {"loss", "acc", "preds_id", "preds_label"}.
# NOTE(review): `model_name` is built from TRAINING_CONFIG, which holds no dataset
# entry in this notebook. Unless get_model_name encodes the dataset some other way,
# results for different datasets are written under the same key and overwrite each
# other - confirm.
stats = {}

# Retained for backward compatibility with any later cell that reads them: they
# track the (encoder, dataset) pair and prepared dataset used most recently.
ds_type = None
ds_encoder = None
prepared_ds = None

# Prepared datasets keyed by (feature encoder, dataset name).
# itertools.product advances its LAST factor ("dataset") fastest, so comparing
# against the previous iteration never hits and the dataset would be rebuilt on
# every iteration. A keyed cache loads each pair at most once.
prepared_ds_cache = {}

# The factors are looked up by key rather than via `combinations.values()`, so
# the loop does not depend on the insertion order of the `combinations` dict.
for feature_encoder, freeze_encoder, classifier_layers, dataset in itertools.product(
    combinations["feature_encoder"],
    combinations["freeze_encoder"],
    combinations["classifier_layers"],
    combinations["dataset"],
):
    TRAINING_CONFIG["feature_encoder"] = feature_encoder
    TRAINING_CONFIG["freeze_encoder"] = freeze_encoder
    TRAINING_CONFIG["classifier_layers"] = classifier_layers

    model_name = get_model_name(TRAINING_CONFIG)
    model_path = os.path.join(MODELS_DIR_PATH, model_name)
    TRAINING_CONFIG["model_path"] = model_path

    # Check for the checkpoint before touching the dataset: there is nothing to
    # evaluate for a model that was never trained, so loading data would be wasted.
    if not os.path.exists(model_path):
        continue

    cache_key = (feature_encoder, dataset)
    if cache_key not in prepared_ds_cache:
        prepared_ds_cache[cache_key] = load_and_prepare_ds(
            TRAINING_CONFIG, FEATURES_CONFIGS[dataset], dfs[dataset]
        )
    prepared_ds = prepared_ds_cache[cache_key]
    ds_encoder, ds_type = cache_key

    print(f"Loading {model_path} weights")

    stats[model_name] = {}
    model = get_model(TRAINING_CONFIG, prepared_ds["train"])
    trainer = get_evaluator(
        model=model,
        training_config=TRAINING_CONFIG,
    )

    for split in ["train", "valid", "test"]:
        outputs = trainer.evaluate(prepared_ds[split])

        # NOTE(review): in Hugging Face prediction outputs, `label_ids` holds the
        # ground-truth labels, not the model predictions. If the object returned
        # by this evaluator follows that convention, "preds_id"/"preds_label"
        # actually contain the true labels - confirm against get_evaluator.
        preds_label = np.array([model.config.id2label[idx] for idx in outputs.label_ids])

        stats[model_name][split] = {
            "loss": outputs.metrics["eval_loss"],
            "acc": outputs.metrics["eval_accuracy"],
            "preds_id": outputs.label_ids,
            "preds_label": preds_label,
        }

### Sampled Dataset

| Model    | Fine-tuned   | Classifier   | Split   |   Accuracy |   Loss |
|:---------|:-------------|:-------------|:--------|-----------:|-------:|
| wav2vec2 | No           | c256         | train   |     0.5432 | 0.9376 |
| wav2vec2 | Yes          | c256         | train   |     0.9437 | 0.232  |
| wav2vec2 | Yes          | c256_256     | train   |     0.4781 | 0.9955 |
| whisper  | No           | c256         | train   |     0.5607 | 1.0336 |
| whisper  | Yes          | c256         | train   |     0.9962 | 0.0194 |
| whisper  | Yes          | c256_256     | train   |     0.9975 | 0.0169 |
| wav2vec2 | No           | c256         | valid   |     0.54   | 0.9379 |
| wav2vec2 | Yes          | c256         | valid   |     0.86   | 0.5778 |
| wav2vec2 | Yes          | c256_256     | valid   |     0.55   | 0.9236 |
| whisper  | No           | c256         | valid   |     0.58   | 1.0245 |
| whisper  | Yes          | c256         | valid   |     0.84   | 1.0213 |
| whisper  | Yes          | c256_256     | valid   |     0.89   | 0.6018 |
| wav2vec2 | No           | c256         | test    |     0.54   | 0.9182 |
| wav2vec2 | Yes          | c256         | test    |     0.76   | 0.9124 |
| wav2vec2 | Yes          | c256_256     | test    |     0.46   | 0.9201 |
| whisper  | No           | c256         | test    |     0.63   | 1.0249 |
| whisper  | Yes          | c256         | test    |     0.85   | 0.9321 |
| whisper  | Yes          | c256_256     | test    |     0.81   | 1.244  |

### Genre Classification

| Model   | Fine-tuned   | Classifier   | Split   |   Accuracy |   Loss |
|:--------|:-------------|:-------------|:--------|-----------:|-------:|
| whisper | Yes          | c256         | train   |     0.9458 | 0.194  |
| whisper | Yes          | c256         | valid   |     0.8393 | 0.5841 |
| whisper | Yes          | c256         | test    |     0.83   | 0.646  |

### Category Classification

| Model   | Fine-tuned   | Classifier   | Split   |   Accuracy |   Loss |
|:--------|:-------------|:-------------|:--------|-----------:|-------:|
| whisper | Yes          | c256         | train   |     0.9774 | 0.0938 |
| whisper | Yes          | c256         | valid   |     0.9251 | 0.3386 |
| whisper | Yes          | c256         | test    |     0.9185 | 0.3587 |


# Conclusion

- The encoding backbone is a major factor - Whisper achieves better results with less computational time
- Fine-tuning the models is key for results - Since the pretrained models were trained on speech, further training of the encoders on music audio can greatly improve performance
- Different classification heads don't seem to affect the training much
- Classification on `category` converged much faster than on `genre` - This might be because of the ambiguous nature of genres, which can be clearly identifiable in some edge cases but almost impossible to tell apart in others

## Potential Improvements

- Different preprocessing of the dataset: `genre` and `category` were grouped arbitrarily and could be grouped differently
- More hyperparameter tuning:
    - A deeper classification head might improve the performance of frozen-backbone architectures
- Heavier regularization: the only source of regularization performed in this work is early stopping
- Other Transformer encoders pretrained for speech recognition (speech-to-text) may be tried out