In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

# Setup

In [2]:
import itertools
import os

import datasets
import numpy as np
import pandas as pd

from src.dataset import prepare_ds
from src.train import get_evaluator, get_model
from src.utils import get_csv_name, get_model_name

In [3]:
RES_DIR_PATH = "res"
NOTEBOOK_ENV = "jupyter"

AUDIOS_DIR_PATH = os.path.join(RES_DIR_PATH, "mp3_data")
MODELS_DIR_PATH = os.path.join(RES_DIR_PATH, "models")
DATASETS_DIR_PATH = os.path.join(RES_DIR_PATH, "datasets")
CSV_PATH = os.path.join(RES_DIR_PATH, "samples.csv")

FEATURES_CONFIG = {"genre": {"top_n": 3, "samples": 1000}}

VALID_SIZE = 0.1
TEST_SIZE = 0.1

# Evaluation

In [4]:
TRAINING_CONFIG = {
    "epochs": 20,
    "learning_rate": 5e-5,
    "warmup": 0.0,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "feature_encoder": None,
    "freeze_encoder": None,
    "classifier_layers": None, 
    "classifier_dropout": 0.0,
}

In [5]:
# Build the filename indicating the subset of the whole dataset with the specific configurations
filtered_csv_path = get_csv_name(FEATURES_CONFIG, CSV_PATH)
df = pd.read_csv(filtered_csv_path)
df.head()

Unnamed: 0,path,genre,id,duration,split
0,Apple Loops for GarageBand/Spacey Electric Pia...,Electronic/Dance,Apple_Loops_for_GarageBand_Spacey_Electric_Pia...,8.097959,valid
1,Keyboard Collection/Comb The World Clav.mp3,Electronic/Dance,Keyboard_Collection_Comb_The_World_Clav,9.691429,train
2,Jam Pack Remix Tools/Trance Dripper Beat 04.mp3,Electronic/Dance,Jam_Pack_Remix_Tools_Trance_Dripper_Beat_04,1.933061,train
3,Jam Pack Remix Tools/Techno Spokes Beat 01.mp3,Electronic/Dance,Jam_Pack_Remix_Tools_Techno_Spokes_Beat_01,1.776327,train
4,25 Step Sequencer/Sneaky Cycles Bass.mp3,Electronic/Dance,25_Step_Sequencer_Sneaky_Cycles_Bass,7.497143,train


In [6]:
# Create a function for loading the dataset for the requested model

def load_and_prepare_ds(model, df):
    encoded_dataset_path = os.path.join(DATASETS_DIR_PATH, f"ds-{model}-full-encoded")
    ds = datasets.load_from_disk(encoded_dataset_path)
    return prepare_ds(ds, df, FEATURES_CONFIG, fixed_mapping=None, save=False)

## Metrics

In [35]:
combinations = {
    "feature_encoder": ["wav2vec2", "whisper"],
    "freeze_encoder": [True, False],
    "classifier_layers": [[256], [256, 256]], 
}

In [41]:
# Compute metrics for every network in every split of the dataset
stats = {}
ds_type = None
prepared_ds = None

for conf in itertools.product(*combinations.values()):
    TRAINING_CONFIG["feature_encoder"] = conf[0]
    TRAINING_CONFIG["freeze_encoder"] = conf[1]
    TRAINING_CONFIG["classifier_layers"] = conf[2]
    model_name = get_model_name(TRAINING_CONFIG)
    model_path = os.path.join(MODELS_DIR_PATH, model_name)
    TRAINING_CONFIG["model_path"] = model_path
    
    if ds_type != TRAINING_CONFIG["feature_encoder"]:
        ds_type = TRAINING_CONFIG["feature_encoder"]
        prepared_ds = load_and_prepare_ds(ds_type, df)
    
    if os.path.exists(model_path):
        print(f"Loading {model_path} weights")
        
        stats[model_name] = {}
        model = get_model(TRAINING_CONFIG, prepared_ds["train"])
        trainer = get_evaluator(
            model=model,
            training_config=TRAINING_CONFIG,
        )

        for split in ["train", "valid", "test"]:
            outputs = trainer.evaluate(prepared_ds[split])
            
            preds_label = np.array([model.config.id2label[idx] for idx in outputs.label_ids])

            stats[model_name][split] = {
                "loss": outputs.metrics["eval_loss"],
                "acc": outputs.metrics["eval_accuracy"],
                "preds_id": outputs.label_ids,
                "preds_label": preds_label,
            }

Loading cached processed dataset at /home/alessandro/Projects/Uni/AII/music-classification/res/datasets/ds-wav2vec2-full-encoded/cache-bca765b857065d3e.arrow
Loading cached processed dataset at /home/alessandro/Projects/Uni/AII/music-classification/res/datasets/ds-wav2vec2-full-encoded/cache-2e979f1eba96989c.arrow
Loading cached processed dataset at /home/alessandro/Projects/Uni/AII/music-classification/res/datasets/ds-wav2vec2-full-encoded/cache-126cb87b100a69fa.arrow


Removing extra columns from dataset
Extracting train split
Extracting test split
Extracting valid split
Create `ClassLabels` for target classes


Loading cached processed dataset at /home/alessandro/Projects/Uni/AII/music-classification/res/datasets/ds-whisper-full-encoded/cache-38cb3a9232d39199.arrow
Loading cached processed dataset at /home/alessandro/Projects/Uni/AII/music-classification/res/datasets/ds-whisper-full-encoded/cache-b67b100035545290.arrow
Loading cached processed dataset at /home/alessandro/Projects/Uni/AII/music-classification/res/datasets/ds-whisper-full-encoded/cache-5c1d727b919b0b51.arrow


Removing extra columns from dataset
Extracting train split
Extracting test split
Extracting valid split
Create `ClassLabels` for target classes
Loading res/models/whisper-frz-c256-d0 weights


loading configuration file config.json from cache at /home/alessandro/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/ada5a5d516772e41f9aeb0f984df6ecc4620001f/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-tiny",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length"

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: path, id, duration. If path, id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: path, id, duration. If path, id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8
  Batch size = 16


## Wrongly Classified Samples

In [55]:
df_misclasses = []
for model_name in stats.keys():
    for split in stats[model_name].keys():
        preds_label = stats[model_name][split]["preds_label"]
        df_split = df[df["split"] == split]
        df_misclass = df_split[df_split["genre"] != preds_label].reset_index(drop=True)
        print(df_misclass["genre"])

0    Electronic/Dance
1        World/Ethnic
Name: genre, dtype: object
0        World/Ethnic
1    Electronic/Dance
2          Rock/Blues
3    Electronic/Dance
4          Rock/Blues
Name: genre, dtype: object
0          Rock/Blues
1    Electronic/Dance
2    Electronic/Dance
3        World/Ethnic
Name: genre, dtype: object


# Conclusion

The conclusions drawn from the projects are the following:
- Fine-tuning is key, probably due to
- A more elaborated classifier head doesn't improve : the expresiveness of Transformers is fine by itself in 
- The Whisper architecture faster and more accurate

Further analysis should regard more :
- More hyperparameter tuning
- By the ambiguous nature of genre of music
- Heavier regularization
- Data augmentation