In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

# Setup

In [2]:
import itertools
import os

import datasets
import numpy as np
import pandas as pd

from src.dataset import prepare_ds, add_audio_column
from src.train import get_evaluator, get_model
from src.utils import get_csv_name, get_model_name

In [3]:
RES_DIR_PATH = "res"
NOTEBOOK_ENV = "jupyter"

AUDIOS_DIR_PATH = os.path.join(RES_DIR_PATH, "mp3_data")
MODELS_DIR_PATH = os.path.join(RES_DIR_PATH, "models")
DATASETS_DIR_PATH = os.path.join(RES_DIR_PATH, "datasets")
CSV_PATH = os.path.join(RES_DIR_PATH, "samples_clustered.csv")

FEATURES_CONFIG = {"genre": {"top_n": 3, "samples": 1000}}

VALID_SIZE = 0.1
TEST_SIZE = 0.1

# Evaluation

In [4]:
TRAINING_CONFIG = {
    "epochs": 20,
    "learning_rate": 5e-5,
    "warmup": 0.0,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    "feature_encoder": None,
    "freeze_encoder": None,
    "classifier_layers": None, 
    "classifier_dropout": 0.0,
}

In [5]:
# Build the filename indicating the subset of the whole dataset with the specific configurations
filtered_csv_path = get_csv_name(FEATURES_CONFIG, CSV_PATH)
df = pd.read_csv(filtered_csv_path)
df.head()

Unnamed: 0,path,duration,id,genre,split
0,Jam Pack Remix Tools/DnB Minimal Bass.mp3,5.746939,Jam_Pack_Remix_Tools_DnB_Minimal_Bass,Electronic,train
1,Jam Pack Remix Tools/Electro Kicker Beat 02.mp3,3.526531,Jam_Pack_Remix_Tools_Electro_Kicker_Beat_02,Electronic,train
2,07 Chillwave/Filtered Bit Bass.mp3,4.884898,07_Chillwave_Filtered_Bit_Bass,Electronic,train
3,Jam Pack Remix Tools/Garage Friday Organ 03.mp3,3.73551,Jam_Pack_Remix_Tools_Garage_Friday_Organ_03,Electronic,train
4,Jam Pack Remix Tools/2-Step Flux Beat 01.mp3,7.209796,Jam_Pack_Remix_Tools_2-Step_Flux_Beat_01,Electronic,train


In [6]:
# Create a function for loading the dataset for the requested model

def load_and_prepare_ds(training_config, feature_config, df, clustered=True):
    encoded_dataset_path = os.path.join(DATASETS_DIR_PATH, f"ds-{training_config['feature_encoder']}-full-encoded")
    ds = datasets.load_from_disk(encoded_dataset_path)
    ds = add_audio_column(ds, audios_dir_path=AUDIOS_DIR_PATH, training_config={"feature_encoder": training_config['feature_encoder']})
    return prepare_ds(ds, df, feature_config, clustered=clustered, fixed_mapping=None, save=False)

## Metrics

In [7]:
combinations = {
    "feature_encoder": ["wav2vec2", "whisper"],
    "freeze_encoder": [True, False],
    "classifier_layers": [[256], [256, 256]], 
}

In [8]:
# Compute metrics for every network in every split of the dataset
stats = {}
ds_type = None
prepared_ds = None

for conf in itertools.product(*combinations.values()):
    TRAINING_CONFIG["feature_encoder"] = conf[0]
    TRAINING_CONFIG["freeze_encoder"] = conf[1]
    TRAINING_CONFIG["classifier_layers"] = conf[2]
    model_name = get_model_name(TRAINING_CONFIG)
    model_path = os.path.join(MODELS_DIR_PATH, model_name)
    print("looking for", model_path)
    TRAINING_CONFIG["model_path"] = model_path
    
    if ds_type != TRAINING_CONFIG["feature_encoder"]:
        ds_type = TRAINING_CONFIG["feature_encoder"]
        prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG, df)
    
    if os.path.exists(model_path):
        print(f"Loading {model_path} weights")
        
        stats[model_name] = {}
        model = get_model(TRAINING_CONFIG, prepared_ds["train"])
        trainer = get_evaluator(
            model=model,
            training_config=TRAINING_CONFIG,
        )

        for split in ["train", "valid", "test"]:
            outputs = trainer.evaluate(prepared_ds[split])
            
            preds_label = np.array([model.config.id2label[idx] for idx in outputs.label_ids])

            stats[model_name][split] = {
                "loss": outputs.metrics["eval_loss"],
                "acc": outputs.metrics["eval_accuracy"],
                "preds_id": outputs.label_ids,
                "preds_label": preds_label,
            }

looking for res/models/wav2vec2-frz-c256-d0
Removing extra columns from dataset
Mapping features clusters




Map:   0%|          | 0/20636 [00:00<?, ? examples/s]

Extracting train split
Extracting valid split
Extracting test split
Create `ClassLabels` for target classes
{'genre': ClassLabel(names=['Electronic', 'Rock/Blues', 'World/Ethnic'], id=None)}


Casting the dataset:   0%|          | 0/799 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Loading res/models/wav2vec2-frz-c256-d0 weights


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 799
  Batch size = 16


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mastockman[0m. Use [1m`wandb login --relogin`[0m to force relogin


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


looking for res/models/wav2vec2-frz-c256_256-d0
looking for res/models/wav2vec2-fnt-c256-d0
Loading res/models/wav2vec2-fnt-c256-d0 weights


loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


looking for res/models/wav2vec2-fnt-c256_256-d0
Loading res/models/wav2vec2-fnt-c256_256-d0 weights


loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


looking for res/models/whisper-frz-c256-d0


loading configuration file preprocessor_config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/302560528ac75a251232980ebcc68bad9668f664/preprocessor_config.json
Feature extractor WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "mel_filters": [
    [
      -0.0,
      0.02486259490251541,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0

Removing extra columns from dataset
Mapping features clusters
Extracting train split
Extracting valid split
Extracting test split
Create `ClassLabels` for target classes
{'genre': ClassLabel(names=['Electronic', 'Rock/Blues', 'World/Ethnic'], id=None)}


Casting the dataset:   0%|          | 0/799 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Loading res/models/whisper-frz-c256-d0 weights


loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/302560528ac75a251232980ebcc68bad9668f664/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-tiny",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_lengt

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


looking for res/models/whisper-frz-c256_256-d0
looking for res/models/whisper-fnt-c256-d0
Loading res/models/whisper-fnt-c256-d0 weights


loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/302560528ac75a251232980ebcc68bad9668f664/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-tiny",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_lengt

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


looking for res/models/whisper-fnt-c256_256-d0
Loading res/models/whisper-fnt-c256_256-d0 weights


loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/302560528ac75a251232980ebcc68bad9668f664/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-tiny",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_lengt

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: id, duration. If id, duration are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


In [9]:
for model_name, split in stats.items():
    print("="*50)
    print(model_name)

    for split_name, detail in split.items():
        print("-"*50)
        print(split_name)
        print(f'Loss: {detail["loss"]}')
        print(f'Accuracy: {detail["acc"]}')

wav2vec2-frz-c256-d0
--------------------------------------------------
train
Loss: 0.9375913739204407
Accuracy: 0.5431789737171464
--------------------------------------------------
valid
Loss: 0.9378825426101685
Accuracy: 0.54
--------------------------------------------------
test
Loss: 0.9182183146476746
Accuracy: 0.54
wav2vec2-fnt-c256-d0
--------------------------------------------------
train
Loss: 0.23199911415576935
Accuracy: 0.9436795994993742
--------------------------------------------------
valid
Loss: 0.5777797698974609
Accuracy: 0.86
--------------------------------------------------
test
Loss: 0.9123861789703369
Accuracy: 0.76
wav2vec2-fnt-c256_256-d0
--------------------------------------------------
train
Loss: 0.9954535365104675
Accuracy: 0.4780976220275344
--------------------------------------------------
valid
Loss: 0.9236339330673218
Accuracy: 0.55
--------------------------------------------------
test
Loss: 0.9200836420059204
Accuracy: 0.46
whisper-frz-c256-d0


## Wrongly Classified Samples

In [55]:
df_misclasses = []
for model_name in stats.keys():
    for split in stats[model_name].keys():
        preds_label = stats[model_name][split]["preds_label"]
        df_split = df[df["split"] == split]
        df_misclass = df_split[df_split["genre"] != preds_label].reset_index(drop=True)
        print(df_misclass["genre"])

0    Electronic/Dance
1        World/Ethnic
Name: genre, dtype: object
0        World/Ethnic
1    Electronic/Dance
2          Rock/Blues
3    Electronic/Dance
4          Rock/Blues
Name: genre, dtype: object
0          Rock/Blues
1    Electronic/Dance
2    Electronic/Dance
3        World/Ethnic
Name: genre, dtype: object


# Conclusion

The conclusions drawn from the projects are the following:
- Fine-tuning is key, probably due to
- A more elaborated classifier head doesn't improve : the expresiveness of Transformers is fine by itself in 
- The Whisper architecture faster and more accurate

Further analysis should regard more :
- More hyperparameter tuning
- By the ambiguous nature of genre of music
- Heavier regularization
- Data augmentation