In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

# Setup

In [2]:
import os

import datasets
import pandas as pd

from src.dataset import add_audio_column, filter_df, prepare_ds, split_df
from src.train import end_training, get_model, get_trainer
from src.utils import get_csv_name, get_run_name

In [3]:
# Root directory of the notebook resources and the environment tag forwarded to `get_trainer`
RES_DIR_PATH = "res"
NOTEBOOK_ENV = "jupyter"

# Resource sub-directories: raw audio files, trained models, encoded datasets
AUDIOS_DIR_PATH, MODELS_DIR_PATH, DATASETS_DIR_PATH = (
    os.path.join(RES_DIR_PATH, sub_dir) for sub_dir in ("mp3_data", "models", "datasets")
)

# CSV listing the samples; read by `create_or_load_df`
CSV_PATH = os.path.join(RES_DIR_PATH, "samples_clustered.csv")

# Number of top values kept per feature in the whole-dataset configurations
TOP_N_GENRES = 6
TOP_N_FEATURES = 9

# Feature-filtering configurations (`samples=None` presumably means "no cap" -- confirm in `filter_df`)
FEATURES_CONFIG_SUBSET = {
    "genre": {"top_n": 3, "samples": 1000},
}
FEATURES_CONFIG_GEN = {
    "genre": {"top_n": TOP_N_GENRES, "samples": None},
}
FEATURES_CONFIG_CAT = {
    "category": {"top_n": TOP_N_FEATURES, "samples": None},
}

# Sizes of the validation and test partitions passed to `split_df`
VALID_SIZE = 0.1
TEST_SIZE = 0.1

# Preparation

In [None]:
# Baseline hyper-parameters for the subset experiments. The entries left as
# `None` are filled in by the per-experiment configuration cells.
TRAINING_CONFIG = dict(
    epochs=20,
    learning_rate=5e-5,
    warmup=0.0,
    train_batch_size=8,
    eval_batch_size=16,
    feature_encoder=None,
    freeze_encoder=None,
    classifier_layers=None,
    classifier_dropout=None,
)

In [None]:
def create_or_load_df(features_config):
    """Return the filtered, split DataFrame for ``features_config``, caching it as CSV.

    The cache path is derived from ``features_config`` and ``CSV_PATH`` via
    ``get_csv_name``. If that file exists it is loaded directly; otherwise the
    full CSV at ``CSV_PATH`` is read and filtered with ``filter_df``. When the
    resulting frame has no ``split`` column, it is partitioned with ``split_df``
    using ``VALID_SIZE`` and ``TEST_SIZE``.

    The cache file is written at most once per call, and only when the frame
    was newly filtered and/or newly split.

    Args:
        features_config: Feature-filtering configuration forwarded to
            ``get_csv_name`` and ``filter_df``.

    Returns:
        The DataFrame, containing a ``split`` column.
    """
    filtered_csv_path = get_csv_name(features_config, CSV_PATH)
    # Tracks whether the on-disk cache is missing or out of date
    needs_save = False

    # If the subset is already in the filesystem, load it directly
    if os.path.exists(filtered_csv_path):
        print(f"Loading {filtered_csv_path}")
        df = pd.read_csv(filtered_csv_path)
    else:
        # Filter the dataset according to the given configuration and remove rows containing null values
        df = filter_df(
            pd.read_csv(CSV_PATH),
            remove_nones=True,
            features_config=features_config,
        )
        needs_save = True

    print(f"{len(df)} examples in DataFrame")

    # If the split column is not in the dataset, split the dataset into three partitions using
    # `TEST_SIZE` and `VALID_SIZE`
    if "split" not in df.columns:
        df = split_df(df, validation_size=VALID_SIZE, test_size=TEST_SIZE)
        needs_save = True

    # Single write covering both the filtering and the splitting step
    if needs_save:
        df.to_csv(filtered_csv_path, index=False)

    print(df.value_counts("split"))
    return df

In [None]:
def load_and_prepare_ds(training_config, feature_config, df, clustered=True):
    """Load the pre-encoded dataset for the requested model and prepare it.

    The dataset is read from ``DATASETS_DIR_PATH/ds-<feature_encoder>-full-encoded``,
    passed through ``add_audio_column`` and then handed to ``prepare_ds``
    together with ``df`` and ``feature_config``.

    Args:
        training_config: Mapping that must contain the ``"feature_encoder"`` key.
        feature_config: Feature configuration forwarded to ``prepare_ds``.
        df: DataFrame forwarded to ``prepare_ds``.
        clustered: Forwarded to ``prepare_ds`` unchanged.

    Returns:
        Whatever ``prepare_ds`` returns (called with ``fixed_mapping=None`` and ``save=False``).
    """
    encoder_name = training_config["feature_encoder"]
    ds_dir = os.path.join(DATASETS_DIR_PATH, f"ds-{encoder_name}-full-encoded")

    encoded_ds = datasets.load_from_disk(ds_dir)
    # Only the encoder name is forwarded, not the whole training configuration
    encoded_ds = add_audio_column(
        encoded_ds,
        audios_dir_path=AUDIOS_DIR_PATH,
        training_config={"feature_encoder": encoder_name},
    )
    return prepare_ds(
        encoded_ds,
        df,
        feature_config,
        clustered=clustered,
        fixed_mapping=None,
        save=False,
    )

## Training Variations
- Backbones: [Wav2Vec2](https://arxiv.org/abs/2006.11477) and [Whisper](https://cdn.openai.com/papers/whisper.pdf)
- Fine-tuning: When disabled, gradient computation is turned off for the whole pretrained encoder
- Classification head: Number of layers and hidden dimensions. Each layer is followed by a ReLU activation.

## Backbone Implementation

Both models are based on their [Hugging Face Transformers](https://huggingface.co/docs/transformers) implementation.

The **Wav2Vec2** classifier extends the [Wav2Vec2ForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForSequenceClassification) class, adding support for a custom classification head.

![Wav2Vec2 Architecture](res/report/wav2vec2.png "Wav2Vec2 Architecture")

For **Whisper**, a classifier implementation didn't exist, so I used the internal [WhisperEncoder](https://huggingface.co/docs/transformers/model_doc/whisper) class and used its latent representation as features for classification.

![Whisper Architecture](res/report/whisper.png "Whisper Architecture")

# Training (Subset Dataset)

To try out the different configuration settings, I trained the models on genre classification using a subset of the whole dataset, containing about 1000 audio files and only the 3 most frequent genres:
- World/Ethnic
- Rock/Blues
- Electronic

In [7]:
# Load the cached subset CSV if it exists; otherwise filter the full CSV, split it and cache the result
df = create_or_load_df(FEATURES_CONFIG_SUBSET)

Loading res/samples_clustered_genre3s1000.csv
999 examples in DataFrame
split
train    799
test     100
valid    100
dtype: int64


## Wav2Vec2

In [8]:
# Wav2Vec2 baseline: frozen encoder, one 256-dimensional layer in the classification head, no dropout
TRAINING_CONFIG.update(
    feature_encoder="wav2vec2",
    freeze_encoder=True,
    classifier_layers=[256],
    classifier_dropout=0,
)

In [None]:
# Load the pre-encoded dataset for the configured feature encoder and prepare it with the subset DataFrame
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_SUBSET, df)

# Display the prepared dataset
prepared_ds

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

## Fine Tuning

In [11]:
# Un-freeze the encoder so the next run fine-tunes it
TRAINING_CONFIG.update(freeze_encoder=False)

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

## Classification Head

In [13]:
# Two-layer classification head ([256, 256]) with the encoder explicitly left unfrozen
TRAINING_CONFIG.update(
    freeze_encoder=False,
    classifier_layers=[256, 256],
)

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

![Wav2Vec2 Loss](res/report/wav2vec2_l.png "Wav2Vec2 Loss")
![Wav2Vec2 Accuracy](res/report/wav2vec2_a.png "Wav2Vec2 Accuracy")

## Whisper

In [15]:
# Whisper baseline: frozen encoder, one 256-dimensional layer in the classification head, no dropout
TRAINING_CONFIG.update(
    feature_encoder="whisper",
    freeze_encoder=True,
    classifier_layers=[256],
    classifier_dropout=0,
)

In [None]:
# Reload the dataset: the pre-encoded dataset on disk is selected by TRAINING_CONFIG["feature_encoder"]
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_SUBSET, df)

# Display the prepared dataset
prepared_ds

## Frozen

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

## Fine-Tuning

In [18]:
# Un-freeze the encoder so the next run fine-tunes it
TRAINING_CONFIG.update(freeze_encoder=False)

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

## Classification Head

In [20]:
# Two-layer classification head ([256, 256]). `freeze_encoder` is set explicitly, as in the
# Wav2Vec2 classification-head cell, so this run does not depend on which cell ran last.
TRAINING_CONFIG["freeze_encoder"] = False
TRAINING_CONFIG["classifier_layers"] = [256, 256]

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

![Whisper Loss](res/report/whisper_l.png "Whisper Loss")
![Whisper Accuracy](res/report/whisper_a.png "Whisper Accuracy")

# Training (Whole Dataset)

In [22]:
# Configuration for the whole-dataset runs: Whisper backbone, encoder not frozen,
# one 256-dimensional classification layer, 3 epochs.
TRAINING_CONFIG = dict(
    epochs=3,
    learning_rate=5e-5,
    warmup=0.0,
    train_batch_size=8,
    eval_batch_size=16,
    feature_encoder="whisper",
    freeze_encoder=False,
    classifier_layers=[256],
    classifier_dropout=0.0,
)

## Genre Classification

In [23]:
# Load (or create and cache) the filtered, split DataFrame for the genre configuration (top_n = TOP_N_GENRES)
df = create_or_load_df(FEATURES_CONFIG_GEN)

# Preview the first rows
df.head()

Loading res/samples_clustered_genre6.csv
16932 examples in DataFrame
split
train    13545
test      1694
valid     1693
dtype: int64


Unnamed: 0,path,duration,id,genre,split
0,01 Hip Hop/Abandoned Brass Stabs.mp3,7.262041,01_Hip_Hop_Abandoned_Brass_Stabs,Hip Hop/RnB,test
1,01 Hip Hop/Against Time Keys.mp3,6.948571,01_Hip_Hop_Against_Time_Keys,Hip Hop/RnB,train
2,01 Hip Hop/Against Time Piano.mp3,6.948571,01_Hip_Hop_Against_Time_Piano,Hip Hop/RnB,train
3,01 Hip Hop/Against Time Sax Sample.mp3,6.948571,01_Hip_Hop_Against_Time_Sax_Sample,Hip Hop/RnB,valid
4,01 Hip Hop/Against Time Staccato Strings.mp3,6.948571,01_Hip_Hop_Against_Time_Staccato_Strings,Hip Hop/RnB,train


In [None]:
# Load the pre-encoded dataset for the configured feature encoder and prepare it with the genre DataFrame
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_GEN, df)

# Display the prepared dataset
prepared_ds

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

![Genre Classification Loss](res/report/genre_l.png "Genre Classification Loss")
![Genre Classification Accuracy](res/report/genre_a.png "Genre Classification Accuracy")

## Category Classification

In [None]:
# Load (or create and cache) the filtered, split DataFrame for the category configuration (top_n = TOP_N_FEATURES)
df = create_or_load_df(FEATURES_CONFIG_CAT)

# Preview the first rows
df.head()

In [None]:
# Load the pre-encoded dataset for the configured feature encoder and prepare it with the category DataFrame
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_CAT, df)

# Display the prepared dataset
prepared_ds

In [None]:
# Train a model using the current TRAINING_CONFIG and prepared_ds
train_split = prepared_ds["train"]
valid_split = prepared_ds["valid"]

run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, train_split)

trainer = get_trainer(
    run_name=run_name,
    model=model,
    training_config=TRAINING_CONFIG,
    train_ds=train_split,
    eval_ds=valid_split,
    output_dir="out",
    env=NOTEBOOK_ENV,
    debug=False,
)
trainer.train()

# Presumably finalizes the run and stores the model under MODELS_DIR_PATH -- confirm in `end_training`
end_training(run_name, trainer, MODELS_DIR_PATH)

![Category Classification Loss](res/report/category_l.png "Category Classification Loss")
![Category Classification Accuracy](res/report/category_a.png "Category Classification Accuracy")