In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

# Setup

In [3]:
import pandas as pd

from src.dataset import *
from src.train import *
from src.utils import *

In [20]:
# --- Filesystem layout (every path is built from the relative "res" directory) ---
RES_DIR_PATH = "res"
CSV_PATH = os.path.join(RES_DIR_PATH, "samples_clustered.csv")
AUDIOS_DIR_PATH = os.path.join(RES_DIR_PATH, "mp3_data")
DATASETS_DIR_PATH = os.path.join(RES_DIR_PATH, "datasets")
MODELS_DIR_PATH = os.path.join(RES_DIR_PATH, "models")

# --- Runtime environment and split proportions ---
NOTEBOOK_ENV = "jupyter"
VALID_SIZE = 0.1
TEST_SIZE = 0.1

# --- Label selection: how many of the most frequent values to keep per target ---
TOP_N_GENRES = 6
TOP_N_FEATURES = 9

# Small subset used for the backbone comparison.
# NOTE(review): `samples: None` presumably disables sub-sampling — confirm in `filter_df`.
FEATURES_CONFIG_SUBSET = {"genre": {"top_n": 3, "samples": 1000}}

# Single-target configurations.
FEATURES_CONFIG_GEN = {"genre": {"top_n": TOP_N_GENRES, "samples": None}}
FEATURES_CONFIG_CAT = {"category": {"top_n": TOP_N_FEATURES, "samples": None}}

# Both targets at once (independent inner dicts, not shared with the configs above).
FEATURES_CONFIG_MULTI = {
    "genre": {"top_n": TOP_N_GENRES, "samples": None},
    "category": {"top_n": TOP_N_FEATURES, "samples": None},
}

## Backbones

The two considered backbones are [Wav2Vec2](https://arxiv.org/abs/2006.11477) and [Whisper](https://cdn.openai.com/papers/whisper.pdf).

Both models are used through the [Hugging Face Transformers](https://huggingface.co/docs/transformers) library.

The implementation of the **Wav2Vec2** classifier follows that of the [Wav2Vec2ForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForSequenceClassification) class, adding support for a custom classification head.

Regarding **Whisper**, I took the outputs of the [WhisperEncoder](https://huggingface.co/docs/transformers/model_doc/whisper) class and used them directly, without further processing.

## Fine-tuning

For both backbones, freezing means disabling gradient computation for the entire encoder.

## Classifier

The classifier is implemented as an MLP with a configurable number of layers and hidden dimensions.
Each layer is followed by an optional Dropout layer and a ReLU activation.

## Multi-task

TODO

# Training 1

In [21]:
# Hyper-parameters for the "Training 1" experiments.
TRAINING_CONFIG = {
    # Optimisation and batching settings; no later cell in this section changes them.
    "epochs": 20,
    "learning_rate": 5e-5,
    "warmup": 0.0,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    # Placeholders: each experiment's configuration cell overwrites these before training.
    "feature_encoder": None,
    "freeze_encoder": None,
    "classifier_layers": None, 
    "classifier_dropout": None,
}

In [22]:
def create_or_load_df(features_config):
    """Return the filtered, split DataFrame for `features_config`, backed by a CSV cache.

    The cache location is whatever `get_csv_name(features_config, CSV_PATH)` returns.
    If that file exists it is read back; otherwise `CSV_PATH` is read, passed through
    `filter_df` (with `remove_nones=True`) and written to the cache location.
    A frame without a "split" column is then partitioned with `split_df`, using
    `VALID_SIZE` and `TEST_SIZE`, and the cache file is rewritten.

    Args:
        features_config: Feature-selection configuration, forwarded to
            `get_csv_name` and `filter_df`.

    Returns:
        The resulting DataFrame; its per-split row counts are printed before returning.
    """
    cache_path = get_csv_name(features_config, CSV_PATH)

    if not os.path.exists(cache_path):
        # Cache miss: filter the full CSV according to the configuration
        # (dropping rows with null values) and persist the result.
        frame = filter_df(
            pd.read_csv(CSV_PATH),
            remove_nones=True,
            features_config=features_config,
        )
        frame.to_csv(cache_path, index=False)
    else:
        # Cache hit: the subset is already on disk, load it directly.
        print(f"Loading {cache_path}")
        frame = pd.read_csv(cache_path)

    print(f"{len(frame)} examples in DataFrame")

    # A frame without a "split" column has not been partitioned yet: split it
    # into three partitions and save the result so the split is reused later.
    if "split" not in frame.columns:
        frame = split_df(frame, validation_size=VALID_SIZE, test_size=TEST_SIZE)
        frame.to_csv(cache_path, index=False)

    print(frame.value_counts("split"))
    return frame

In [33]:
# Create a function for loading the dataset for the requested model

def load_and_prepare_ds(training_config, feature_config, df, clustered=True):
    """Load the pre-encoded dataset for the configured encoder and prepare it for training.

    The dataset is read from `DATASETS_DIR_PATH/ds-<feature_encoder>-full-encoded`,
    passed through `add_audio_column` (with `AUDIOS_DIR_PATH`) and finally through
    `prepare_ds` together with `df` and `feature_config`.

    Args:
        training_config: Mapping that must contain the "feature_encoder" key.
        feature_config: Feature configuration forwarded to `prepare_ds`.
        df: DataFrame forwarded to `prepare_ds`.
        clustered: Forwarded unchanged to `prepare_ds`.

    Returns:
        Whatever `prepare_ds` returns (called with `fixed_mapping=None, save=False`).
    """
    encoder_name = training_config['feature_encoder']
    encoded_dataset_path = os.path.join(DATASETS_DIR_PATH, f"ds-{encoder_name}-full-encoded")

    encoded_ds = datasets.load_from_disk(encoded_dataset_path)
    # Only the encoder name is handed to `add_audio_column`, not the whole training config.
    ds_with_audio = add_audio_column(
        encoded_ds,
        audios_dir_path=AUDIOS_DIR_PATH,
        training_config={"feature_encoder": encoder_name},
    )
    return prepare_ds(
        ds_with_audio,
        df,
        feature_config,
        clustered=clustered,
        fixed_mapping=None,
        save=False,
    )

In [24]:
# Create the DataFrame for the small subset described by FEATURES_CONFIG_SUBSET
# (3 most frequent genres), or load it from its cached CSV if it already exists.
df = create_or_load_df(FEATURES_CONFIG_SUBSET)

20636 total samples in dataset
24 features considered
Considering only rows without non-available values, 0 samples discarded
Keeping only the 3 most frequent values of genre
Sampled 999 items
Applying stratified sampling to the database
999 total samples left
999 examples in DataFrame
split
train    799
test     100
valid    100
dtype: int64


## Wav2Vec2

In [8]:
# Baseline Wav2Vec2 setup: frozen encoder, classifier_layers=[256], dropout 0.
TRAINING_CONFIG.update({
    "feature_encoder": "wav2vec2",
    "freeze_encoder": True,
    "classifier_layers": [256],
    "classifier_dropout": 0,
})

In [9]:
# Load the encoded dataset for the encoder selected in TRAINING_CONFIG and prepare
# it against the subset DataFrame. The feature configuration must be
# FEATURES_CONFIG_SUBSET, the constant actually defined in the setup cell.
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_SUBSET, df)

prepared_ds

Removing extra columns from dataset
Extracting valid split
Extracting train split
Extracting test split
Create `ClassLabels` for target classes




Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/799 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    valid: Dataset({
        features: ['label', 'id', 'duration', 'input_values'],
        num_rows: 100
    })
    train: Dataset({
        features: ['label', 'id', 'duration', 'input_values'],
        num_rows: 799
    })
    test: Dataset({
        features: ['label', 'id', 'duration', 'input_values'],
        num_rows: 100
    })
})

In [None]:
# Wav2Vec2 run with the frozen encoder: uses TRAINING_CONFIG as set in the
# Wav2Vec2 configuration cell above (assumes cells are executed in order).
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)

## Fine-Tuning

In [10]:
# Unfreeze the encoder for the fine-tuning run; every other setting is kept as is.
TRAINING_CONFIG["freeze_encoder"] = False 

In [None]:
# Wav2Vec2 fine-tuning run: same pipeline as the frozen run, but TRAINING_CONFIG
# now has freeze_encoder=False (assumes cells are executed in order).
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)

## Classification Head

In [10]:
# Classification-head experiment: encoder left unfrozen, classifier_layers=[256, 256].
TRAINING_CONFIG.update({
    "freeze_encoder": False,
    "classifier_layers": [256, 256],
})

In [None]:
# Wav2Vec2 run with the larger classification head (classifier_layers=[256, 256],
# encoder not frozen), as configured in the cell above.
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)

## Whisper

In [12]:
# Baseline Whisper setup: frozen encoder, classifier_layers=[256], dropout 0.
TRAINING_CONFIG.update({
    "feature_encoder": "whisper",
    "freeze_encoder": True,
    "classifier_layers": [256],
    "classifier_dropout": 0,
})

In [13]:
# Load the encoded dataset for the encoder selected in TRAINING_CONFIG and prepare
# it against the subset DataFrame. The feature configuration must be
# FEATURES_CONFIG_SUBSET, the constant actually defined in the setup cell.
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_SUBSET, df)

prepared_ds

loading configuration file preprocessor_config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/ada5a5d516772e41f9aeb0f984df6ecc4620001f/preprocessor_config.json
Feature extractor WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "mel_filters": [
    [
      -0.0,
      0.02486259490251541,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0

Removing extra columns from dataset
Extracting valid split
Extracting train split
Extracting test split
Create `ClassLabels` for target classes


Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/799 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    valid: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 100
    })
    train: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 799
    })
    test: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 100
    })
})

## Frozen

In [None]:
# Whisper run with the frozen encoder: uses TRAINING_CONFIG as set in the
# Whisper configuration cell above (assumes cells are executed in order).
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)

## Fine-Tuning

In [13]:
# Unfreeze the encoder for the fine-tuning run; every other setting is kept as is.
TRAINING_CONFIG["freeze_encoder"] = False

In [None]:
# Whisper fine-tuning run: same pipeline as the frozen run, but TRAINING_CONFIG
# now has freeze_encoder=False (assumes cells are executed in order).
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)

## Classification Head

In [15]:
# Classification-head experiment for Whisper: classifier_layers=[256, 256] on an
# unfrozen encoder. freeze_encoder is set explicitly, as in the Wav2Vec2
# "Classification Head" cell, so this cell gives the same configuration even if
# the "Fine-Tuning" cell was skipped or the "Frozen" section was re-run last.
TRAINING_CONFIG["freeze_encoder"] = False
TRAINING_CONFIG["classifier_layers"] = [256, 256]

In [None]:
# Whisper run with the larger classification head (classifier_layers=[256, 256]),
# as configured in the cell above.
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)

# Training 2

In [25]:
# Fixed configuration for the "Training 2" runs. This rebinds TRAINING_CONFIG,
# replacing the dictionary mutated throughout "Training 1".
TRAINING_CONFIG = {
    # Optimisation and batching settings (same values as in "Training 1").
    "epochs": 20,
    "learning_rate": 5e-5,
    "warmup": 0.0,
    "train_batch_size": 8,
    "eval_batch_size": 16,
    # Model setup: Whisper backbone, encoder not frozen, classifier_layers=[256], dropout 0.0.
    "feature_encoder": "whisper",
    "freeze_encoder": False,
    "classifier_layers": [256], 
    "classifier_dropout": 0.0,
}

## Genre Classification

In [26]:
# Create the DataFrame for the genre task described by FEATURES_CONFIG_GEN
# (TOP_N_GENRES most frequent genres), or load it from its cached CSV if it already exists.
df = create_or_load_df(FEATURES_CONFIG_GEN)

# Preview the first rows.
df.head()

20636 total samples in dataset
24 features considered
Considering only rows without non-available values, 0 samples discarded
Keeping only the 6 most frequent values of genre
16932 total samples left
16932 examples in DataFrame
split
train    13545
test      1694
valid     1693
dtype: int64


Unnamed: 0,path,duration,id,genre,split
0,01 Hip Hop/Abandoned Brass Stabs.mp3,7.262041,01_Hip_Hop_Abandoned_Brass_Stabs,Hip Hop/RnB,train
1,01 Hip Hop/Against Time Keys.mp3,6.948571,01_Hip_Hop_Against_Time_Keys,Hip Hop/RnB,train
2,01 Hip Hop/Against Time Piano.mp3,6.948571,01_Hip_Hop_Against_Time_Piano,Hip Hop/RnB,test
3,01 Hip Hop/Against Time Sax Sample.mp3,6.948571,01_Hip_Hop_Against_Time_Sax_Sample,Hip Hop/RnB,train
4,01 Hip Hop/Against Time Staccato Strings.mp3,6.948571,01_Hip_Hop_Against_Time_Staccato_Strings,Hip Hop/RnB,train


In [42]:
# Load the encoded dataset for the encoder named in TRAINING_CONFIG and prepare it
# against the genre DataFrame using FEATURES_CONFIG_GEN.
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_GEN, df)

# Display the resulting dataset as the cell output.
prepared_ds

Removing extra columns from dataset
Mapping features clusters


  0%|          | 0/20636 [00:00<?, ?ex/s]

Extracting train split
Extracting test split
Extracting valid split
Create `ClassLabels` for target classes
{'genre': ClassLabel(names=['Electronic', 'Hip Hop/RnB', 'House', 'Orchestral', 'Rock/Blues', 'World/Ethnic'], id=None)}


Casting the dataset:   0%|          | 0/14 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 13545
    })
    test: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 1694
    })
    valid: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 1693
    })
})

In [None]:
# Genre classification run with the "Training 2" TRAINING_CONFIG on the dataset
# prepared in the cell above.
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)

## Category Classification

In [None]:
# Create the DataFrame for the category task described by FEATURES_CONFIG_CAT
# (TOP_N_FEATURES most frequent categories), or load it from its cached CSV if it already exists.
df = create_or_load_df(FEATURES_CONFIG_CAT)

# Preview the first rows.
df.head()

In [None]:
# Load the encoded dataset for the encoder named in TRAINING_CONFIG and prepare it
# against the category DataFrame using FEATURES_CONFIG_CAT.
prepared_ds = load_and_prepare_ds(TRAINING_CONFIG, FEATURES_CONFIG_CAT, df)

# Display the resulting dataset as the cell output.
prepared_ds

In [None]:
# Category classification run with the "Training 2" TRAINING_CONFIG on the dataset
# prepared in the cell above.
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

# The trainer receives the "train" split as train_ds and the "valid" split as eval_ds.
trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    output_dir="out",
    debug=False,
    env=NOTEBOOK_ENV,
)

trainer.train()
# NOTE(review): end_training presumably saves the run under MODELS_DIR_PATH — confirm in src.train.
end_training(run_name, trainer, MODELS_DIR_PATH)