In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

# Setup

In [2]:
import pandas as pd

from src.dataset import *
from src.train import *
from src.utils import *

In [3]:
# Everything the notebook reads or writes lives under one resource root.
RES_DIR_PATH = "res"

# Sub-directories of the resource root: raw audio, saved models, encoded datasets.
AUDIOS_DIR_PATH, MODELS_DIR_PATH, DATASETS_DIR_PATH = (
    os.path.join(RES_DIR_PATH, sub_dir)
    for sub_dir in ("mp3_data", "models", "datasets")
)

# Value passed as `env=` to get_trainer in the training cells.
NOTEBOOK_ENV = "jupyter"

# Unfiltered samples table; the filtered file name is derived from it via get_csv_name.
CSV_PATH = os.path.join(RES_DIR_PATH, "samples.csv")

# Per-feature selection settings consumed by get_csv_name / filter_df / prepare_ds.
# NOTE(review): presumably "keep the 3 most frequent genres, 1000 samples in total"
# (the recorded filtered file is samples_genre3s1000.csv) — confirm in src.dataset.
FEATURES_CONFIG = {
    "genre": {
        "top_n": 3,
        "samples": 1000,
    },
}

# Fractions handed to split_df as validation_size / test_size.
VALID_SIZE = TEST_SIZE = 0.1

## Backbones

## Fine-tuning

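Thanks to the HuggingFace library, fine-tuning only requires leaving the pretrained encoder unfrozen (`freeze_encoder = False` in `TRAINING_CONFIG`), so that its weights are updated together with the classification head.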

## Classifier

## Multi-task

# Training

In [4]:
# Hyper-parameters shared by every run in this notebook. The four `None`
# entries are placeholders that each experiment section fills in before
# calling get_model / get_trainer.
TRAINING_CONFIG = dict(
    # Optimisation schedule (the training logs report 20 epochs, batch sizes 8 / 16).
    epochs=20,
    learning_rate=5e-5,
    warmup=0.0,
    train_batch_size=8,
    eval_batch_size=16,
    # Set per experiment: encoder name, whether it is frozen, and the classifier head.
    feature_encoder=None,
    freeze_encoder=None,
    classifier_layers=None,
    classifier_dropout=None,
)

In [5]:
# DISABLED CELL — kept for reference only; nothing below is executed.
# It would build one prepared dataset per encoder ("wav2vec2", "whisper") and
# collect them in `encoder_ds`.
# NOTE(review): the prepare_ds call here passes an extra positional `0.2` that
# the live prepare_ds calls in this notebook do not — confirm the current
# signature before re-enabling, or delete this cell.
#
# filtered_csv_path = get_csv_name(FEATURES_CONFIG, CSV_PATH)
# encoder_ds = {}

# for encoder in ["wav2vec2", "whisper"]:
#     encoded_dataset_path = os.path.join(DATASETS_DIR_PATH, f"ds-{encoder}-full-encoded")

#     df = pd.read_csv(filtered_csv_path)
#     ds = datasets.load_from_disk(encoded_dataset_path)
#     ds = add_audio_column(ds, audios_dir_path=AUDIOS_DIR_PATH, training_config={"feature_encoder": encoder})
#     encoder_ds[encoder] = prepare_ds(ds, df, FEATURES_CONFIG, 0.2, fixed_mapping=None, save=False)

In [6]:
# Load the samples selected by FEATURES_CONFIG, caching the filtered subset on
# disk so later runs skip the filtering step.
filtered_csv_path = get_csv_name(FEATURES_CONFIG, CSV_PATH)

if os.path.exists(filtered_csv_path):
    # Cached subset from a previous run.
    print(f"Loading {filtered_csv_path}")
    df = pd.read_csv(filtered_csv_path)
else:
    # First run for this FEATURES_CONFIG: filter the full table and cache the result.
    df = pd.read_csv(CSV_PATH)
    df = filter_df(
        df,
        remove_nones=False,
        features_config=FEATURES_CONFIG,
    )
    df.to_csv(filtered_csv_path, index=False)

print(f"{len(df)} examples in DataFrame")

if "split" not in df.columns:
    df = split_df(df, validation_size=VALID_SIZE, test_size=TEST_SIZE)
    # Persist the split alongside the filtered rows. This previously wrote to
    # CSV_PATH, which replaced the full, unfiltered samples table with the
    # filtered subset and left the cached file without a "split" column.
    df.to_csv(filtered_csv_path, index=False)

print(df.value_counts("split"))

Loading res/samples_genre3s1000.csv
999 examples in DataFrame
split
train    799
test     100
valid    100
dtype: int64


## Baseline

In [7]:
# Baseline experiment: wav2vec2 encoder kept frozen, classifier_layers=[256], no dropout.
TRAINING_CONFIG.update(
    feature_encoder="wav2vec2",
    freeze_encoder=True,
    classifier_layers=[256],
    classifier_dropout=0,
)

In [8]:
# Location of the pre-encoded dataset for the encoder selected in TRAINING_CONFIG.
encoded_dataset_path = os.path.join(
    DATASETS_DIR_PATH,
    f"ds-{TRAINING_CONFIG['feature_encoder']}-full-encoded",
)

# Load it, then let the project helper attach the audio column for that encoder.
ds = datasets.load_from_disk(encoded_dataset_path)
ds = add_audio_column(
    ds,
    audios_dir_path=AUDIOS_DIR_PATH,
    training_config={"feature_encoder": TRAINING_CONFIG['feature_encoder']},
)

# Combine with the split DataFrame; the recorded output shows a DatasetDict
# with train / test / valid splits.
prepared_ds = prepare_ds(
    ds,
    df,
    FEATURES_CONFIG,
    fixed_mapping=None,
    save=False,
)

prepared_ds

Removing extra columns from dataset
Extracting train split
Extracting test split
Extracting valid split
Create `ClassLabels` for target classes




Casting the dataset:   0%|          | 0/799 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'id', 'duration', 'input_values'],
        num_rows: 799
    })
    test: Dataset({
        features: ['label', 'id', 'duration', 'input_values'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['label', 'id', 'duration', 'input_values'],
        num_rows: 100
    })
})

In [17]:
# Baseline run: build a model from the current TRAINING_CONFIG, train it on the
# train split and evaluate on the valid split.
# NOTE(review): the recorded output below saved to res/models/baseline-test,
# which does not match this run name — the output looks stale; re-run to refresh.
run_name = "baseline"
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    feature_extractor=None,
    output_dir="out",  # intermediate checkpoints appear as out/checkpoint-* in the log
    debug=True,
    env=NOTEBOOK_ENV,
)

trainer.train()
# Hand the trained trainer to the project helper together with the models
# directory; the log shows the final model being saved under res/models/.
end_training(run_name, trainer, MODELS_DIR_PATH)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceMultiClassification: ['project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.bias', 'quantizer.weight_proj.weight', 'project_hid.weight', 'project_q.bias', 'quantizer.codevectors']
- This IS expected if you are initializing Wav2Vec2ForSequenceMultiClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceMultiClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceMultiClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'projecto

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 799
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 394499
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0921,1.075622,0.52
2,1.0715,1.050573,0.52
3,1.0485,1.038513,0.53
4,1.0279,1.017008,0.51
5,1.0121,1.002827,0.53
6,0.9906,0.994052,0.53
7,0.9775,0.982229,0.56
8,0.9622,0.970693,0.56
9,0.9506,0.959028,0.57
10,0.9438,0.961429,0.56


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to out/checkpoint-100
Configuration saved in out/checkpoint-100/config.json
Model weights saved in out/checkpoint-100/pytorch_model.bin
Feature extractor saved in out/checkpoint-100/preprocessor_config.json
Deleting older checkpoint [out/checkpoint-400] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num ex

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▂▂▂▁▂▂▅▅▅▅▄▇▇█▇▇▇███
eval/loss,█▇▆▅▄▄▃▃▂▂▃▂▁▁▁▁▁▁▁▁
eval/runtime,▂▁█████▁█▁█▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▇█▁▁▁▁▁█▁█▁█████████
eval/steps_per_second,▇█▁▁▁▁▁█▁█▁█████████
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/loss,██▇▆▆▅▄▄▄▃▃▂▂▂▂▁▁▂▂▁▁
train/total_flos,▁

0,1
eval/accuracy,0.61
eval/loss,0.93327
eval/runtime,2.302
eval/samples_per_second,43.44
eval/steps_per_second,3.041
train/epoch,20.0
train/global_step,2000.0
train/learning_rate,0.0
train/loss,0.8832
train/total_flos,1.2353545257109258e+18


Saving model checkpoint to res/models/baseline-test
Configuration saved in res/models/baseline-test/config.json
Model weights saved in res/models/baseline-test/pytorch_model.bin
Feature extractor saved in res/models/baseline-test/preprocessor_config.json


## Wav2Vec2

## Fine-Tuning

In [10]:
# Fine-tuning: stop freezing the encoder (the recorded run reports ~94.8M
# trainable parameters instead of ~0.39M for the frozen baseline).
TRAINING_CONFIG.update(freeze_encoder=False)

In [11]:
# wav2vec2 fine-tuning run. The run name is derived from the current config
# (recorded run: wav2vec2-non-frozen-c256-d0-<timestamp>).
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],  # metrics in the log are computed on the valid split
    training_config=TRAINING_CONFIG,
    feature_extractor=None,
    output_dir="out",  # intermediate checkpoints appear as out/checkpoint-* in the log
    debug=True,
    env=NOTEBOOK_ENV,
)

trainer.train()
# The log shows the final model being saved under res/models/<run_name>.
end_training(run_name, trainer, MODELS_DIR_PATH)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceMultiClassification: ['quantizer.weight_proj.bias', 'project_hid.bias', 'project_q.bias', 'quantizer.weight_proj.weight', 'project_q.weight', 'project_hid.weight', 'quantizer.codevectors']
- This IS expected if you are initializing Wav2Vec2ForSequenceMultiClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceMultiClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceMultiClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.bias', 'classifier.

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 799
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 94766211
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0874,1.030346,0.56
2,0.9056,0.796369,0.7
3,0.769,0.925771,0.63
4,0.7191,1.003863,0.65
5,0.6482,0.744893,0.7
6,0.6997,0.776893,0.71
7,0.5674,0.828011,0.7
8,0.4986,0.842151,0.7
9,0.4219,0.913023,0.73
10,0.3503,0.951237,0.71


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to out/checkpoint-100
Configuration saved in out/checkpoint-100/config.json
Model weights saved in out/checkpoint-100/pytorch_model.bin
Feature extractor saved in out/checkpoint-100/preprocessor_config.json
Deleting older checkpoint [out/checkpoint-1700] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num e

VBox(children=(Label(value='0.002 MB of 0.030 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.079234…

0,1
eval/accuracy,▁▅▃▃▅▅▅▅▆▅▆▇▇▇█▇██▇█
eval/loss,█▂▅▇▁▂▃▃▅▆▄▅▅▆▅▇▅▆█▆
eval/runtime,▁████▁▁██▁▁██▁▁▁█▃▁▁
eval/samples_per_second,▇▁▁▁▁██▁▁██▁▁▇██▁▄██
eval/steps_per_second,▇▁▁▁▁██▁▁██▁▁▇██▁▄██
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/loss,██▇▆▅▅▅▄▄▄▃▃▂▂▂▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/accuracy,0.82
eval/loss,0.95187
eval/runtime,2.2352
eval/samples_per_second,44.738
eval/steps_per_second,3.132
train/epoch,20.0
train/global_step,2000.0
train/learning_rate,0.0
train/loss,0.0467
train/total_flos,1.2322520770765046e+18


Saving model checkpoint to res/models/wav2vec2-non-frozen-c256-d0-20230222-223307
Configuration saved in res/models/wav2vec2-non-frozen-c256-d0-20230222-223307/config.json
Model weights saved in res/models/wav2vec2-non-frozen-c256-d0-20230222-223307/pytorch_model.bin
Feature extractor saved in res/models/wav2vec2-non-frozen-c256-d0-20230222-223307/preprocessor_config.json


## Classification Head

In [9]:
# Classification-head experiment: frozen wav2vec2 encoder with a [256, 256] head.
# The encoder is re-frozen explicitly because the preceding fine-tuning section
# leaves freeze_encoder=False. Without this line a top-to-bottom run would train
# a non-frozen model, whereas the recorded run is wav2vec2-frozen-c256-256-d0
# (460291 trainable parameters).
TRAINING_CONFIG["freeze_encoder"] = True
TRAINING_CONFIG["classifier_layers"] = [256, 256]

In [20]:
%env WANDB_MODE=offline

env: WANDB_MODE=offline


In [10]:
# wav2vec2 classification-head run (recorded run: wav2vec2-frozen-c256-256-d0-<timestamp>,
# 460291 trainable parameters).
# NOTE(review): the recorded run used a frozen encoder; make sure
# TRAINING_CONFIG["freeze_encoder"] is True when this cell runs to reproduce it.
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],  # metrics in the log are computed on the valid split
    training_config=TRAINING_CONFIG,
    feature_extractor=None,
    output_dir="out",  # intermediate checkpoints appear as out/checkpoint-* in the log
    debug=True,
    env=NOTEBOOK_ENV,
)

trainer.train()
# The log shows the final model being saved under res/models/<run_name>.
end_training(run_name, trainer, MODELS_DIR_PATH)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceMultiClassification: ['project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.weight', 'project_q.bias', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceMultiClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceMultiClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceMultiClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'head.lay

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 799
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 460291
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0956,1.091925,0.38
2,1.0827,1.081119,0.37
3,1.0557,1.052241,0.44
4,1.02,1.041526,0.4
5,0.9819,1.015952,0.45
6,0.9489,1.005291,0.45
7,0.9237,0.990533,0.44
8,0.8976,0.982548,0.44
9,0.8777,0.976734,0.47
10,0.8571,0.946517,0.5


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to out/checkpoint-100
Configuration saved in out/checkpoint-100/config.json
Model weights saved in out/checkpoint-100/pytorch_model.bin
Feature extractor saved in out/checkpoint-100/preprocessor_config.json
Deleting older checkpoint [out/checkpoint-1700] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceMultiClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `Wav2Vec2ForSequenceMultiClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num e

0,1
eval/accuracy,▁▁▃▂▃▃▃▃▄▅▅▅▇▇▇█████
eval/loss,██▇▆▅▅▅▄▄▃▂▂▂▂▁▁▁▁▁▁
eval/runtime,█▁███████▁▁▁██▁▁▁▁▁▁
eval/samples_per_second,▁█▁▁▁▁▁▁▁███▁▁██████
eval/steps_per_second,▁█▁▁▁▁▁▁▁███▁▁██████
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/loss,███▇▆▅▅▄▃▃▃▂▂▂▂▂▁▁▁▁▁
train/total_flos,▁

0,1
eval/accuracy,0.61
eval/loss,0.88276
eval/runtime,2.1635
eval/samples_per_second,46.221
eval/steps_per_second,3.235
train/epoch,20.0
train/global_step,2000.0
train/learning_rate,0.0
train/loss,0.7875
train/total_flos,1.2432971999006264e+18


Saving model checkpoint to res/models/wav2vec2-frozen-c256-256-d0-20230222-230801
Configuration saved in res/models/wav2vec2-frozen-c256-256-d0-20230222-230801/config.json
Model weights saved in res/models/wav2vec2-frozen-c256-256-d0-20230222-230801/pytorch_model.bin
Feature extractor saved in res/models/wav2vec2-frozen-c256-256-d0-20230222-230801/preprocessor_config.json


## Whisper

In [11]:
# Whisper experiments start from the same setup as the baseline:
# frozen encoder, classifier_layers=[256], no dropout.
TRAINING_CONFIG.update(
    feature_encoder="whisper",
    freeze_encoder=True,
    classifier_layers=[256],
    classifier_dropout=0,
)

In [12]:
# Location of the pre-encoded dataset for the encoder selected in TRAINING_CONFIG
# (Whisper at this point in the notebook).
encoded_dataset_path = os.path.join(
    DATASETS_DIR_PATH,
    f"ds-{TRAINING_CONFIG['feature_encoder']}-full-encoded",
)

# Load it, then let the project helper attach the audio column for that encoder.
ds = datasets.load_from_disk(encoded_dataset_path)
ds = add_audio_column(
    ds,
    audios_dir_path=AUDIOS_DIR_PATH,
    training_config={"feature_encoder": TRAINING_CONFIG['feature_encoder']},
)

# Rebinds `prepared_ds`: from here on the training cells use the Whisper
# features (`input_features` in the recorded output).
prepared_ds = prepare_ds(
    ds,
    df,
    FEATURES_CONFIG,
    fixed_mapping=None,
    save=False,
)

prepared_ds

loading configuration file preprocessor_config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/ada5a5d516772e41f9aeb0f984df6ecc4620001f/preprocessor_config.json
Feature extractor WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "mel_filters": [
    [
      -0.0,
      0.02486259490251541,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0

Removing extra columns from dataset
Extracting train split
Extracting test split
Extracting valid split
Create `ClassLabels` for target classes


Casting the dataset:   0%|          | 0/799 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 799
    })
    test: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['label', 'id', 'duration', 'input_features'],
        num_rows: 100
    })
})

## Frozen

In [30]:
# Whisper run with the encoder frozen (as configured in the Whisper setup cell).
# NOTE(review): the recorded output for this cell stops at the model download,
# so no metrics were captured for this run — re-run to record them.
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],
    training_config=TRAINING_CONFIG,
    feature_extractor=None,
    output_dir="out",
    debug=True,
    env=NOTEBOOK_ENV,
)

trainer.train()
# Hand the trained trainer to the project helper together with the models directory.
end_training(run_name, trainer, MODELS_DIR_PATH)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/ada5a5d516772e41f9aeb0f984df6ecc4620001f/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-tiny",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_lengt

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/151M [00:00<?, ?B/s]

## Fine-Tuning

In [13]:
# Fine-tuning: stop freezing the Whisper encoder (the recorded run reports
# ~8.3M trainable parameters).
TRAINING_CONFIG.update(freeze_encoder=False)

In [14]:
# Whisper fine-tuning run. The run name is derived from the current config
# (recorded run: whisper-non-frozen-c256-d0-<timestamp>).
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],  # metrics in the log are computed on the valid split
    training_config=TRAINING_CONFIG,
    feature_extractor=None,
    output_dir="out",  # intermediate checkpoints appear as out/checkpoint-* in the log
    debug=True,
    env=NOTEBOOK_ENV,
)

trainer.train()
# The log shows the final model being saved under res/models/<run_name>.
end_training(run_name, trainer, MODELS_DIR_PATH)

loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/ada5a5d516772e41f9aeb0f984df6ecc4620001f/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-tiny",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_lengt

PyTorch: setting up devices
The following columns in the training set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 799
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 8307715
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8029,0.664767,0.76
2,0.4566,0.56712,0.81
3,0.3793,0.67337,0.84
4,0.2618,0.751586,0.84
5,0.1699,0.883249,0.82
6,0.1026,0.851813,0.82
7,0.0344,0.850684,0.83
8,0.0141,1.135965,0.83
9,0.0255,0.973529,0.85
10,0.0091,1.0089,0.83


The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to out/checkpoint-100
Configuration saved in out/checkpoint-100/config.json
Model weights saved in out/checkpoint-100/pytorch_model.bin
Feature extractor saved in out/checkpoint-100/preprocessor_config.json
Deleting older checkpoint [out/checkpoint-1700] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch si

VBox(children=(Label(value='0.002 MB of 0.013 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.189627…

0,1
eval/accuracy,▁▄▆▆▅▅▅▅▆▅▇█████████
eval/loss,▂▁▂▃▅▅▄█▆▆▅▅▅▅▆▆▆▆▆▆
eval/runtime,▁████▇▇█▇▇▇▇▇█▇▇▇▇█▁
eval/samples_per_second,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
eval/steps_per_second,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/loss,█▆▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/accuracy,0.88
eval/loss,0.96094
eval/runtime,1.7326
eval/samples_per_second,57.716
eval/steps_per_second,4.04
train/epoch,20.0
train/global_step,2000.0
train/learning_rate,0.0
train/loss,0.0001
train/total_flos,5.9305346736e+16


Saving model checkpoint to res/models/whisper-non-frozen-c256-d0-20230222-232339
Configuration saved in res/models/whisper-non-frozen-c256-d0-20230222-232339/config.json
Model weights saved in res/models/whisper-non-frozen-c256-d0-20230222-232339/pytorch_model.bin
Feature extractor saved in res/models/whisper-non-frozen-c256-d0-20230222-232339/preprocessor_config.json


## Classification Head

In [15]:
# Classification-head experiment for Whisper: switch to a [256, 256] head.
# freeze_encoder is left as set by the fine-tuning section (the recorded run is
# whisper-non-frozen-c256-256-d0).
TRAINING_CONFIG.update(classifier_layers=[256, 256])

In [16]:
# Whisper classification-head run. The run name is derived from the current
# config (recorded run: whisper-non-frozen-c256-256-d0-<timestamp>).
run_name = get_run_name(TRAINING_CONFIG)
model = get_model(TRAINING_CONFIG, prepared_ds["train"])

trainer = get_trainer(
    run_name=run_name,
    model=model,
    train_ds=prepared_ds["train"],
    eval_ds=prepared_ds["valid"],  # metrics in the log are computed on the valid split
    training_config=TRAINING_CONFIG,
    feature_extractor=None,
    output_dir="out",  # intermediate checkpoints appear as out/checkpoint-* in the log
    debug=True,
    env=NOTEBOOK_ENV,
)

trainer.train()
# The log shows the final model being saved under res/models/<run_name>.
end_training(run_name, trainer, MODELS_DIR_PATH)

loading configuration file config.json from cache at /home/alesssandros/.cache/huggingface/hub/models--openai--whisper-tiny/snapshots/ada5a5d516772e41f9aeb0f984df6ecc4620001f/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-tiny",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_lengt

PyTorch: setting up devices
The following columns in the training set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 799
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 8373507
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9561,0.708558,0.78
2,0.6438,0.646819,0.76
3,0.5065,0.612002,0.77
4,0.4243,0.737755,0.79
5,0.3238,0.735815,0.8
6,0.2685,0.720123,0.83
7,0.177,0.676495,0.85
8,0.1353,0.955557,0.84
9,0.0748,0.854587,0.88
10,0.048,0.950574,0.84


The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to out/checkpoint-100
Configuration saved in out/checkpoint-100/config.json
Model weights saved in out/checkpoint-100/pytorch_model.bin
Feature extractor saved in out/checkpoint-100/preprocessor_config.json
Deleting older checkpoint [out/checkpoint-1200] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `WhisperForSequenceClassification.forward` and have been ignored: duration, id. If duration, id are not expected by `WhisperForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch si

0,1
eval/accuracy,▂▁▂▃▃▅▆▆█▆▆▆▆▆▆▆▆▆▆▆
eval/loss,▂▁▁▂▂▂▂▅▄▅▆▇▇▇██████
eval/runtime,█████████▁██▁██▂████
eval/samples_per_second,▁▁▁▁▁▁▁▁▁█▁▁█▁▁▆▁▁▁▁
eval/steps_per_second,▁▁▁▁▁▁▁▁▁█▁▁█▁▁▆▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/loss,█▇▅▄▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/accuracy,0.85
eval/loss,1.20529
eval/runtime,4.7677
eval/samples_per_second,20.975
eval/steps_per_second,1.468
train/epoch,20.0
train/global_step,2000.0
train/learning_rate,0.0
train/loss,0.0002
train/total_flos,5.98099976928e+16


Saving model checkpoint to res/models/whisper-non-frozen-c256-256-d0-20230222-233942
Configuration saved in res/models/whisper-non-frozen-c256-256-d0-20230222-233942/config.json
Model weights saved in res/models/whisper-non-frozen-c256-256-d0-20230222-233942/pytorch_model.bin
Feature extractor saved in res/models/whisper-non-frozen-c256-256-d0-20230222-233942/preprocessor_config.json
