In [1]:
from random import randint
import warnings

from datasets import load_dataset, Audio
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
CHECK_POINT = "./models/whisper-large-v3_ADReSSO/checkpoint-16"
MODEL = "whisper-large-v3_ADReSSO"
SAMPLE_DURATION = 30
BATCH_SIZE = 32
HALF_PRECISION = False

In [3]:
feature_extractor = AutoFeatureExtractor.from_pretrained(CHECK_POINT)

preprocess = lambda examples: feature_extractor(
    [i["array"][(n := randint(0, len(i["array"]) - (m := min(len(i["array"]), feature_extractor.sampling_rate*SAMPLE_DURATION)))) : n + m] for i in examples["audio"]],
    sampling_rate=feature_extractor.sampling_rate,
    do_normalize=True,
    # max_length=16_000*args.sample_duration,
    # truncation=True,
)

In [4]:
test = load_dataset("nevikw39/ADReSSo", split="test").cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))

In [5]:
labels = test.features["label"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [6]:
model = AutoModelForAudioClassification.from_pretrained(
    CHECK_POINT, num_labels=num_labels, label2id=label2id, id2label=id2label
)

In [7]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
specificity = evaluate.load("nevikw39/specificity")

In [8]:
training_args = TrainingArguments(
    output_dir=f"./models/{MODEL}",
    per_device_eval_batch_size=BATCH_SIZE,
    push_to_hub_organization="NTHU-ML-2023-team19",
    push_to_hub=True,
    hub_private_repo=True,
)



In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=feature_extractor,
    compute_metrics=lambda eval_pred: (
        accuracy.compute(
            predictions=(pred := np.argmax(eval_pred.predictions, axis=1)),
            references=eval_pred.label_ids,
        ) | f1.compute(
            predictions=pred,
            references=eval_pred.label_ids,
        ) | specificity.compute(
            predictions=pred,
            references=eval_pred.label_ids,
        )
    ),
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [23]:
trainer.evaluate(test.map(preprocess, remove_columns="audio", batched=True, load_from_cache_file=False))

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map: 100%|██████████| 71/71 [00:09<00:00,  7.75 examples/s]


{'eval_loss': 0.4751740097999573,
 'eval_accuracy': 0.8450704225352113,
 'eval_f1': 0.8405797101449276,
 'eval_specificity': 0.8857142857142857,
 'eval_runtime': 16.048,
 'eval_samples_per_second': 4.424,
 'eval_steps_per_second': 0.187}

In [24]:
trainer.save_model(f"./models/{MODEL}")
trainer.push_to_hub()

training_args.bin: 100%|██████████| 4.66k/4.66k [00:00<00:00, 16.2kB/s]


'https://huggingface.co/NTHU-ML-2023-team19/whisper-large-v3_ADRESSO/tree/main/'