In [1]:
from datasets import load_dataset, Audio
from transformers import ASTForAudioClassification, AutoFeatureExtractor, Trainer, TrainingArguments, DefaultDataCollator # noqa: F401
import matplotlib.pyplot as plt # noqa: F401

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load AudioMNIST and cast its "audio" column to a 16 kHz Audio feature in one
# chained expression, then read the resulting rate back from the dataset
# features so later cells use the value the dataset actually carries.
ds = load_dataset("gilkeyio/AudioMNIST").cast_column(
    "audio", Audio(sampling_rate=16000)
)
sampling_rate = ds["train"].features["audio"].sampling_rate

In [3]:
# Convert the training split to a pandas DataFrame and preview its first rows.
df = ds["train"].to_pandas()
df.head(5)

Unnamed: 0,speaker_id,audio,digit,gender,accent,age,native_speaker,origin
0,59,{'bytes': b'RIFF 3\x01\x00WAVEfmt \x10\x00\x00...,7,1,German,31.0,False,"Europe, Germany, Berlin"
1,59,{'bytes': b'RIFF\xb0<\x01\x00WAVEfmt \x10\x00\...,7,1,German,31.0,False,"Europe, Germany, Berlin"
2,59,{'bytes': b'RIFFZ\xce\x00\x00WAVEfmt \x10\x00\...,2,1,German,31.0,False,"Europe, Germany, Berlin"
3,59,{'bytes': b'RIFFV\xfb\x00\x00WAVEfmt \x10\x00\...,3,1,German,31.0,False,"Europe, Germany, Berlin"
4,59,{'bytes': b'RIFF\x02\x1a\x01\x00WAVEfmt \x10\x...,9,1,German,31.0,False,"Europe, Germany, Berlin"


In [4]:
# Feature extractor loaded from the same checkpoint name that the model cell
# passes to ASTForAudioClassification.from_pretrained.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

def preprocessing(input):
    """Turn one dataset example into the fields used for training.

    Reads the module-level ``feature_extractor`` and ``sampling_rate``.

    Args:
        input: A single example; ``input["audio"]["array"]`` and
            ``input["digit"]`` are read.

    Returns:
        dict: ``"input_values"`` holds the first element of the extractor's
        ``"input_values"`` output and ``"labels"`` holds the example's digit.
    """
    waveform = input["audio"]["array"]
    extracted = feature_extractor(waveform, sampling_rate=sampling_rate)
    # The extractor output is indexed with [0] to take the entry for this
    # single example.
    first_features = extracted["input_values"][0]
    return {"input_values": first_features, "labels": input["digit"]}

# test_result = preprocessing(ds['train'][0])
# print("Keys:", test_result.keys())
# print("Input type:", type(test_result['input_values']))
# print("Input shape:", test_result['input_values'].shape)
# print("Label:", test_result['labels'])



In [5]:
# Apply `preprocessing` to every example of every split; the returned
# "input_values" and "labels" fields are added to the dataset.
ds = ds.map(function=preprocessing)

In [6]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written, and whether to publish them.
    output_dir="./model",
    # NOTE(review): pushing presumably needs Hub credentials in this
    # environment - confirm before running.
    push_to_hub=True,
    # Run control.
    do_train=True,
    num_train_epochs=5,
    # NOTE(review): these batch sizes look large for this model's inputs -
    # confirm they fit in device memory.
    per_device_train_batch_size=200,
    per_device_eval_batch_size=124,
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # Evaluate and checkpoint on the same per-epoch schedule, then reload the
    # best checkpoint when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [7]:
data_collator = DefaultDataCollator()


def compute_metrics(eval_pred):
    """Compute top-1 accuracy for the Trainer's evaluation loop.

    Without a metric function, evaluation reports only the loss; this adds
    the fraction of correctly classified examples.

    Args:
        eval_pred: The object the Trainer hands to ``compute_metrics``; its
            ``predictions`` and ``label_ids`` attributes are read here.
            Assumes both are array-like with ``argmax``/``mean`` support
            (NumPy arrays) - TODO confirm for the installed version.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the share of examples
        whose highest-scoring class equals the label.
    """
    logits = eval_pred.predictions
    # Defensive: if the model returns several outputs, predictions arrive as a
    # tuple; the class scores are taken from its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted_digits = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_digits == eval_pred.label_ids).mean())}


# Build the trainer around the pretrained AST checkpoint with a 10-class head.
# ignore_mismatched_sizes=True allows loading even though the checkpoint's
# classifier has a different output size; that layer is newly initialised.
trainer = Trainer(
    model=ASTForAudioClassification.from_pretrained(
        "MIT/ast-finetuned-audioset-10-10-0.4593",
        num_labels=10,
        ignore_mismatched_sizes=True,
    ),
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start the fine-tuning run configured by `training_args`; as the last
# expression in the cell, the call's return value is shown as the cell output.
trainer.train()

