### Audio Classification

In [19]:
!pip install transformers datasets evaluate soundfile librosa accelerate>=0.21.0

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [20]:
from datasets import load_dataset, Audio

# Load the "train" split of the en-US configuration of PolyAI/minds14.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [21]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the
# train/test membership identical on every run, so results are comparable.
minds = minds.train_test_split(test_size=0.2, seed=42)

In [22]:
# Display the resulting DatasetDict (split names, columns and row counts).
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

In [23]:
# Drop the metadata columns, leaving only `audio` and `intent_class`.
minds = minds.remove_columns(
    [
        "path",
        "transcription",
        "english_transcription",
        "lang_id",
    ]
)

In [24]:
# Peek at the first training example: the decoded audio dict and its intent class id.
minds["train"][0]

{'audio': {'path': '/home/mayur/.cache/huggingface/datasets/downloads/extracted/a399e838fe881600e06c31aba8f53d1e4377f7e590e83b31ba7ba3cd604d334e/en-US~APP_ERROR/602ba9fb963e11ccd901cd52.wav',
  'array': array([ 0.        ,  0.        , -0.00024414, ...,  0.00024414,
         -0.00024414, -0.00097656]),
  'sampling_rate': 8000},
 'intent_class': 2}

In [25]:
# Build the lookup tables between intent names and their class ids.
# Ids are stored as strings in both directions.
labels = minds["train"].features["intent_class"].names
label2id = {name: str(index) for index, name in enumerate(labels)}
id2label = {str(index): name for index, name in enumerate(labels)}

In [26]:
# Sanity check: resolve class id 2 to its intent name (keys are strings).
id2label["2"]

'app_error'

In [28]:
from transformers import AutoFeatureExtractor

# Load the feature extractor published with the facebook/wav2vec2-base checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "facebook/wav2vec2-base"
)



In [27]:
# The raw clips report a sampling_rate of 8000; re-declare the audio column
# at 16 kHz, then show the first training example again to confirm the change.
minds = minds.cast_column(
    "audio",
    Audio(sampling_rate=16_000),
)
minds["train"][0]

{'audio': {'path': '/home/mayur/.cache/huggingface/datasets/downloads/extracted/a399e838fe881600e06c31aba8f53d1e4377f7e590e83b31ba7ba3cd604d334e/en-US~APP_ERROR/602ba9fb963e11ccd901cd52.wav',
  'array': array([ 1.74496090e-05,  5.70755801e-05, -1.79835479e-05, ...,
         -7.72525324e-04, -9.36845026e-04, -5.74673410e-04]),
  'sampling_rate': 16000},
 'intent_class': 2}

In [29]:
def preprocess_function(examples):
    """Turn a batch of decoded audio examples into model inputs.

    Args:
        examples: A batch whose ``"audio"`` entry is a sequence of dicts,
            each holding the waveform under the ``"array"`` key.

    Returns:
        The output of ``feature_extractor`` for the batch, called with
        truncation enabled and ``max_length=16000``.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [30]:
# Run preprocess_function over both splits in batches, dropping the raw
# audio column, then rename the target column to "label".
encoded_minds = minds.map(
    preprocess_function,
    remove_columns="audio",
    batched=True,
).rename_column("intent_class", "label")

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [31]:
import evaluate

# Load the accuracy metric; compute_metrics calls accuracy.compute() on it.
accuracy = evaluate.load("accuracy")

In [32]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a set of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against the reference labels.
    """
    class_scores = eval_pred.predictions
    predicted_ids = np.argmax(class_scores, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

In [33]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Load wav2vec2-base for audio classification, sized to the number of intent
# labels and carrying both label lookup tables in its config.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best accuracy when training finishes.
# NOTE(review): the recorded run of this cell shows only 6 optimizer steps in
# total, so logging_steps=10 never emits a training-loss log; consider lowering it.
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    #push_to_hub=True,
)

# The feature extractor is passed as `tokenizer` so the Trainer has it
# available alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Trainer.train() returns a TrainOutput summary (step count, loss, metrics),
# not the network. Bind it to its own name so `model` keeps referring to the
# fine-tuned model instead of being overwritten.
train_result = trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Checkpoint destination directory my_awesome_mind_model/checkpoint-3 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 2.6401984691619873, 'eval_accuracy': 0.05309734513274336, 'eval_runtime': 5.6897, 'eval_samples_per_second': 19.86, 'eval_steps_per_second': 0.703, 'epoch': 0.8}


  0%|          | 0/4 [00:00<?, ?it/s]

Checkpoint destination directory my_awesome_mind_model/checkpoint-6 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 2.6373984813690186, 'eval_accuracy': 0.07964601769911504, 'eval_runtime': 5.2871, 'eval_samples_per_second': 21.373, 'eval_steps_per_second': 0.757, 'epoch': 1.6}
{'train_runtime': 156.2684, 'train_samples_per_second': 5.759, 'train_steps_per_second': 0.038, 'train_loss': 2.6374799410502114, 'epoch': 1.6}


In [None]:
from datasets import load_dataset, Audio

# Reload the en-US train split with the audio column declared at 16 kHz,
# then take the file path of the first example as the inference input.
dataset = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
).cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]

In [35]:
from transformers import pipeline

# Build an audio-classification pipeline from the checkpoint saved at step 6
# of the training run, then classify the selected audio file.
classifier = pipeline(
    "audio-classification",
    model="./my_awesome_mind_model/checkpoint-6/",
)
classifier(audio_file)

[{'score': 0.07709634304046631, 'label': 'cash_deposit'},
 {'score': 0.07569773495197296, 'label': 'pay_bill'},
 {'score': 0.07477951794862747, 'label': 'business_loan'},
 {'score': 0.07359454780817032, 'label': 'joint_account'},
 {'score': 0.07229988276958466, 'label': 'freeze'}]