In [2]:
from datasets import load_dataset, Audio

# Fetch the en-US "train" split of the PolyAI/minds14 dataset.
# trust_remote_code=True lets the dataset repository's loading script run locally,
# so only keep it for sources you trust.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
    trust_remote_code=True,
)

Generating train split: 563 examples [00:00, 5923.33 examples/s]


In [3]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, identical on every run.
minds = minds.train_test_split(test_size=0.2, seed=42)
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

In [15]:
# Keep only the "audio" and "intent_class" columns, which is what the recorded
# output of this cell shows. Filtering against the columns that are still present
# keeps the cell safe to re-run after the removal has already happened.
unused_columns = ["path", "transcription", "english_transcription", "lang_id"]
columns_to_drop = [name for name in unused_columns if name in minds["train"].column_names]
if columns_to_drop:
    minds = minds.remove_columns(columns_to_drop)
minds["train"][0]

{'audio': {'path': 'C:\\Users\\kdens\\.cache\\huggingface\\datasets\\downloads\\extracted\\fd08e29aaa3b44a5a0f6cbfccad7533115bed232f7495fea85462e6210e49468\\en-US~BUSINESS_LOAN\\602bacb35f67b421554f6489.wav',
  'array': array([-2.27456912e-05, -3.18052371e-05,  2.38829780e-05, ...,
         -3.88225017e-04, -4.53380460e-04, -3.00488173e-04]),
  'sampling_rate': 16000},
 'intent_class': 5}

In [5]:
# Build both directions of the intent-name <-> class-id mapping.
# Ids are stored as strings in both dictionaries.
labels = minds["train"].features["intent_class"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}
id2label[str(2)]

'app_error'

In [6]:
from transformers import AutoFeatureExtractor
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
# Resample the audio column to the rate the feature extractor is configured for
# instead of repeating a hard-coded 16_000. preprocess_function reads the same
# attribute, so the two stay in sync if the checkpoint is ever swapped.
minds = minds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
minds["train"][0]



{'audio': {'path': 'C:\\Users\\kdens\\.cache\\huggingface\\datasets\\downloads\\extracted\\fd08e29aaa3b44a5a0f6cbfccad7533115bed232f7495fea85462e6210e49468\\en-US~BUSINESS_LOAN\\602bacb35f67b421554f6489.wav',
  'array': array([-2.27456912e-05, -3.18052371e-05,  2.38829780e-05, ...,
         -3.88225017e-04, -4.53380460e-04, -3.00488173e-04]),
  'sampling_rate': 16000},
 'intent_class': 5}

In [7]:
def preprocess_function(examples):
    """Turn a batch of decoded audio clips into feature-extractor inputs.

    Args:
        examples: A batch whose "audio" entry is a sequence of dicts, each
            holding the decoded waveform under the "array" key.

    Returns:
        The output of ``feature_extractor`` for the batch. The call passes the
        extractor's own sampling rate, with ``truncation=True`` and
        ``max_length=16000``.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [8]:
# Run preprocess_function over every split in batches, dropping the raw "audio"
# column, then rename the intent id column to "label".
encoded_minds = (
    minds
    .map(preprocess_function, batched=True, remove_columns="audio")
    .rename_column("intent_class", "label")
)

Map: 100%|██████████| 450/450 [00:07<00:00, 59.27 examples/s]
Map: 100%|██████████| 113/113 [00:01<00:00, 62.28 examples/s]


In [16]:
# Sanity check: show which container type map/rename_column produced.
type(encoded_minds)

datasets.dataset_dict.DatasetDict

In [9]:
import evaluate
# Load the "accuracy" metric once; compute_metrics calls accuracy.compute on it.
accuracy = evaluate.load("accuracy")
import numpy as np


def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level ``accuracy`` metric.

    The predicted class for each row is the arg-max over axis 1 of
    ``eval_pred.predictions``; these are compared against
    ``eval_pred.label_ids``.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions and
        references.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(references=reference_ids, predictions=predicted_ids)

In [10]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Size the classification head to the number of intent classes and pass both
# label mappings to the model. As the warning in this cell's output notes, the
# classifier/projector weights are newly initialised and need training.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration: outputs go under "my_awesome_mind_model", and both
# evaluation and checkpoint saving are scheduled once per epoch.
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,  # 32 * 4 = 128 examples per optimizer step per device
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    warmup_ratio=0.1,
    # NOTE(review): the progress bar in this cell's output shows only 3 steps in
    # total, so logging_steps=10 looks like it never emits a training-loss log
    # during the run — confirm and lower if intermediate losses are wanted.
    logging_steps=10,
    load_best_model_at_end=True,
    # presumably matches the "accuracy" key produced via compute_metrics — verify
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

# Wire the model, the encoded train/test splits, the feature extractor and the
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

# Start fine-tuning. The recorded output ends in a KeyboardInterrupt at step 0 of 3,
# so this saved execution did not complete a training run.
trainer.train()

  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 