In [1]:
%pip install transformers datasets evaluate soundfile librosa accelerate>=0.21.0

zsh:1: 0.21.0 not found
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset, Audio

minds = load_dataset("PolyAI/minds14", name="en-US", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
minds = minds.train_test_split(test_size=0.2)

In [4]:
minds


DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

In [5]:
minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])
minds["train"][0]

{'audio': {'path': '/Users/apple/.cache/huggingface/datasets/downloads/extracted/080571a12566d7e5bc7dda1227e13bd8d8cf9c611d09e080a985718838c09fb0/en-US~FREEZE/602baf3f963e11ccd901ce35.wav',
  'array': array([-0.00024414, -0.00024414,  0.        , ...,  0.00024414,
          0.00048828, -0.00048828]),
  'sampling_rate': 8000},
 'intent_class': 9}

In [6]:
labels = minds["train"].features["intent_class"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [7]:
id2label[str(2)]


'app_error'

In [8]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [None]:
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
minds["train"][0]

{'audio': {'path': '/Users/apple/.cache/huggingface/datasets/downloads/extracted/080571a12566d7e5bc7dda1227e13bd8d8cf9c611d09e080a985718838c09fb0/en-US~FREEZE/602baf3f963e11ccd901ce35.wav',
  'array': array([-2.33099097e-04, -3.00521497e-04, -2.55959865e-04, ...,
         -7.15097121e-05, -4.58146387e-04, -3.67233995e-04]),
  'sampling_rate': 16000},
 'intent_class': 9}

In [11]:
import numpy as np
def preprocess_function(examples):
    """
    Preprocess audio examples for Wav2Vec2 model
    """
    # Convert audio arrays to numpy arrays
    audio_arrays = [np.array(x["array"]) for x in examples["audio"]]
    
    # Get features from feature extractor
    inputs = feature_extractor(
        audio_arrays, 
        sampling_rate=feature_extractor.sampling_rate, 
        max_length=16000, 
        truncation=True,
        padding=True,  # Add padding
        return_tensors="np"
    )
    
    return {
        "input_values": inputs.input_values,
    }
# Reprocess your dataset with the updated function
encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
encoded_minds = encoded_minds.rename_column("intent_class", "label")

# Verify the data format
sample = encoded_minds["train"][0]
print("\nVerifying data format:")
print("Input values type:", type(sample["input_values"]))
print("Input values shape:", np.array(sample["input_values"]).shape)
print("Label:", sample["label"])

Map: 100%|██████████| 450/450 [00:00<00:00, 1118.33 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 675.87 examples/s]


Verifying data format:
Input values type: <class 'list'>
Input values shape: (16000,)
Label: 9





In [12]:
encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
encoded_minds = encoded_minds.rename_column("intent_class", "label")

In [13]:
import evaluate

accuracy = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_pred):
    """
    Compute accuracy metrics for audio classification predictions
    """
    predictions = eval_pred.predictions
    
    # Convert to numpy if it's a tensor
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.detach().cpu().numpy()
    
    # If predictions are 3D (batch, sequence_length, num_labels)
    if len(predictions.shape) == 3:
        predictions = predictions.mean(axis=1)  # Average over sequence length
    
    # Get predicted class indices
    predicted_classes = np.argmax(predictions, axis=-1)
    
    # Make sure labels are in the right format
    labels = eval_pred.label_ids
    if len(labels.shape) > 1:
        labels = labels.squeeze()
    
    # Compute accuracy
    return accuracy.compute(predictions=predicted_classes, references=labels)

In [15]:
from transformers import Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer

num_labels = len(id2label)
print(f"Number of labels: {num_labels}")
print("Label mappings:")
print("id2label:", id2label)
print("label2id:", label2id)

# Initialize the model
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label
)
# Print model config
print("\nModel config:")
print(model.config)

Number of labels: 14
Label mappings:
id2label: {'0': 'abroad', '1': 'address', '2': 'app_error', '3': 'atm_limit', '4': 'balance', '5': 'business_loan', '6': 'card_issues', '7': 'cash_deposit', '8': 'direct_debit', '9': 'freeze', '10': 'high_value_payment', '11': 'joint_account', '12': 'latest_transactions', '13': 'pay_bill'}
label2id: {'abroad': '0', 'address': '1', 'app_error': '2', 'atm_limit': '3', 'balance': '4', 'business_loan': '5', 'card_issues': '6', 'cash_deposit': '7', 'direct_debit': '8', 'freeze': '9', 'high_value_payment': '10', 'joint_account': '11', 'latest_transactions': '12', 'pay_bill': '13'}


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model config:
Wav2Vec2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout"

In [None]:
import accelerate

accelerate.__version__

'0.26.0'

In [None]:
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,  # Even smaller batch size
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # Increased gradient accumulation
    num_train_epochs=2,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_first_step=True,
    # Add fp16 for better memory efficiency
    fp16=True,
    dataloader_pin_memory=False,
)

# Initialize trainer with the updated compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Start training
print("Starting training...")
trainer_output = trainer.train()



  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Starting training...


  0%|          | 0/28 [00:00<?, ?it/s]

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.