In [11]:
pip install torch torchaudio transformers datasets soundfile




In [13]:
import os
import glob
import torch
import torchaudio
import soundfile as sf
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor

# Load Wav2Vec2 Processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

class FSDDDataset(Dataset):
    def __init__(self, dataset_path="fsdd/free-spoken-digit-dataset-master/recordings"):
        self.files = glob.glob(os.path.join(dataset_path, "*.wav"))
        self.labels = [int(os.path.basename(f)[0]) for f in self.files]  # Extract digit from filename

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        # Load audio file
        waveform, sample_rate = torchaudio.load(self.files[idx])

        # Resample if needed (Wav2Vec2 expects 16kHz)
        if sample_rate != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

        # Convert to numpy array for Wav2Vec2
        waveform = waveform.squeeze(0).numpy()

        # Process input with Wav2Vec2Processor
        inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

        # Convert label to tensor
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return inputs.input_values.squeeze(0), label

# Create dataset and dataloader
dataset = FSDDDataset()
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

print(f"Loaded {len(dataset)} samples.")

Loaded 3000 samples.


In [15]:
from transformers import Wav2Vec2ForSequenceClassification

# Load model
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base-960h",
    num_labels=10  # Digits 0-9
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(model)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [17]:
import torch.nn as nn
import torch.optim as optim

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 5  # Adjust based on performance
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs).logits  # Get predictions
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader):.4f}")

print("Training complete!")


RuntimeError: stack expects each tensor to be equal size, but got [8634] at entry 0 and [5880] at entry 1

In [19]:
correct = 0
total = 0

model.eval()
with torch.no_grad():
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs).logits
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total:.2f}%")


RuntimeError: stack expects each tensor to be equal size, but got [5810] at entry 0 and [4040] at entry 1