In [1]:
import os
import zipfile
import numpy as np
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Step 1: Download & Extract the Dataset
DATASET_URL = "https://github.com/Jakobovski/free-spoken-digit-dataset/archive/refs/heads/master.zip"
DATASET_PATH = "fsdd"

if not os.path.exists(DATASET_PATH):
    print("Downloading dataset...")
    response = requests.get(DATASET_URL)
    with open("fsdd.zip", "wb") as f:
        f.write(response.content)

    with zipfile.ZipFile("fsdd.zip", "r") as zip_ref:
        zip_ref.extractall(DATASET_PATH)

    print("Dataset extracted successfully.")

In [5]:
pip install torch torchaudio transformers librosa datasets numpy tqdm




In [7]:
import os
import zipfile
import requests

DATASET_URL = "https://github.com/Jakobovski/free-spoken-digit-dataset/archive/refs/heads/master.zip"
DATASET_PATH = "fsdd"

# Download dataset
if not os.path.exists("fsdd.zip"):
    print("Downloading FSDD dataset...")
    response = requests.get(DATASET_URL)
    with open("fsdd.zip", "wb") as file:
        file.write(response.content)

# Extract dataset
if not os.path.exists(DATASET_PATH):
    print("Extracting dataset...")
    with zipfile.ZipFile("fsdd.zip", "r") as zip_ref:
        zip_ref.extractall(".")
    os.rename("free-spoken-digit-dataset-master", DATASET_PATH)

print("Dataset ready!")

Dataset ready!


In [9]:
import os

dataset_path = "fsdd/recordings"
if os.path.exists(dataset_path):
    print(f"✅ Dataset found at: {dataset_path}")
    print(f"Total files: {len(os.listdir(dataset_path))}")
else:
    print("❌ Dataset not found! Please re-download.")

✅ Dataset found at: fsdd/recordings
Total files: 3000


In [11]:
from datasets import Dataset
import pandas as pd
import librosa
import os

# Define dataset path
DATASET_PATH = "fsdd/free-spoken-digit-dataset-master/recordings"

# Get all audio files
audio_files = [os.path.join(DATASET_PATH, f) for f in os.listdir(DATASET_PATH) if f.endswith(".wav")]

# Extract labels from filenames (assuming filenames are like "0_jackson_0.wav")
def get_label(filename):
    return int(os.path.basename(filename).split("_")[0])  # Extract the first number as the label

# Load audio files and labels
audio_data = []
labels = []

for file in audio_files:
    y, sr = librosa.load(file, sr=16000)  # Load raw audio at 16kHz
    audio_data.append(y)  # Append raw audio waveform
    labels.append(get_label(file))

# Create train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(audio_data, labels, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset format
train_data = {"audio": X_train, "label": y_train}
test_data = {"audio": X_test, "label": y_test}

train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

print("Dataset Loaded Successfully!")


  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


Dataset Loaded Successfully!


In [21]:
pip install ipywidgets

Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading uri_template-1.3.0-py3-none-any.whl.metadata (8.8 kB)
Collecting webcolors>=24.6.0 (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading webcolors-24.11.1-py3-none-any.whl.metadata (2.2 kB)
Downloading webcolors-24.11.1-py3-none-any.whl (14 kB)
Downl

In [57]:
# Step 3: Create Custom Dataset for Wav2Vec2
import torch

class FSDDDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels, processor):
        self._data = data  # ✅ Ensure _data is assigned
        self.labels = labels
        self.processor = processor

    def __getitem__(self, idx):
        inputs = self.processor(self._data[idx], return_tensors="pt")  # ✅ Use _data
        inputs["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return inputs

    def __len__(self):
        return len(self._data)

train_dataset = FSDDDataset(X_train, y_train, processor)
test_dataset = FSDDDataset(X_test, y_test, processor)

In [59]:
train_dataset = FSDDDataset(X_train, y_train, processor)

In [61]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()  # Initialize the encoder

In [63]:
y_train_encoded = label_encoder.fit_transform(y_train)  # Fit and transform training labels
y_test_encoded = label_encoder.transform(y_test)  # Only transform test labels (don't fit again)

In [65]:
# Step 4: Load Pretrained Wav2Vec2 Model for Classification
num_classes = len(label_encoder.classes_)
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", num_labels=num_classes)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
collate_fn=lambda batch: tuple(t.to(device) for t in zip(*batch))

In [81]:
from torch.nn.utils.rnn import pad_sequence

padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)

NameError: name 'inputs' is not defined

In [75]:
# Step 5: Fine-Tuning with Trainer API
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

from torch.nn.utils.rnn import pad_sequence

# Custom collate function to handle variable-length input tensors
def collate_fn(batch):
    input_values = [item["input_values"] for item in batch]
    labels = [item["labels"] for item in batch]
    
    # Pad input values to make them the same length
    input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0.0)
    
    # Convert labels to tensor
    labels_tensor = torch.tensor(labels, dtype=torch.long)
    
    return {"input_values": input_values_padded, "labels": labels_tensor}

# Use this function in Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor,
    data_collator=collate_fn  # ✅ Now it's defined
)

# Train the model
trainer.train()

  trainer = Trainer(
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result

RuntimeError: The size of tensor a (4922) must match the size of tensor b (7556) at non-singleton dimension 1