In [1]:
pip install librosa soundfile

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import pandas as pd
import io
import librosa
import soundfile as sf
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import torchaudio
import torchaudio.transforms as T
from sklearn.preprocessing import LabelEncoder
from io import BytesIO


splits = {'train': 'data/train-00000-of-00001.parquet', 
          'validation': 'data/validation-00000-of-00001.parquet', 
          'test': 'data/test-00000-of-00001.parquet'}

df_train = pd.read_parquet("hf://datasets/hanifa-fy/english_accent_samples/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/hanifa-fy/english_accent_samples/" + splits["validation"])
df_test = pd.read_parquet("hf://datasets/hanifa-fy/english_accent_samples/" + splits["test"])


2025-05-04 17:35:28.055737: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746380128.237816      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746380128.290820      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
label_encoder = LabelEncoder()
df_train['accent'] = label_encoder.fit_transform(df_train['accent'])
df_val['accent'] = label_encoder.transform(df_val['accent'])
df_test['accent'] = label_encoder.transform(df_test['accent'])
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))

import pickle
with open("label_mapping.pkl", "wb") as f:
    pickle.dump(label_mapping, f)

In [4]:
class Wav2Vec2AccentDataset(Dataset):
    def __init__(self, dataframe, processor, label2id, max_length=16000*5):
        self.dataframe = dataframe
        self.processor = processor
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        audio_bytes = self.dataframe.iloc[idx]["audio"]['bytes']
        label = self.dataframe.iloc[idx]["accent"]

        waveform, sr = torchaudio.load(BytesIO(audio_bytes))  
        waveform = waveform.squeeze(0)

        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            waveform = resampler(waveform)

        if waveform.shape[0] > self.max_length:
            waveform = waveform[:self.max_length]
        else:
            pad_len = self.max_length - waveform.shape[0]
            waveform = torch.nn.functional.pad(waveform, (0, pad_len))

        inputs = self.processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
        input_values = inputs.input_values.squeeze(0)

        label_id = self.label2id[label]

        return input_values, label_id

In [5]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

train_dataset = Wav2Vec2AccentDataset(df_train, processor, label_mapping)
val_dataset = Wav2Vec2AccentDataset(df_val, processor, label_mapping)
test_dataset = Wav2Vec2AccentDataset(df_test, processor, label_mapping)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False)



preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [6]:
label_list = list(label_encoder.classes_)
num_labels = len(label_list)

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    problem_type="single_label_classification",
)

model.freeze_feature_extractor()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [7]:
def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=10):
    for epoch in range(epochs):
        model.train()  
        total_train_loss = 0
        total_train_correct = 0
        total_train_samples = 0

        for batch in train_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(input_values=inputs)
            
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            _, predicted = torch.max(outputs.logits, 1)
            total_train_correct += (predicted == labels).sum().item()
            total_train_samples += labels.size(0)
            total_train_loss += loss.item()

        train_loss = total_train_loss / len(train_loader)
        train_accuracy = total_train_correct / total_train_samples

        model.eval()  
        total_val_loss = 0
        total_val_correct = 0
        total_val_samples = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs, labels = batch
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(input_values=inputs)

                loss = criterion(outputs.logits, labels)

                _, predicted = torch.max(outputs.logits, 1)
                total_val_correct += (predicted == labels).sum().item()
                total_val_samples += labels.size(0)
                total_val_loss += loss.item()

        val_loss = total_val_loss / len(val_loader)
        val_accuracy = total_val_correct / total_val_samples

        print(f"Epoch [{epoch+1}/{epochs}]")
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    return model

def evaluate_test_set(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(input_values=inputs)
            _, predicted = torch.max(outputs.logits, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy:.4f}")

In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

model = train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=15)

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Epoch [1/15]
Train Loss: 2.5678, Train Accuracy: 0.3216
Val Loss: 2.1820, Val Accuracy: 0.4784
Epoch [2/15]
Train Loss: 1.9048, Train Accuracy: 0.5031
Val Loss: 1.7112, Val Accuracy: 0.5086
Epoch [3/15]
Train Loss: 1.4576, Train Accuracy: 0.5816
Val Loss: 1.2659, Val Accuracy: 0.6379
Epoch [4/15]
Train Loss: 1.1671, Train Accuracy: 0.6552
Val Loss: 1.1149, Val Accuracy: 0.6530
Epoch [5/15]
Train Loss: 0.9555, Train Accuracy: 0.6987
Val Loss: 0.9791, Val Accuracy: 0.6832
Epoch [6/15]
Train Loss: 0.7749, Train Accuracy: 0.7537
Val Loss: 0.8586, Val Accuracy: 0.7026
Epoch [7/15]
Train Loss: 0.6743, Train Accuracy: 0.7734
Val Loss: 0.9495, Val Accuracy: 0.6918
Epoch [8/15]
Train Loss: 0.6033, Train Accuracy: 0.7893
Val Loss: 0.6890, Val Accuracy: 0.7694
Epoch [9/15]
Train Loss: 0.4670, Train Accuracy: 0.8627
Val Loss: 0.6503, Val Accuracy: 0.8319
Epoch [10/15]
Train Loss: 0.3597, Train Accuracy: 0.9118
Val Loss: 0.5620, Val Accuracy: 0.8448
Epoch [11/15]
Train Loss: 0.2947, Train Accuracy:

In [9]:
evaluate_test_set(model, test_loader, device)

Test Accuracy: 0.8793


In [10]:
torch.save(model.state_dict(), 'accent_recognition_model_state_dict.pth')