In [None]:
# Mount Google Drive inside the Colab VM so later cells can read the dataset
# archive from it and write the trained checkpoint back to it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install torchaudio

In [None]:
# !unzip '/content/drive/MyDrive/accentDetection/Dataset.zip' -d '/content/Dataset'
!unzip '/content/drive/MyDrive/Dataset.zip' -d '/content/Dataset'

In [None]:
import os
import torch
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import librosa
from torch.utils.data import random_split
from torch.optim import Adam
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score

In [None]:
class AudioDataset(Dataset):
    """Folder-per-class audio dataset yielding (waveform, MFCCs, label).

    ``root_dir`` is expected to contain one sub-directory per class; every
    regular file inside a class directory is treated as one audio sample.
    Class indices follow the alphabetical order of the directory names, and
    files inside each class are enumerated in sorted order so that the
    index -> file mapping (and therefore any seeded split) is reproducible.

    Args:
        root_dir: Directory holding the per-class sub-directories.
        data_transform: Optional callable applied to both the waveform array
            and the MFCC array before they are returned.
        sample_duration: Clip length in seconds; every sample is trimmed or
            zero-padded to ``TARGET_SAMPLE_RATE * sample_duration`` samples.
    """

    # All audio is brought to this sampling rate before feature extraction.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, root_dir, data_transform=None, sample_duration=5):
        self.root_dir = root_dir
        self.data_transform = data_transform
        self.sample_duration = sample_duration

        # Only sub-directories are classes; stray files in the root
        # (archives, OS metadata, ...) would otherwise become bogus classes
        # and crash the per-class listing below.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        print(self.classes)

        self.data = []    # file path of each sample
        self.labels = []  # integer class index of each sample (parallel to data)
        for label, class_name in enumerate(self.classes):
            class_path = os.path.join(root_dir, class_name)
            # os.listdir order is arbitrary; sort for a deterministic index.
            for filename in sorted(os.listdir(class_path)):
                filepath = os.path.join(class_path, filename)
                if not os.path.isfile(filepath):
                    continue
                self.data.append(filepath)
                self.labels.append(label)

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(waveform_np, mfccs, label)``.

        Returns:
            waveform_np: NumPy array with one channel and
                ``TARGET_SAMPLE_RATE * sample_duration`` samples.
            mfccs: MFCC array computed from ``waveform_np`` by
                ``extract_mfcc`` (presumably (1, n_mfcc, frames) — confirm
                against the installed librosa version).
            label: 0-dim ``torch.long`` tensor with the class index.
        """
        waveform, sample_rate = torchaudio.load(self.data[idx], normalize=True)

        # Downmix multi-channel audio to mono so padding/concatenation and
        # the single-channel model input are always valid.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample using the file's real sampling rate. (Passing the number
        # of samples as orig_sr would squash every clip to ~1 s of samples.)
        if sample_rate != self.TARGET_SAMPLE_RATE:
            resampled = librosa.resample(
                y=waveform.numpy(),
                orig_sr=sample_rate,
                target_sr=self.TARGET_SAMPLE_RATE,
            )
            waveform = torch.as_tensor(resampled, dtype=torch.float32)

        # Trim or zero-pad to a fixed number of samples.
        target_length = int(self.TARGET_SAMPLE_RATE * self.sample_duration)
        waveform = self._fix_length(waveform, target_length)

        # librosa works on NumPy arrays.
        waveform_np = waveform.numpy()
        mfccs = self.extract_mfcc(waveform_np)

        # Apply the optional transform to both views of the sample.
        if self.data_transform:
            waveform_np = self.data_transform(waveform_np)
            mfccs = self.data_transform(mfccs)

        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return waveform_np, mfccs, label

    @staticmethod
    def _fix_length(waveform, target_length):
        """Trim or right-pad a (channels, samples) tensor with zeros to ``target_length`` samples."""
        num_samples = waveform.size(1)
        if num_samples > target_length:
            return waveform[:, :target_length]
        padding = torch.zeros(
            waveform.size(0), target_length - num_samples, dtype=waveform.dtype
        )
        return torch.cat([waveform, padding], dim=1)

    @staticmethod
    def extract_mfcc(audio, n_mfcc=13, sr=16000):
        """Compute ``n_mfcc`` MFCC coefficients of ``audio`` sampled at ``sr`` Hz."""
        return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

In [None]:
class AccentModel(nn.Module):
    """Two-branch classifier fusing raw-waveform and MFCC features.

    * ``waveform_branch``: 1-D CNN over the raw waveform, ending in a
      256-dimensional embedding.
    * ``mfcc_branch``: 2-D CNN over the MFCC map, globally average-pooled
      and projected to a 256-dimensional embedding.
    * ``classifier``: MLP over the concatenated 512-dimensional embedding.

    Args:
        num_classes: Number of output logits.
        waveform_length: Number of samples in every input waveform. The
            first linear layer of the waveform branch is sized from it, so
            inputs must have exactly this length. The default (80000 =
            5 s at 16 kHz) reproduces the previously hard-coded layer size
            of 2,560,000 input features.
    """

    def __init__(self, num_classes, waveform_length=80000):
        super().__init__()

        # The convolutions (kernel 3, stride 1, padding 1) keep the length;
        # each of the two MaxPool1d(2, 2) layers floor-halves it.
        pooled_length = (waveform_length // 2) // 2
        waveform_features = 128 * pooled_length
        if waveform_features <= 0:
            raise ValueError(
                f"waveform_length must be at least 4 samples, got {waveform_length}"
            )

        self.waveform_branch = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(waveform_features, 256),
            nn.ReLU()
        )

        self.mfcc_branch = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # Global pooling makes this branch independent of the MFCC map size.
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(64, 256),  # 64 = channels of the last convolution
            nn.ReLU()
        )

        # Input is the concatenation of the two 256-dimensional embeddings.
        self.classifier = nn.Sequential(
            nn.Linear(256 + 256, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, waveform, mfcc):
        """Return class logits of shape (batch, num_classes).

        Args:
            waveform: Tensor of shape (batch, 1, waveform_length).
            mfcc: Tensor of shape (batch, 1, height, width).
        """
        waveform_out = self.waveform_branch(waveform)
        mfcc_out = self.mfcc_branch(mfcc)

        # Fuse along the feature dimension.
        out = torch.cat((waveform_out, mfcc_out), dim=1)
        return self.classifier(out)

In [None]:
# Build the dataset and split it 70/30 into train and held-out test subsets.
# The split generator is seeded so the partition is the same on every run.
TRAIN_FRACTION = 0.7
SPLIT_SEED = 42
BATCH_SIZE = 16

dataset = AudioDataset(root_dir='/content/Dataset/Dataset')
train_size = int(TRAIN_FRACTION * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, eval_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Shuffle only the training data; evaluation order does not affect the
# metrics, and a fixed order keeps evaluation runs directly comparable.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

['Arabic', 'Chinese', 'Hindi', 'Korean', 'Spanish', 'Vietnamese']


In [None]:
# Initialize the model, loss and optimizer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Derive the number of output classes from the dataset instead of
# hard-coding it, so the model always matches the class folders found.
num_classes = len(dataset.classes)
model = AccentModel(num_classes).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters())  # default Adam hyper-parameters

In [None]:
from tqdm import tqdm

# Train for a fixed number of epochs, evaluating on the held-out split
# after each one.
num_epochs = 10
for epoch in range(num_epochs):
    # ---------------- training ----------------
    model.train()
    pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    for i, (waveforms, mfccs, labels) in pbar:
        # Move data to the correct device
        waveforms = waveforms.to(device)
        mfccs = mfccs.to(device)
        labels = labels.to(device)

        # Forward pass through the model's own forward(), so training and
        # evaluation are guaranteed to use the same computation.
        out = model(waveforms, mfccs)
        loss = loss_fn(out, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        pbar.set_description(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_dataloader)}], Loss: {loss.item()}')

    # ---------------- evaluation ----------------
    # Switch to inference mode; a no-op for the current layers, but keeps
    # the loop correct if dropout/batch-norm are ever added to the model.
    model.eval()
    total_loss = 0.0
    total_correct = 0

    # No gradients are needed while evaluating
    with torch.no_grad():
        tbar = tqdm(enumerate(test_dataloader), total=len(test_dataloader))
        for i, (waveforms, mfccs, labels) in tbar:
            # Move data to the correct device
            waveforms = waveforms.to(device)
            mfccs = mfccs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(waveforms, mfccs)
            loss = loss_fn(outputs, labels)
            # loss is a batch mean; weight it by the batch size so a short
            # final batch does not skew the per-sample average.
            total_loss += loss.item() * labels.size(0)

            # Compute accuracy
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()

    # Per-sample average loss and accuracy over the whole test split
    avg_loss = total_loss / len(test_dataset)
    avg_accuracy = total_correct / len(test_dataset)

    print(f'Test Loss: {avg_loss}, Test Accuracy: {avg_accuracy * 100:.2f}%')

    # Drop references to the last batches and release cached GPU memory
    # before the next epoch.
    del waveforms, mfccs, labels, out, outputs, loss
    torch.cuda.empty_cache()

Epoch [1/10], Step [292/292], Loss: 0.40087899565696716: 100%|██████████| 292/292 [04:22<00:00,  1.11it/s]
100%|██████████| 125/125 [01:22<00:00,  1.52it/s]


Test Loss: 0.5023924363851547, Test Accuracy: 79.33%


Epoch [2/10], Step [292/292], Loss: 0.3450455367565155: 100%|██████████| 292/292 [04:20<00:00,  1.12it/s]
100%|██████████| 125/125 [01:20<00:00,  1.55it/s]


Test Loss: 0.545191270828247, Test Accuracy: 78.63%


Epoch [3/10], Step [292/292], Loss: 0.12581132352352142: 100%|██████████| 292/292 [04:17<00:00,  1.14it/s]
100%|██████████| 125/125 [01:19<00:00,  1.58it/s]


Test Loss: 0.55018679022789, Test Accuracy: 80.73%


Epoch [4/10], Step [292/292], Loss: 0.013590606860816479: 100%|██████████| 292/292 [04:15<00:00,  1.14it/s]
100%|██████████| 125/125 [01:17<00:00,  1.62it/s]


Test Loss: 0.5081525136977434, Test Accuracy: 86.04%


Epoch [5/10], Step [292/292], Loss: 0.00047270325012505054: 100%|██████████| 292/292 [04:17<00:00,  1.14it/s]
100%|██████████| 125/125 [01:19<00:00,  1.58it/s]


Test Loss: 0.5751621939763427, Test Accuracy: 86.14%


Epoch [6/10], Step [292/292], Loss: 1.6887920537556056e-06: 100%|██████████| 292/292 [04:19<00:00,  1.13it/s]
100%|██████████| 125/125 [01:19<00:00,  1.57it/s]


Test Loss: 0.7204368962366134, Test Accuracy: 85.34%


Epoch [7/10], Step [292/292], Loss: 0.444256454706192: 100%|██████████| 292/292 [04:19<00:00,  1.13it/s]
100%|██████████| 125/125 [01:20<00:00,  1.56it/s]


Test Loss: 0.825326890796423, Test Accuracy: 81.38%


Epoch [8/10], Step [292/292], Loss: 0.007077077869325876: 100%|██████████| 292/292 [04:19<00:00,  1.12it/s]
100%|██████████| 125/125 [01:24<00:00,  1.48it/s]


Test Loss: 0.8482066089101136, Test Accuracy: 83.08%


Epoch [9/10], Step [292/292], Loss: 4.150217864662409e-05: 100%|██████████| 292/292 [04:25<00:00,  1.10it/s]
100%|██████████| 125/125 [01:22<00:00,  1.52it/s]


Test Loss: 0.9860606040507555, Test Accuracy: 83.23%


Epoch [10/10], Step [292/292], Loss: 0.00014793315494898707: 100%|██████████| 292/292 [04:28<00:00,  1.09it/s]
100%|██████████| 125/125 [01:22<00:00,  1.52it/s]

Test Loss: 1.034877093865536, Test Accuracy: 83.38%





In [None]:
# Final evaluation on the held-out split: loss, accuracy and confusion matrix.
total_loss = 0.0
total_correct = 0
all_predictions = []
all_labels = []

# Inference mode; a no-op for the current layers, but required as soon as
# dropout/batch-norm are part of the model.
model.eval()

# No gradients are needed while evaluating
with torch.no_grad():
    # Pass total= explicitly: enumerate() hides the loader length from tqdm.
    for i, (waveforms, mfccs, labels) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        # Move data to the correct device
        waveforms = waveforms.to(device)
        mfccs = mfccs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(waveforms, mfccs)
        loss = loss_fn(outputs, labels)
        # loss is a batch mean; weight by batch size for a per-sample average.
        total_loss += loss.item() * labels.size(0)

        # Compute accuracy
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()

        # Collect predictions and labels for the confusion matrix
        all_predictions.extend(predicted.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

# Per-sample average loss and accuracy over the whole test split
avg_loss = total_loss / len(test_dataset)
avg_accuracy = total_correct / len(test_dataset)

# Confusion matrix: rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(all_labels, all_predictions)
print("Confusion Matrix:")
print(conf_matrix)
print(f'Test Loss: {avg_loss}, Test Accuracy: {avg_accuracy * 100}%')

125it [01:21,  1.53it/s]

Confusion Matrix:
[[293   3   3   1   7  12]
 [  8 242   0   2  88  11]
 [ 26   1 298  12   2   1]
 [ 16   8   2 284   8  14]
 [ 17  60   2   0 220  12]
 [  7   1   1   2   5 329]]
Test Loss: 1.0354408485516906, Test Accuracy: 83.38338338338338%





In [None]:
# Persist only the learned parameters (the state_dict) to Google Drive; to
# reuse them, build an AccentModel and load this file into it.
checkpoint_path = "/content/drive/MyDrive/audio_classifier.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, checkpoint_path)