In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import torchaudio
import torchaudio.functional as F
from IPython.display import Audio
from pathlib import Path
import os
import random
import librosa.feature
from sklearn.model_selection import train_test_split

# Loading Data

In [2]:
data_dir = Path('data')
male_voices_path = data_dir / 'males'
female_voices_path = data_dir / 'females'

In [34]:
male_voices = list(male_voices_path.glob('*.wav'))
female_voices = list(female_voices_path.glob('*.wav'))
audio_paths = male_voices + female_voices
len(male_voices), len(female_voices), len(audio_paths)

(3682, 2311, 5993)

In [39]:
for i in range(len(audio_paths)):
    y, sr = librosa.load(audio_paths[i], sr=16000)
    z=librosa.get_duration(y=y)
print(f"Avg length of audio is {z:.3f} seconds")

Avg length of audio is 4.087 seconds


In [4]:
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
    if not isinstance(waveform, np.ndarray):
        waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].plot(time_axis, waveform[c], linewidth=1)
        axes[c].grid(True)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
        if xlim:
            axes[c].set_xlim(xlim)
    figure.suptitle(title)
    
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    if not isinstance(waveform, np.ndarray):
        waveform = waveform.numpy()

    num_channels, _ = waveform.shape

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].specgram(waveform[c], Fs=sample_rate)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
        if xlim:
            axes[c].set_xlim(xlim)
    figure.suptitle(title)

In [40]:
rand_aud = random.sample(male_voices + female_voices, 10)
# sample_waveform, sample_rate = torchaudio.load(rand_aud[8], channels_first=True)
sample_waveform, sample_rate = librosa.load(rand_aud[0], sr=16000)
print(f"Sample rate: {sample_rate} | Sample waveform Shape: {sample_waveform.shape}")

if isinstance(sample_waveform, torch.Tensor):
    plot_waveform(sample_waveform, sample_rate, xlim=(-0.1, 3.2))
    plot_specgram(sample_waveform, sample_rate, xlim=(0, 3.04))

Sample rate: 16000 | Sample waveform Shape: (100352,)


In [41]:
print('Original')
Audio(sample_waveform, rate=sample_rate)

Original


In [33]:
print('FrequencyMasking')
transform_FrequencyMasking = torchaudio.transforms.FrequencyMasking(freq_mask_param=5)
freq_mask = transform_FrequencyMasking(sample_waveform)
Audio(freq_mask, rate=sample_rate)

FrequencyMasking


AttributeError: 'numpy.ndarray' object has no attribute 'dim'

In [8]:
transform_mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=40)
mfcc = transform_mfcc(sample_waveform)
print(f'Original Shape: {sample_waveform.shape}')
print(f'MFCC Shape: {mfcc.shape}')

Original Shape: torch.Size([1, 135168])
MFCC Shape: torch.Size([1, 40, 676])




# Dataset & Dataloader Preparation

In [47]:
len(sample_waveform), int(sr*round(z, 3))

(100352, 65391)

In [48]:
TEST_SIZE = 0.2
SAMPLE_RATE = 16000
AVG_AUDIO_LENGTH = int(SAMPLE_RATE*round(z, 3)) # z = 4.09..

def pad_or_trim(audio, length):
    if len(audio) < length:
        return np.pad(audio, (0, length - len(audio)), mode='constant')
    elif len(audio) > length:
        return audio[:length]
    else:
        return audio


audio_paths = male_voices + female_voices
train_data = random.sample(audio_paths, int(len(audio_paths) * (1 - TEST_SIZE)))
test_data = random.sample(audio_paths, int(len(audio_paths) * TEST_SIZE))

print(f'Train Data: {len(train_data)}')
print(f'Test Data: {len(test_data)}')

def get_class(audio_paths):
    classes = set([audio.parent.name for audio in audio_paths])
    class_to_idx = {class_name: idx for idx, class_name in enumerate(classes)}
    return classes, class_to_idx

class createDataset(Dataset):
    def __init__(self, data_paths, transform=None, sr=SAMPLE_RATE):
        self.paths = data_paths
        self.transform = transform
        self.sr = sr
        self.classes, self.class_to_idx = get_class(data_paths)
        
    def load_audio(self, idx:int):
        audio_path = self.paths[idx]
        waveform, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        waveform = pad_or_trim(waveform, self.sr)
        return Audio(waveform, rate=sr), waveform, sr
    
    def __len__(self):
        return len(self.paths)
    
    def __getitem__(self, idx):
        audio_path = self.paths[idx]
        class_name = audio_path.parent.name
        class_idx = self.class_to_idx[class_name]
        _, waveform, sr = self.load_audio(idx)
        
        if self.transform is not None:
            return self.transform(y=waveform, sr=sr, n_mfcc=40), class_idx ## mfcc
        else:
            return audio_path, class_idx

transform = librosa.feature.mfcc # haha

train_dataset = createDataset(data_paths=train_data, transform=transform)
test_dataset = createDataset(data_paths=test_data, transform=transform)

Train Data: 4794
Test Data: 1198


In [50]:
# audio, wave, sr = train_dataset.load_audio(190)
# next(iter(train_dataset))
for data in iter(train_dataset):
    print(data[0].shape)
    break

(40, 32)


In [51]:
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=True) 

In [53]:
wave, label = next(iter(train_loader))
wave.shape

torch.Size([64, 40, 32])

# Feature Selection

hahaha

# Model Building

In [56]:
from torch import nn
nn.Linear(in_features=32, out_features=128)(wave).shape

torch.Size([64, 40, 128])

In [92]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_V0 = torch.nn.Sequential(
    torch.nn.Linear(in_features=32, out_features=128),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    
    torch.nn.Linear(in_features=128, out_features=256),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    
    torch.nn.Linear(in_features=256, out_features=512),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    
    torch.nn.Flatten(),
    torch.nn.Linear(in_features=20480, out_features=1),
    torch.nn.ReLU()
).to(device)

model_V0(wave.to(device)).shape

torch.Size([64, 1])

In [103]:
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model_V0.parameters(), lr=0.001)

def accuracy(output: torch.Tensor, target: torch.Tensor):
    return (output == target).sum().item() / len(target)

In [104]:
def train(epochs,
          model: torch.nn.Module = model_V0,
          loss_fn: torch.nn.Module = loss_fn,
          optimizer: torch.optim.Optimizer = optimizer,
          train_loader: DataLoader = train_loader,
          test_loader: DataLoader = test_loader,
          device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'),):
    
    model.train()
    for epoch in range(epochs):
        train_loss, train_acc = 0, 0
        for batch, (X, y) in enumerate(train_loader):
            X, y = X.to(device), y.to(device)
            
            y_pred = model(X).squeeze()
            loss = loss_fn(y_pred, y.float())
            
            y_pred_label = torch.softmax(y_pred, dim=0)
            acc = accuracy(y_pred_label, y)
            
            train_loss += loss.item()
            train_acc += acc
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_loss /= len(train_loader)
        train_acc /= len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}.. \n")
        print(f"Train Loss: {train_loss:.4f}.. Train Acc: {train_acc:.4f}")
    
    model.eval()
    test_loss, test_acc = 0, 0
    with torch.inference_mode():
        for batch, (X, y) in enumerate(test_loader):
            X, y = X.to(device), y.to(device)
            
            test_pred = model(X).squeeze()
            loss = loss_fn(test_pred, y.float())
            y_pred_label = torch.softmax(test_pred, dim=0)
            acc = accuracy(y_pred_label, y)
            test_loss += loss.item()
            test_acc += acc
        test_loss /= len(test_loader)
        test_acc /= len(test_loader)
        print(f"Test Loss: {test_loss:.4f}.. Test Acc: {test_acc:.4f}")
        
    
train(epochs=10, model=model_V0, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, test_loader=test_loader)

Epoch 1/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000
Epoch 2/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000
Epoch 3/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000
Epoch 4/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000
Epoch 5/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000
Epoch 6/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000


KeyboardInterrupt: 

Epoch 1/10.. 

Train Loss: 0.7055.. Train Acc: 0.0000
Epoch 2/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000
Epoch 3/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000
Epoch 4/10.. 

Train Loss: 0.6931.. Train Acc: 0.0000


KeyboardInterrupt: 