# Gender classification with openSLR - 2D CNN
This notebook contains my (Hassan Hajarat) attempt in training the "Open Speech & Language Resources" dataset using a 2D convolutional neural network as an attempt to produce a gender classifier.<br>
Done with the help of: https://medium.com/@mikesmales/sound-classification-using-deep-learning-8bc2aa1990b7

In [1]:
import pandas as pd
import os
import librosa
import json
import numpy as np
from tqdm.contrib import tzip
from tqdm import tqdm
import torch
import time


In [2]:
PATH = os.getcwd() + "/raw-audio-gender-classification"


In [None]:
# Since I already indexed the data in the previous model build, I can import the indexes here
length = 48000 # First 3 seconds of each flac file (that's just how the files were saved for the first model)
train_subsets = ['train-clean-100']
test_subsets = ['dev-clean']

train_cached_id_to_filepath_location = PATH + '/data/LibriSpeech__datasetid_to_filepath__subsets={}__length={}.json'.format(
            train_subsets, length)

train_cached_id_to_sex_location = PATH + '/data/LibriSpeech__datasetid_to_sex__subsets={}__length={}.json'.format(
            train_subsets, length)

test_cached_id_to_filepath_location = PATH + '/data/LibriSpeech__datasetid_to_filepath__subsets={}__length={}.json'.format(
            test_subsets, length)

test_cached_id_to_sex_location = PATH + '/data/LibriSpeech__datasetid_to_sex__subsets={}__length={}.json'.format(
            test_subsets, length)

with open(train_cached_id_to_filepath_location) as f:
    train_datasetid_to_filepath = json.load(f)
    
with open(train_cached_id_to_sex_location) as f:
    train_datasetid_to_sex = json.load(f)

with open(test_cached_id_to_filepath_location) as f:
    test_datasetid_to_filepath = json.load(f)
    
with open(test_cached_id_to_sex_location) as f:
    test_datasetid_to_sex = json.load(f)


In [None]:
sex_to_label = {'M': False, 'F': True}


## Extract Features

In [None]:
def extract_features(file_name):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccsscaled = np.mean(mfccs.T,axis=0)
     
    return mfccsscaled


In [None]:
train_features = np.zeros((len(train_datasetid_to_filepath), 41)) # 40 features from the extract_features method + label

for i, zip_contents in enumerate(tzip(train_datasetid_to_filepath.items(), train_datasetid_to_sex.items())):
    filepath_row, sex_row = zip_contents
    _, filepath = filepath_row
    _, sex = sex_row
    train_features[i, 0:40] = extract_features(filepath)
    train_features[i, 40] = float(sex_to_label[sex])
    


In [None]:
test_features = np.zeros((len(test_datasetid_to_filepath), 41))

for i, zip_contents in enumerate(tzip(test_datasetid_to_filepath.items(), test_datasetid_to_sex.items())):
    filepath_row, sex_row = zip_contents
    _, filepath = filepath_row
    _, sex = sex_row
    test_features[i, 0:40] = extract_features(filepath)
    test_features[i, 40] = float(sex_to_label[sex])



In [None]:
# Save features for future use
np.save("train_mfcc.npy", train_features)
np.save("test_mfcc.npy", test_features)



In [3]:
# Load features incase they are already created to save time
train_features = np.load("train_mfcc.npy")
test_features = np.load("test_mfcc.npy")

## Create Dataset and DataLoader

In [4]:
class LibriSpeechMFCCDataset(torch.utils.data.Dataset):
    def __init__(self, features):
        self.n_samples = features.shape[0]
        self.X = torch.from_numpy(features[:, :-1].reshape(self.n_samples, 1, 4, 10))
        self.y = torch.from_numpy(features[:, -1])
        
        
    def __getitem__(self, index):
        return self.X[index], self.y[index]
        
    def __len__(self):
        return self.n_samples


In [5]:
from torch.utils.data import DataLoader
batchsize = 8



In [6]:
trainset = LibriSpeechMFCCDataset(train_features)
testset = LibriSpeechMFCCDataset(test_features)

trainloader = DataLoader(trainset, batch_size=batchsize, shuffle=True, drop_last=True)
testloader = DataLoader(testset, batch_size=batchsize, drop_last=True)

In [7]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Create Network Structure and Initialize

In [8]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, 2, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 2, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 2, padding=1)
        self.conv4 = nn.Conv2d(64, 128, 2, padding=1)
        
        x = torch.randn(4, 10).view(-1, 1, 4, 10)
        self._to_linear = None
        self.convs(x)
        
        self.output = nn.Linear(self._to_linear, 1)
        
    def convs(self, x):
        drop = nn.Dropout(0.2)
        # print(x.shape)
        x = self.conv1(x)
        # print(x.shape)
        x = F.relu(x)
        # print(x.shape)
        x = F.max_pool2d(x, (2, 2))
        # print(x.shape)
        x = drop(x)
        # print("---")
        
        x = self.conv2(x)
        # print(x.shape)
        x = F.relu(x)
        # print(x.shape)
        x = F.max_pool2d(x, (2, 2))
        # print(x.shape)
        x = drop(x)
        # print("---")
        
        x = self.conv3(x)
        # print(x.shape)
        x = F.relu(x)
        # print(x.shape)
        x = F.max_pool2d(x, (2, 2))
        # print(x.shape)
        x = drop(x)
        # print("---")
        
        x = self.conv4(x)
        # print(x.shape)
        x = F.relu(x)
        # print(x.shape)
        x = F.max_pool2d(x, (2, 2))
        # print(x.shape)
        x = drop(x)
        # print("---")

        if self._to_linear is None:
            self._to_linear = x[0].shape[0]*x[0].shape[1]*x[0].shape[2]
        # print(self._to_linear)
        
        return x
        
    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, self._to_linear)
        x = torch.sigmoid(self.output(x))
        return x

net = Net()
net.double()

Net(
  (conv1): Conv2d(1, 16, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(64, 128, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
  (output): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
# Uncomment print statements above you get:
# torch.Size([1, 1, 4, 10])
# torch.Size([1, 16, 5, 11])
# torch.Size([1, 16, 5, 11])
# torch.Size([1, 16, 2, 5])
# ---
# torch.Size([1, 32, 3, 6])
# torch.Size([1, 32, 3, 6])
# torch.Size([1, 32, 1, 3])
# ---
# torch.Size([1, 64, 2, 4])
# torch.Size([1, 64, 2, 4])
# torch.Size([1, 64, 1, 2])
# ---
# torch.Size([1, 128, 2, 3])
# torch.Size([1, 128, 2, 3])
# torch.Size([1, 128, 1, 1])
# 128

In [9]:
####################
# Hyper-Parameters #
####################

learning_rate = 0.005
momentum = 0.9
n_epochs = 5
evaluate_every_n_batches = 800

criterion = nn.BCELoss() # Binary Cross Entropy
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)

In [10]:
def evaluate(model, dataloader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for data in dataloader:
            batch, labels = data
            predicted = model(batch)
            total += labels.size(0)
            correct += ((predicted > 0.5)[:, 0] == labels.byte()).cpu().sum().numpy()

    return correct * 1.0 / total

In [None]:
best_accuracy = 0
val_acc_values = []
acc_values = []
t0 = time.time()

for epoch in range(n_epochs):
    running_loss = 0.0
    running_correct_samples = 0
    for i, data in enumerate(tqdm(trainloader), 0):
        inputs, labels = data
        optimizer.zero_grad()
        
        outputs = net.forward(inputs.double())
        loss = criterion(outputs, labels.reshape((batchsize, 1)).double())
        loss.backward()
        optimizer.step()
        
        # Evaluation and learning rate decay
        running_loss += loss.item()
        running_correct_samples += torch.eq((outputs[:, 0] > 0.5).cpu(), labels.byte()).numpy().sum()
        if i % evaluate_every_n_batches == evaluate_every_n_batches - 1:
            val_acc = evaluate(net, testloader)
            # return model to training mode
            net.train()
            print('[%d, %.1f] loss: %.3f acc: %.3f val_acc: %.3f' %
                  (epoch + 1, time.time() - t0,
                   running_loss / evaluate_every_n_batches,
                   running_correct_samples * 1. / (evaluate_every_n_batches * batchsize),
                   val_acc))
            running_loss = 0.0
            running_correct_samples = 0
            
            val_acc_values.append(val_acc)
            acc_values.append((running_correct_samples * 1. / (evaluate_every_n_batches * batchsize)))
            
            # Save new model if its the best
            if val_acc > best_accuracy:
                print('Saving new best model.')
                torch.save(net.state_dict(), PATH + '/models/' + 'model-' + str(time.time()))
                best_accuracy = val_acc

In [13]:
# Load and evaluate saved model
mymodel = Net()
mymodel.load_state_dict(torch.load(PATH + "/models/2d_conv_network/model-1599325173.818381", map_location=torch.device('cpu')))
mymodel.double()
evaluate(mymodel, testloader)

0.8932926829268293