# Gender Classification with openSLR - 1D CNN
This notebook contains my (Hassan Hajarat) attempt at training a 1D convolutional neural network on the "Open Speech & Language Resources" (openSLR) dataset in order to produce a gender classifier.<br>
The data preprocessing, data preparation and model evaluation code was taken from: https://github.com/oscarknagg/raw-audio-gender-classification

In [1]:
import scipy
import numpy as np
import pandas as pd
import json
import os
import torch
import time
from tqdm import tqdm

In [2]:
# Run the rest of the notebook from inside the repository directory: later
# cells build paths from os.getcwd() and import `utils` / `models` from there.
# The guard makes the cell safe to re-run; an unconditional relative chdir
# fails the second time because the process is already inside the directory.
REPO_DIR = "raw-audio-gender-classification"
if os.path.basename(os.getcwd()) != REPO_DIR:
    os.chdir(REPO_DIR)

In [3]:
# data.py from https://github.com/oscarknagg/raw-audio-gender-classification/blob/master/data.py
import torch.utils.data
import soundfile as sf

# Binary target encoding: male -> False, female -> True
sex_to_label = dict(M=False, F=True)
# Inverse lookup, derived from the mapping above so the two cannot drift apart
label_to_sex = {label: sex for sex, label in sex_to_label.items()}
# Base directory for every data/ and models/ path built below
PATH = os.getcwd()


class LibriSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, subsets, length, stochastic=True, cache=True):
        """
        This class subclasses the torch Dataset object. The __getitem__ function will return a raw audio sample and its
        label.

        NOTE: when the indexes are loaded from the json cache only `datasetid_to_filepath`, `datasetid_to_sex` and
        `n_files` are populated; the speaker-name and LibriSpeech-id mappings are only built when files are indexed.

        :param subsets: What LibriSpeech datasets to use (a single subset name or a list of names)
        :param length: Number of audio samples to take from each file. Any files that are not strictly longer than
        this will be ignored.
        :param stochastic: If True then we will take a random fragment from each file of sufficient length. If False we
        will always take a fragment starting at the beginning of a file.
        :param cache: If True and both cached index files exist, load the indexes from them instead of re-indexing
        the audio files. Freshly built indexes are written to the cache files regardless of this flag.
        """
        self.subset = subsets
        self.fragment_length = length
        self.stochastic = stochastic

        print('Initialising LibriSpeechDataset with length = {} and subsets = {}'.format(length, subsets))

        # Convert subset to list if it is a string
        # This allows to handle list of multiple subsets the same a single subset
        if isinstance(subsets, str):
            subsets = [subsets]

        # Cache file names embed the subset list and fragment length so that
        # different configurations never share an index
        cached_id_to_filepath_location = PATH + \
            '/data/LibriSpeech__datasetid_to_filepath__subsets={}__length={}.json'.format(subsets, length)
        cached_id_to_sex_location = PATH + \
            '/data/LibriSpeech__datasetid_to_sex__subsets={}__length={}.json'.format(subsets, length)

        # Check if we have already indexed the files
        cached_dictionaries_exist = os.path.exists(cached_id_to_filepath_location) \
            and os.path.exists(cached_id_to_sex_location)
        if cache and cached_dictionaries_exist:
            print('Cached indexes found.')
            with open(cached_id_to_filepath_location) as f:
                self.datasetid_to_filepath = json.load(f)

            with open(cached_id_to_sex_location) as f:
                self.datasetid_to_sex = json.load(f)

            # The dictionaries loaded from json have string type keys
            # Convert them back to integers
            self.datasetid_to_filepath = {int(k): v for k, v in self.datasetid_to_filepath.items()}
            self.datasetid_to_sex = {int(k): v for k, v in self.datasetid_to_sex.items()}

            assert len(self.datasetid_to_filepath) == len(self.datasetid_to_sex), 'Cached indexes are different lengths!'

            self.n_files = len(self.datasetid_to_filepath)
            print('{} usable files found.'.format(self.n_files))

            return

        df = self._read_speakers_table(PATH+'/data/LibriSpeech/SPEAKERS.TXT')
        df.columns = [col.strip().replace(';', '').lower() for col in df.columns]
        df = df.assign(
            sex=df['sex'].apply(lambda x: x.strip()),
            subset=df['subset'].apply(lambda x: x.strip()),
            name=df['name'].apply(lambda x: x.strip()),
        )

        # Get id -> sex and id -> name mappings for the speakers of the requested subsets
        selected_speakers = df[df['subset'].isin(subsets)]
        self.librispeech_id_to_sex = dict(zip(selected_speakers['id'], selected_speakers['sex']))
        self.librispeech_id_to_name = dict(zip(selected_speakers['id'], selected_speakers['name']))

        datasetid = 0
        self.n_files = 0
        self.datasetid_to_filepath = {}
        self.datasetid_to_sex = {}
        self.datasetid_to_name = {}

        for s in subsets:
            print('Indexing {}...'.format(s))
            subset_dir = PATH+'/data/LibriSpeech/{}/'.format(s)

            # Quick first pass to find total for tqdm bar
            subset_len = sum(
                1 for _, _, files in os.walk(subset_dir) for f in files if f.endswith('.flac'))

            # The context manager closes the bar even if indexing raises
            with tqdm(total=subset_len) as progress_bar:
                for root, _, files in os.walk(subset_dir):
                    # Skip non-sound files; directories without any recording are
                    # skipped entirely so their names are never parsed as speaker ids
                    flac_files = [f for f in files if f.endswith('.flac')]
                    if not flac_files:
                        continue

                    librispeech_id = self._speaker_id_from_dir(root)

                    for f in flac_files:
                        progress_bar.update(1)

                        # Skip short files
                        filepath = os.path.join(root, f)
                        instance, _ = sf.read(filepath)
                        if len(instance) <= self.fragment_length:
                            continue

                        self.datasetid_to_filepath[datasetid] = os.path.abspath(filepath)
                        self.datasetid_to_sex[datasetid] = self.librispeech_id_to_sex[librispeech_id]
                        self.datasetid_to_name[datasetid] = self.librispeech_id_to_name[librispeech_id]
                        datasetid += 1
                        self.n_files += 1

        print('Finished indexing data. {} usable files found.'.format(self.n_files))

        # Save relevant dictionaries to json in order to re-use them later
        # The indexing takes a few minutes each time and would be nice to just perform this calculation once
        with open(cached_id_to_filepath_location, 'w') as f:
            json.dump(self.datasetid_to_filepath, f)

        with open(cached_id_to_sex_location, 'w') as f:
            json.dump(self.datasetid_to_sex, f)

    @staticmethod
    def _read_speakers_table(path):
        """Read the pipe-delimited SPEAKERS.TXT table, skipping (with a warning) malformed rows."""
        read_kwargs = dict(skiprows=11, delimiter='|')
        try:
            # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines` replaces it
            return pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
        except TypeError:
            # Older pandas releases do not accept `on_bad_lines`
            return pd.read_csv(path, error_bad_lines=False, **read_kwargs)

    @staticmethod
    def _speaker_id_from_dir(chapter_dir):
        """Return the integer speaker id, i.e. the name of the parent directory of `chapter_dir`."""
        # os.path handles the platform's separator, unlike splitting on '/'
        return int(os.path.basename(os.path.dirname(chapter_dir)))

    def __getitem__(self, index):
        """
        Read the file registered under `index` and return a fragment of `fragment_length` samples with its label.
        :param index: Integer dataset id (a key of `datasetid_to_filepath`)
        :return: Tuple of (audio fragment, label) where the label is looked up in `sex_to_label`
        """
        instance, _ = sf.read(self.datasetid_to_filepath[index])
        # Choose a random sample of the file
        if self.stochastic:
            # Indexed files are strictly longer than fragment_length, so the upper bound is at least 1
            fragment_start_index = np.random.randint(0, len(instance)-self.fragment_length)
        else:
            fragment_start_index = 0
        instance = instance[fragment_start_index:fragment_start_index+self.fragment_length]
        sex = self.datasetid_to_sex[index]
        return instance, sex_to_label[sex]

    def __len__(self):
        """Return the number of usable (sufficiently long) files."""
        return self.n_files


In [4]:
from torch.utils.data import DataLoader
# Samples per second of the recordings before any downsampling
LIBRISPEECH_SAMPLING_RATE = 16_000

In [5]:
####################
# Hyper-Parameters #
####################

# --- Data ---
training_set = ['train-clean-100']
validation_set = 'dev-clean'
n_seconds = 3  # Length (in seconds) of the fragment taken from each recording
downsampling = 4  # Keep every 4th sample's worth of rate: 4000 points per second instead of 16000
batchsize = 8

# --- Optimisation ---
learning_rate = 0.005
momentum = 0.9
n_epochs = 7  # More than enough
reduce_lr_patience = 32  # Patience handed to the plateau scheduler (stepped once per validation run)

# --- Monitoring ---
evaluate_every_n_batches = 800  # Validate after this many training batches

In [6]:
##########################################
# Create datasets/ Access cached indexes #
##########################################

# Both datasets use fragments of the same number of raw (not yet downsampled) samples
fragment_length = int(LIBRISPEECH_SAMPLING_RATE * n_seconds)

# Training: random fragment per file (the dataset's default stochastic=True), shuffled batches
trainset = LibriSpeechDataset(subsets=training_set, length=fragment_length)
trainloader = DataLoader(dataset=trainset, batch_size=batchsize, shuffle=True, drop_last=True)

# Validation: always the start of each file, so repeated evaluations see the same audio
testset = LibriSpeechDataset(subsets=validation_set, length=fragment_length, stochastic=False)
# drop_last keeps every batch at exactly `batchsize` items
testloader = DataLoader(dataset=testset, batch_size=batchsize, drop_last=True)

Initialising LibriSpeechDataset with length = 48000 and subsets = ['train-clean-100']
Cached indexes found.
27949 usable files found.
Initialising LibriSpeechDataset with length = 48000 and subsets = dev-clean
Cached indexes found.
2303 usable files found.


In [7]:
import torch.nn as nn
import torch.nn.functional as F

In [8]:
class Net(nn.Module):
    """1D CNN mapping a single-channel signal to one sigmoid output per item.

    Layout: conv1 -> bn1 (no activation, no pooling), then four
    conv -> ReLU -> batch-norm stages each followed by a max-pool that divides
    the length by 3, then a final conv -> ReLU pooled over the whole remaining
    length, and a linear layer with a sigmoid.
    """

    def __init__(self):
        super().__init__()
        n_filters = 64
        in_channels = 1
        # Registers conv1/bn1 ... conv5/bn5 in that order, so the state-dict
        # keys match checkpoints saved from this architecture
        for idx in range(1, 6):
            setattr(self, 'conv{}'.format(idx),
                    nn.Conv1d(in_channels, n_filters, 3, dilation=1, padding=1))
            setattr(self, 'bn{}'.format(idx), nn.BatchNorm1d(n_filters))
            in_channels = n_filters

        self.finalconv = nn.Conv1d(n_filters, n_filters, 3, dilation=1, padding=1)

        self.output = nn.Linear(n_filters, 1)  # True or false value

    def forward(self, x):
        """Return a (N, 1) tensor of sigmoid outputs for input of shape (N, 1, length)."""
        x = self.bn1(self.conv1(x))
        for idx in range(2, 6):
            conv = getattr(self, 'conv{}'.format(idx))
            bn = getattr(self, 'bn{}'.format(idx))
            x = F.max_pool1d(bn(F.relu(conv(x))), kernel_size=3, stride=3)
        # Pool over everything that is left so each filter yields a single value
        features = F.relu(self.finalconv(x))
        pooled = F.max_pool1d(features, kernel_size=features.size(2))
        flat = pooled.view(-1, 64)
        return torch.sigmoid(self.output(flat))

net = Net()
net.double()

Net(
  (conv1): Conv1d(1, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (finalconv): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (output): Linear(in_features=64, out_features=1, bias=True)
)

In [9]:
#############################
# Define loss and optimiser #
#############################
import torch.optim as optim
# Binary cross entropy on the network's sigmoid output
criterion = nn.BCELoss()

# Plain SGD with momentum over all network parameters
optimizer = optim.SGD(params=net.parameters(), lr=learning_rate, momentum=momentum)

# Lowers the learning rate once the monitored metric ('max' mode: higher is
# better) has stopped improving for `reduce_lr_patience` consecutive steps
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=reduce_lr_patience)

In [10]:
from utils import whiten, evaluate
from scipy.signal import resample

In [11]:
# Processing method for each batch when needed
def preprocessor(batch):
    """
    Whiten a batch of raw audio and resample it to the current downsampled length.

    The target length is computed from the module-level `LIBRISPEECH_SAMPLING_RATE`, `n_seconds` and `downsampling`
    at call time, so rebinding `downsampling` changes the output length of later calls.

    :param batch: Batch of raw audio with one row per item (time along axis 1)
    :return: Tensor of shape (number of items, 1, target length)
    """
    n_samples = int(LIBRISPEECH_SAMPLING_RATE * n_seconds / downsampling)
    batch = whiten(batch)
    resampled = resample(batch, n_samples, axis=1)
    # -1 infers the batch dimension instead of assuming the global `batchsize`,
    # so a smaller (e.g. final, partial) batch is handled too
    return torch.from_numpy(resampled).reshape((-1, 1, n_samples))

In [None]:
best_accuracy = 0
val_acc_values = []  # Validation accuracy at each evaluation point
acc_values = []  # Training accuracy over the batches preceding each evaluation point
t0 = time.time()

for epoch in range(n_epochs):
    running_loss = 0.0
    running_correct_samples = 0
    for i, data in enumerate(tqdm(trainloader, position=0), 0):
        inputs, labels = data
        # Same whitening + resampling as used for validation, cast to match the double-precision network
        inputs = preprocessor(inputs).double()
        # Zero the parameter gradients
        optimizer.zero_grad()
        # Forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels.reshape((-1, 1)).double())
        loss.backward()
        optimizer.step()

        # Evaluation and learning rate decay
        running_loss += loss.item()
        running_correct_samples += torch.eq((outputs[:, 0] > 0.5).cpu(), labels.byte()).numpy().sum()
        if i % evaluate_every_n_batches == evaluate_every_n_batches - 1:
            val_acc = evaluate(net, testloader, preprocessor)
            # return model to training mode
            net.train()

            # Compute the training accuracy BEFORE the running counters are reset;
            # otherwise the recorded history would always be zero
            train_acc = running_correct_samples * 1. / (evaluate_every_n_batches * batchsize)
            print('[%d, %.1f] loss: %.3f acc: %.3f val_acc: %.3f' %
                  (epoch + 1, time.time() - t0,
                   running_loss / evaluate_every_n_batches,
                   train_acc,
                   val_acc))

            val_acc_values.append(val_acc)
            acc_values.append(train_acc)

            running_loss = 0.0
            running_correct_samples = 0

            # Save new model if its the best
            if val_acc > best_accuracy:
                print('Saving new best model.')
                torch.save(net.state_dict(), PATH + '/models/' + 'model-' + str(time.time()))
                best_accuracy = val_acc

            # Check for plateau (reduce lr if so)
            scheduler.step(val_acc)


print('\nFinished Training')
print('Best validation accuracy was {:.3f}'.format(best_accuracy))

In [None]:
# Persist the current weights under a timestamped file name in models/
final_model_path = '{}/models/model-{}'.format(PATH, time.time())
torch.save(net.state_dict(), final_model_path)

In [12]:
# Evaluate my good model
# `preprocessor` reads this global when it is called
downsampling = 4

mymodel = Net()
# Load the saved weights onto the CPU regardless of where they were saved from
mymodel_weights = torch.load(
    "models/Second_conv_network/model-1599233399.234077",
    map_location=torch.device('cpu'),
)
mymodel.load_state_dict(mymodel_weights)
mymodel.double()
evaluate(mymodel, testloader, preprocessor)


  0%|          | 0/287 [00:00<?, ?it/s][A
  0%|          | 1/287 [00:00<02:19,  2.05it/s][A
  1%|          | 2/287 [00:00<02:11,  2.16it/s][A
  1%|          | 3/287 [00:01<02:08,  2.22it/s][A
  1%|▏         | 4/287 [00:01<02:15,  2.09it/s][A
  2%|▏         | 5/287 [00:02<02:22,  1.97it/s][A
  2%|▏         | 6/287 [00:02<02:14,  2.09it/s][A
  2%|▏         | 7/287 [00:03<02:08,  2.19it/s][A
  3%|▎         | 8/287 [00:03<02:03,  2.26it/s][A
  3%|▎         | 9/287 [00:04<02:02,  2.27it/s][A
  3%|▎         | 10/287 [00:04<02:00,  2.29it/s][A
  4%|▍         | 11/287 [00:04<01:58,  2.32it/s][A
  4%|▍         | 12/287 [00:05<01:56,  2.36it/s][A
  5%|▍         | 13/287 [00:05<01:53,  2.42it/s][A
  5%|▍         | 14/287 [00:06<01:52,  2.42it/s][A
  5%|▌         | 15/287 [00:06<01:51,  2.43it/s][A
  6%|▌         | 16/287 [00:07<01:59,  2.26it/s][A
  6%|▌         | 17/287 [00:07<02:07,  2.12it/s][A
  6%|▋         | 18/287 [00:08<02:03,  2.17it/s][A
  7%|▋         | 19/287 [00:0

 54%|█████▍    | 156/287 [01:06<00:56,  2.32it/s][A
 55%|█████▍    | 157/287 [01:07<00:57,  2.28it/s][A
 55%|█████▌    | 158/287 [01:07<00:55,  2.33it/s][A
 55%|█████▌    | 159/287 [01:07<00:54,  2.36it/s][A
 56%|█████▌    | 160/287 [01:08<00:53,  2.39it/s][A
 56%|█████▌    | 161/287 [01:08<00:52,  2.40it/s][A
 56%|█████▋    | 162/287 [01:09<00:51,  2.43it/s][A
 57%|█████▋    | 163/287 [01:09<00:51,  2.42it/s][A
 57%|█████▋    | 164/287 [01:09<00:50,  2.41it/s][A
 57%|█████▋    | 165/287 [01:10<00:50,  2.39it/s][A
 58%|█████▊    | 166/287 [01:10<00:50,  2.39it/s][A
 58%|█████▊    | 167/287 [01:11<00:50,  2.38it/s][A
 59%|█████▊    | 168/287 [01:11<00:49,  2.40it/s][A
 59%|█████▉    | 169/287 [01:12<00:49,  2.40it/s][A
 59%|█████▉    | 170/287 [01:12<00:48,  2.41it/s][A
 60%|█████▉    | 171/287 [01:12<00:48,  2.42it/s][A
 60%|█████▉    | 172/287 [01:13<00:47,  2.41it/s][A
 60%|██████    | 173/287 [01:13<00:47,  2.39it/s][A
 61%|██████    | 174/287 [01:14<00:47,  2.39it

0.9786585365853658

In [None]:
# Old unsuccessful network (the author notes the kernel sizes were wrong):
# every pooling layer here pools over the full current length
class FirstNet(nn.Module):
    """Earlier 1D CNN whose pooling layers each collapse the whole time axis."""

    def __init__(self):
        super().__init__()
        n_filters = 64
        in_channels = 1
        # Registers conv1/bn1 ... conv5/bn5 in that order, keeping the
        # state-dict keys compatible with checkpoints of this architecture
        for idx in range(1, 6):
            setattr(self, 'conv{}'.format(idx),
                    nn.Conv1d(in_channels, n_filters, 3, dilation=1, padding=1))
            setattr(self, 'bn{}'.format(idx), nn.BatchNorm1d(n_filters))
            in_channels = n_filters

        self.output = nn.Linear(n_filters, 1)  # True or false value

    def forward(self, x):
        """Return a (N, 1) tensor of sigmoid outputs for input of shape (N, 1, length)."""
        x = self.bn1(self.conv1(x))
        for idx in range(2, 6):
            conv = getattr(self, 'conv{}'.format(idx))
            bn = getattr(self, 'bn{}'.format(idx))
            activated = bn(F.relu(conv(x)))
            # The kernel spans the whole length: the first pass reduces the
            # signal to a single step, so in later passes the kernel size is 1
            x = F.max_pool1d(activated, kernel_size=activated.size(2))
        flat = x.view(-1, 64)
        return torch.sigmoid(self.output(flat))

# `preprocessor` reads this global when it is called
downsampling = 1
myoldmodel = FirstNet()
# Load the saved weights onto the CPU regardless of where they were saved from
myoldmodel_weights = torch.load(
    "models/First_conv_network/model-1599203156.34351",
    map_location=torch.device('cpu'),
)
myoldmodel.load_state_dict(myoldmodel_weights)
myoldmodel.double()

In [13]:
# Compare to model created by original user
from models import ConvNet
# `preprocessor` reads this global when it is called; the checkpoint's file
# name states downsampling=1
downsampling = 1
competitivemodel = ConvNet(64, 7)
# Load the saved weights onto the CPU regardless of where they were saved from
competitivemodel_weights = torch.load(
    "models/original/max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch",
    map_location=torch.device('cpu'),
)
competitivemodel.load_state_dict(competitivemodel_weights)
competitivemodel.double()
evaluate(competitivemodel, testloader, preprocessor)



  0%|          | 1/287 [00:01<09:09,  1.92s/it][A
  1%|          | 2/287 [00:03<08:46,  1.85s/it][A
  1%|          | 3/287 [00:05<08:30,  1.80s/it][A
  1%|▏         | 4/287 [00:06<08:18,  1.76s/it][A
  2%|▏         | 5/287 [00:08<08:09,  1.73s/it][A
  2%|▏         | 6/287 [00:10<08:04,  1.72s/it][A
  2%|▏         | 7/287 [00:11<07:56,  1.70s/it][A
  3%|▎         | 8/287 [00:13<07:52,  1.69s/it][A
  3%|▎         | 9/287 [00:15<07:49,  1.69s/it][A
  3%|▎         | 10/287 [00:16<07:45,  1.68s/it][A
  4%|▍         | 11/287 [00:18<07:43,  1.68s/it][A
  4%|▍         | 12/287 [00:20<07:39,  1.67s/it][A
  5%|▍         | 13/287 [00:21<07:36,  1.67s/it][A
  5%|▍         | 14/287 [00:23<07:33,  1.66s/it][A
  5%|▌         | 15/287 [00:25<07:32,  1.66s/it][A
  6%|▌         | 16/287 [00:26<07:30,  1.66s/it][A
  6%|▌         | 17/287 [00:28<07:28,  1.66s/it][A
  6%|▋         | 18/287 [00:30<07:26,  1.66s/it][A
  7%|▋         | 19/287 [00:31<07:24,  1.66s/it][A
  7%|▋         | 20

 53%|█████▎    | 151/287 [04:14<03:51,  1.70s/it][A
 53%|█████▎    | 152/287 [04:15<03:51,  1.71s/it][A
 53%|█████▎    | 153/287 [04:17<03:47,  1.70s/it][A
 54%|█████▎    | 154/287 [04:19<03:44,  1.69s/it][A
 54%|█████▍    | 155/287 [04:20<03:41,  1.68s/it][A
 54%|█████▍    | 156/287 [04:22<03:40,  1.68s/it][A
 55%|█████▍    | 157/287 [04:24<03:38,  1.68s/it][A
 55%|█████▌    | 158/287 [04:25<03:36,  1.68s/it][A
 55%|█████▌    | 159/287 [04:27<03:34,  1.68s/it][A
 56%|█████▌    | 160/287 [04:29<03:34,  1.69s/it][A
 56%|█████▌    | 161/287 [04:31<03:33,  1.70s/it][A
 56%|█████▋    | 162/287 [04:32<03:31,  1.69s/it][A
 57%|█████▋    | 163/287 [04:34<03:30,  1.69s/it][A
 57%|█████▋    | 164/287 [04:36<03:27,  1.69s/it][A
 57%|█████▋    | 165/287 [04:37<03:25,  1.69s/it][A
 58%|█████▊    | 166/287 [04:39<03:23,  1.68s/it][A
 58%|█████▊    | 167/287 [04:41<03:21,  1.68s/it][A
 59%|█████▊    | 168/287 [04:42<03:19,  1.68s/it][A
 59%|█████▉    | 169/287 [04:44<03:17,  1.67s/

0.9878048780487805