In [1]:
import torch
import json
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
from scipy.signal import resample

In [2]:
import sys
sys.path.append('../')
from raw_audio_gender_classification.config import PATH, LIBRISPEECH_SAMPLING_RATE
from raw_audio_gender_classification.data import LibriSpeechDataset, label_to_sex
from raw_audio_gender_classification.models import DilatedNet, ConvNet
import raw_audio_gender_classification.utils #import whiten

ImportError: cannot import name 'PATH' from 'config' (e:\eurecom\project\git_deposit\venv\lib\site-packages\config\__init__.py)

**TqdmDeprecationWarning**:This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`

In [3]:
def whiten(batch, rms=0.038021):
    """Centre every row of ``batch`` and rescale it towards a common loudness.

    Each row (one sample) has its own mean subtracted and is then multiplied by
    ``rms / sqrt(mean(row * row))``. The scale is measured on the row *before*
    centring, so the output RMS equals ``rms`` exactly only when the row
    already has zero mean.

    Args:
        batch: 2-D tensor with one sample per row; statistics are taken over dim 1.
        rms: target root-mean-square amplitude.

    Returns:
        A tensor with the same shape as ``batch``.
    """
    # Keep the reduced axis so the per-row statistics broadcast across each row.
    row_means = batch.mean(dim=1, keepdim=True)
    row_rms = torch.sqrt(torch.mul(batch, batch).mean(dim=1, keepdim=True))

    centred = batch - row_means
    return centred * (rms / row_rms)

### Load model

In [4]:
# Checkpoint to evaluate. The file name doubles as the model description: the
# next cell splits it on '__' and '=' to recover the model type and its
# integer hyper-parameters.
model_filename = 'max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'
model_path = PATH + '/models/' + model_filename

In [5]:
# The checkpoint file name encodes the architecture, e.g.
#   'max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'
#   -> model_type = 'max_pooling', model_params = {'n_layers': 7, 'n_filters': 64, ...}
model_filename = model_path.split('/')[-1]
model_type = model_filename.split('__')[0]
model_name = model_filename.split('.')[0]
model_params = {
    key: int(value)
    for key, value in (field.split('=') for field in model_name.split('__')[1:])
}

# Here we assume that the model was trained on the LibriSpeech dataset
model_sampling_rate = LIBRISPEECH_SAMPLING_RATE/model_params['downsampling']
# A sample count must be an integer; this is the same expression the prediction
# loop uses for the length of the resampled input.
model_num_samples = int(LIBRISPEECH_SAMPLING_RATE*model_params['n_seconds']/model_params['downsampling'])

print('Model parameters determined from filename:')
print(json.dumps(model_params, indent=4))

if model_type == 'max_pooling':
    model = ConvNet(model_params['n_filters'], model_params['n_layers'])
elif model_type == 'dilated':
    model = DilatedNet(model_params['n_filters'], model_params['n_depth'], model_params['n_stacks'])
else:
    # `raise(ValueError, '...')` would raise a tuple, which Python 3 rejects with a
    # TypeError; instantiate the exception instead.
    raise ValueError('Model type not recognised.')

# Use the GPU when there is one and fall back to the CPU otherwise. map_location
# lets a checkpoint that was saved from a GPU be loaded on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(torch.load(model_path, map_location=device))
model.double()
model.to(device)
model.eval()

Model parameters determined from filename:
{
    "n_layers": 7,
    "n_filters": 64,
    "downsampling": 1,
    "n_seconds": 3
}


NameError: name 'ConvNet' is not defined

### Generate predictions

In [6]:
# Number of raw samples in an n_seconds-long clip at the LibriSpeech sampling rate.
clip_length = LIBRISPEECH_SAMPLING_RATE * model_params['n_seconds']

# NOTE(review): the second positional argument is presumably the clip length in
# samples - confirm against LibriSpeechDataset's signature.
testset = LibriSpeechDataset(
    'dev-clean',
    clip_length,
    stochastic=False,
    cache=False,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=16,
    num_workers=4,
)

NameError: name 'LibriSpeechDataset' is not defined

In [17]:
# Run the model over every item of the test set, one item at a time, and record
# the prediction together with a few statistics of the whitened input.

# Length (in samples) of the model input after resampling; constant for the whole loop.
n_model_samples = int(LIBRISPEECH_SAMPLING_RATE*model_params['n_seconds']/model_params['downsampling'])

# Inputs must live on the same device and have the same dtype as the model
# weights, otherwise the forward pass fails with a device/dtype mismatch.
reference_param = next(model.parameters())

records = []
for i in tqdm(range(len(testset))):
    instance, label = testset[i]
    # whiten() reduces over dim 1, so add a leading batch axis first. Convert back
    # to numpy: both scipy's resample and the statistics below are numpy code.
    whitened = whiten(torch.from_numpy(instance[np.newaxis, :])).numpy()

    # New resampling
    resampled = resample(whitened, n_model_samples, axis=1)
    model_input = torch.from_numpy(resampled).reshape((1, 1, n_model_samples)).to(
        device=reference_param.device,
        dtype=reference_param.dtype,
    )

    with torch.no_grad():
        # .item() yields a plain Python float, giving a numeric 'pred' column.
        pred = model(model_input)[0][0].item()

    squared = np.square(whitened)
    records.append({
        'i': i,
        'name': testset.datasetid_to_name[i],
        'sex': label_to_sex[label],
        # Root of the *mean* square. Taking the mean after the square root
        # would give the mean absolute value instead of the RMS.
        'rms': np.sqrt(squared.mean()),
        # Median of sqrt(x^2), i.e. of the absolute amplitude (unchanged).
        'rmedians': np.median(np.sqrt(squared)),
        'mean': whitened.mean(),
        'pred': pred,
        'label': label
    })
df = pd.DataFrame(records)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=2303.0), HTML(value='')))






In [18]:
# Derive the per-clip evaluation columns from the raw predictions:
#   error   - absolute distance between the prediction and the label
#   correct - whether thresholding the prediction at 0.5 reproduces the label
# 'pred' and 'label' are also stored back as plain float / int columns.
pred_as_float = df['pred'].astype(float)
label_as_int = df['label'].astype(int)

df = df.assign(
    error=(pred_as_float - label_as_int).abs(),
    label=label_as_int,
    correct=(df['pred'] > 0.5) == df['label'],
    pred=pred_as_float,
)

In [19]:
# Per-speaker summary: mean and worst-case error, plus the average prediction and label.
speaker_stats = {'error': ['mean', 'max'], 'pred': 'mean', 'label': 'mean'}
gb = df.groupby('name').agg(speaker_stats)

# agg() produces two-level column labels such as ('error', 'mean');
# flatten them to dotted names like 'error.mean'.
gb.columns = ['.'.join(levels).strip() for levels in gb.columns.tolist()]

# Speakers with the highest average error first.
gb.sort_values(by='error.mean', ascending=False)

Unnamed: 0_level_0,error.mean,error.max,pred.mean,label.mean
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JenniferRutters,0.973986,0.999665,0.973986,0
Kathy Caver,0.68646,0.991581,0.68646,0
dexter,0.234752,0.780647,0.234752,0
President Lethe,0.144391,0.948249,0.144391,0
Jennifer Wiginton,0.111627,0.381209,0.888373,1
Nicodemus,0.108616,0.703835,0.108616,0
Stephen Kinford,0.106456,0.882708,0.106456,0
Peter Eastman,0.079562,0.785736,0.079562,0
Mark Nelson,0.056028,0.967619,0.056028,0
VOICEGUY,0.036067,0.500753,0.036067,0


In [22]:
# A speaker counts as "never misclassified" when even their worst clip has an
# error below 0.5.
n_speakers = len(gb)
n_never_wrong = len(gb[gb['error.max'] < 0.5])

summary = '{} out of {} ({}%) of speakers in the validation set are never misclassified.'
print(summary.format(n_never_wrong, n_speakers, n_never_wrong*100./n_speakers))

31 out of 40 (77.5%) of speakers in the validation set are never misclassified.
