## Basic imports

In [1]:
import sys 
import os
import numpy as np 
import matplotlib.pyplot as plt
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Record the interpreter and framework versions next to the results.
print("Python: %s" % (sys.version,))
print("Pytorch: %s" % (torch.__version__,))

# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
#audioviz
import librosa as libr
import librosa.display as display
import IPython.display

import pandas as pd

Python: 3.6.5 (default, Jul  6 2018, 19:12:46) 
[GCC 5.4.0 20160609]
Pytorch: 0.4.0


## Hyperparameters

In [2]:
# Configuration shared by the cells below.
n_seconds = 3                     # clip length handed to the dataset, in seconds
n_epochs = 50                     # number of training epochs
sampling_rate = 16000             # audio sampling rate (Hz)
number_of_mels = 128              # mel bands used by the MFCC transform
all_data = ['train-clean-100']    # LibriSpeech subsets to index
lr = 0.001                        # learning rate handed to the optimizer

## Speech preprocessing
Building the `tensorToMFCC` transformation for learning

In [3]:
class tensorToMFCC:
    """Callable transform: raw waveform -> MFCC features as a float tensor.

    The waveform is converted to a mel spectrogram, then to decibels, and
    finally to MFCCs using librosa's default number of coefficients.

    Args:
        sr: Sampling rate of the incoming audio, in Hz.
        n_mels: Number of mel bands. When None, the notebook-level
            `number_of_mels` hyperparameter is read at call time.
        fmax: Highest frequency (Hz) covered by the mel filter bank.
    """
    def __init__(self, sr=16000, n_mels=None, fmax=8000):
        self.sr = sr
        self.n_mels = n_mels
        self.fmax = fmax

    def __call__(self, y):
        """Return the MFCC matrix of `y` as a `torch.FloatTensor`.

        Assumes `y` is a NumPy-compatible array of shape (1, n_samples)
        -- TODO confirm against LibriSpeechDataset.
        """
        dims = y.shape
        # Drop the leading axis: librosa expects a 1-D signal here.
        signal = np.reshape(y, (dims[1],))
        n_mels = number_of_mels if self.n_mels is None else self.n_mels
        # Keyword arguments on purpose: newer librosa releases reject a
        # positional waveform / sample rate.
        mel = libr.feature.melspectrogram(y=signal, sr=self.sr, n_mels=n_mels,
                                          fmax=self.fmax)
        mfcc = libr.feature.mfcc(S=libr.power_to_db(mel))
        return torch.from_numpy(mfcc).float()

In [4]:
# Single transform instance; passed as `transform=` to every LibriSpeechDataset built below.
transform  = tensorToMFCC()

## LibriSpeechDataSet
Load personalized data set, inspired by this [repository](https://github.com/oscarknagg/voicemap/tree/pytorch-python-3.6)

In [5]:
%load_ext autoreload
%autoreload 2
sys.path.insert(0, './../../Utils')
from datasets import LibriSpeechDataset
from datasets import Libri_preload_and_split

In [6]:
path = 'data/'

splits = [0.8, 0.2] #input fraction of data you want partitioned
attacking = True

# Compare with a tolerance: fractions such as [0.7, 0.2, 0.1] do not sum to
# exactly 1.0 in floating point. Stop here instead of splitting with
# fractions that are known to be wrong.
if not np.isclose(sum(splits), 1.0):
    raise ValueError('error: splits do not sum to 1.')

#Splits data into 2 sets of speakers for target & shadow network, into above defined train:test splits
dfs = Libri_preload_and_split(path, all_data, n_seconds, pad=False, cache=True,
                              splits=splits, attacking=attacking)


def _build_dataset(df):
    """Wrap one split's dataframe in a deterministic LibriSpeechDataset."""
    return LibriSpeechDataset(path, df=df, seconds=n_seconds, downsampling=1,
                              transform=transform, stochastic=False)


#target train & test
valid_sequence_train = _build_dataset(dfs[0])
valid_sequence_test = _build_dataset(dfs[1])

#shadow train & test
valid_sequence_train_shadow = _build_dataset(dfs[2])
valid_sequence_test_shadow = _build_dataset(dfs[3])

Initialising LibriSpeechDataset with minimum length = 3s and subsets = ['train-clean-100']
Finished indexing data. 27949 usable files found.
Finished splitting data.


In [7]:
# Loaders for data for target model & shadow model
def _shuffled_loader(dataset):
    """Return a shuffling DataLoader over `dataset`: batches of 32, 8 workers.

    `pin_memory` stays at its default; the original cell carried a
    commented-out `pin_memory=True` marked "CUDA only".
    """
    return DataLoader(dataset, batch_size=32, shuffle=True, num_workers=8)


train_loader = _shuffled_loader(valid_sequence_train)
train_loader_shadow = _shuffled_loader(valid_sequence_train_shadow)
test_loader = _shuffled_loader(valid_sequence_test)
test_loader_shadow = _shuffled_loader(valid_sequence_test_shadow)


In [8]:
# Pull one batch for shape checks. Use the built-in next(): the `.next()`
# method is the Python 2 iterator protocol and is not available on DataLoader
# iterators in newer PyTorch releases.
recording, speaker = next(iter(train_loader))

In [9]:
# Sanity check: shape of one batch of transformed recordings, then the
# speaker count reported by the training dataset.
print(recording.shape)
print(valid_sequence_train.num_speakers)

torch.Size([32, 20, 94])
11119


## Cyphercat utilities

In [10]:
sys.path.insert(0,'../../Utils/')
from train import *
from metrics import * 
import models
from data_downloaders import * 

cuda:0


## Models

In [11]:
class ConvBlock(nn.Module):
    """Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4) feature block.

    The convolution uses padding=1, so with kernel_size=3 the sequence length
    is kept before the pooling step divides it by 4 (rounding down).

    Args:
        n_input: Number of input channels.
        n_out: Number of output channels.
        kernel_size: Width of the 1-D convolution kernel.
    """
    def __init__(self, n_input, n_out, kernel_size):
        super(ConvBlock, self).__init__()
        self.cnn_block = nn.Sequential(
            nn.Conv1d(n_input, n_out, kernel_size, padding=1),
            nn.BatchNorm1d(n_out),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4, stride=4)
        )

    def forward(self, x):
        """Apply the block to `x` of shape (batch, n_input, length)."""
        return self.cnn_block(x)


class CNN_classifier(nn.Module):
    """Three stacked ConvBlocks followed by a two-layer classification head.

    Channel width doubles in every block (in_size -> 2x -> 4x -> 8x). The
    remaining time axis is collapsed with a global max pool, so the head
    always receives 8*in_size features regardless of the clip length.

    Args:
        in_size: Number of input feature channels.
        n_hidden: Width of the hidden fully connected layer.
        n_classes: Number of output classes (logits).
    """
    def __init__(self, in_size, n_hidden, n_classes):
        super(CNN_classifier, self).__init__()
        self.down_path = nn.ModuleList()
        self.down_path.append(ConvBlock(in_size, 2*in_size, 3))
        self.down_path.append(ConvBlock(2*in_size, 4*in_size, 3))
        self.down_path.append(ConvBlock(4*in_size, 8*in_size, 3))
        self.fc = nn.Sequential(
            nn.Linear(8*in_size, n_hidden),
            nn.ReLU()
        )
        self.out = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes)."""
        for down in self.down_path:
            x = down(x)
        # Global max pool over the time axis. It is an identity when the axis
        # already has length 1 (the case the Linear layer was sized for) and
        # keeps the flattened size at 8*in_size for longer inputs. The
        # functional form adds no module, so state_dict keys stay the same.
        x = nn.functional.adaptive_max_pool1d(x, 1)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return self.out(x)
        

In [12]:
# Smoke test: push the sample batch through a single ConvBlock and print the
# resulting shape.
test = ConvBlock(20, 40, 3)
aa = test(recording)
print(aa.shape)

torch.Size([32, 40, 23])


In [13]:
# in_size=20 matches the feature axis of the sample batch printed earlier
# (torch.Size([32, 20, 94])); n_hidden=512.
# NOTE(review): n_classes=251 is presumably the number of speakers in the
# loaded subset -- confirm against the dataset.
classifier = CNN_classifier(20, 512, 251)
# Run the weights_init function from the local `models` module over the model.
classifier.apply(models.weights_init)
# Move the model to the selected device; being the last expression, its repr is displayed.
classifier.to(device)

CNN_classifier(
  (down_path): ModuleList(
    (0): ConvBlock(
      (cnn_block): Sequential(
        (0): Conv1d(20, 40, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      )
    )
    (1): ConvBlock(
      (cnn_block): Sequential(
        (0): Conv1d(40, 80, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      )
    )
    (2): ConvBlock(
      (cnn_block): Sequential(
        (0): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): MaxPool1d(kernel_size=4, st

In [14]:
# Smoke test: forward the sample batch through the full classifier and print
# the shape of its output.
test = classifier(recording.to(device))
print(test.shape)

torch.Size([32, 251])


In [15]:
# Cross-entropy objective, minimised with Adam at the learning rate `lr`.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=classifier.parameters(), lr=lr)

In [None]:
# Take the epoch count from the `n_epochs` hyperparameter (50) instead of
# repeating the literal, so changing the hyperparameter changes this run.
train(classifier, train_loader, test_loader, optimizer, criterion, n_epochs, verbose = True)

[0/50][0/348] loss = 5.586255
[0/50][1/348] loss = 5.478024
[0/50][2/348] loss = 5.595171
[0/50][3/348] loss = 5.578467
[0/50][4/348] loss = 5.228079
[0/50][5/348] loss = 5.350735
[0/50][6/348] loss = 5.235791
[0/50][7/348] loss = 5.285726
[0/50][8/348] loss = 5.284833
[0/50][9/348] loss = 4.950645
[0/50][10/348] loss = 5.233456
[0/50][11/348] loss = 5.076006
[0/50][12/348] loss = 5.140486
[0/50][13/348] loss = 5.047959
[0/50][14/348] loss = 5.209031
[0/50][15/348] loss = 5.046748
[0/50][16/348] loss = 4.765276
[0/50][17/348] loss = 4.769247
[0/50][18/348] loss = 4.976702
[0/50][19/348] loss = 4.817549
[0/50][20/348] loss = 4.690091
[0/50][21/348] loss = 4.933263
[0/50][22/348] loss = 4.714959
[0/50][23/348] loss = 4.430823
[0/50][24/348] loss = 4.860175
[0/50][25/348] loss = 4.775699
[0/50][26/348] loss = 4.612775
[0/50][27/348] loss = 4.510980
[0/50][28/348] loss = 4.345355
[0/50][29/348] loss = 4.629667
[0/50][30/348] loss = 4.524089
[0/50][31/348] loss = 4.336950
[0/50][32/348] los

[0/50][262/348] loss = 1.120486
[0/50][263/348] loss = 0.738738
[0/50][264/348] loss = 0.941837
[0/50][265/348] loss = 0.966072
[0/50][266/348] loss = 1.086259
[0/50][267/348] loss = 1.405377
[0/50][268/348] loss = 0.863021
[0/50][269/348] loss = 1.051419
[0/50][270/348] loss = 0.687109
[0/50][271/348] loss = 1.040060
[0/50][272/348] loss = 0.728692
[0/50][273/348] loss = 0.778808
[0/50][274/348] loss = 1.083838
[0/50][275/348] loss = 0.930679
[0/50][276/348] loss = 1.123107
[0/50][277/348] loss = 1.136647
[0/50][278/348] loss = 0.811882
[0/50][279/348] loss = 0.885093
[0/50][280/348] loss = 0.934221
[0/50][281/348] loss = 0.772202
[0/50][282/348] loss = 1.088836
[0/50][283/348] loss = 0.713348
[0/50][284/348] loss = 1.146482
[0/50][285/348] loss = 0.903576
[0/50][286/348] loss = 0.907262
[0/50][287/348] loss = 0.818058
[0/50][288/348] loss = 0.596741
[0/50][289/348] loss = 0.736157
[0/50][290/348] loss = 0.762501
[0/50][291/348] loss = 0.962372
[0/50][292/348] loss = 0.717012
[0/50][2

[1/50][176/348] loss = 0.734603
[1/50][177/348] loss = 0.472056
[1/50][178/348] loss = 0.154414
[1/50][179/348] loss = 0.386338
[1/50][180/348] loss = 0.330879
[1/50][181/348] loss = 0.605994
[1/50][182/348] loss = 0.347330
[1/50][183/348] loss = 0.334586
[1/50][184/348] loss = 0.513511
[1/50][185/348] loss = 0.994189
[1/50][186/348] loss = 0.444179
[1/50][187/348] loss = 0.769594
[1/50][188/348] loss = 0.536125
[1/50][189/348] loss = 0.473065
[1/50][190/348] loss = 0.218396
[1/50][191/348] loss = 0.358592
[1/50][192/348] loss = 0.229709
[1/50][193/348] loss = 0.490615
[1/50][194/348] loss = 0.519312
[1/50][195/348] loss = 0.358689
[1/50][196/348] loss = 0.219078
[1/50][197/348] loss = 0.435008
[1/50][198/348] loss = 0.376231
[1/50][199/348] loss = 0.738716
[1/50][200/348] loss = 0.409248
[1/50][201/348] loss = 0.649930
[1/50][202/348] loss = 0.159860
[1/50][203/348] loss = 0.546454
[1/50][204/348] loss = 0.487226
[1/50][205/348] loss = 0.241960
[1/50][206/348] loss = 0.491554
[1/50][2

[2/50][88/348] loss = 0.143684
[2/50][89/348] loss = 0.123254
[2/50][90/348] loss = 0.131277
[2/50][91/348] loss = 0.562762
[2/50][92/348] loss = 0.259545
[2/50][93/348] loss = 0.288579
[2/50][94/348] loss = 0.254491
[2/50][95/348] loss = 0.169820
[2/50][96/348] loss = 0.270001
[2/50][97/348] loss = 0.210294
[2/50][98/348] loss = 0.300122
[2/50][99/348] loss = 0.382144
[2/50][100/348] loss = 0.241360
[2/50][101/348] loss = 0.132424
[2/50][102/348] loss = 0.233596
[2/50][103/348] loss = 0.236835
[2/50][104/348] loss = 0.276956
[2/50][105/348] loss = 0.167178
[2/50][106/348] loss = 0.287888
[2/50][107/348] loss = 0.231881
[2/50][108/348] loss = 0.385026
[2/50][109/348] loss = 0.109140
[2/50][110/348] loss = 0.422172
[2/50][111/348] loss = 0.497567
[2/50][112/348] loss = 0.247833
[2/50][113/348] loss = 0.374779
[2/50][114/348] loss = 0.370563
[2/50][115/348] loss = 0.374974
[2/50][116/348] loss = 0.196394
[2/50][117/348] loss = 0.206517
[2/50][118/348] loss = 0.204667
[2/50][119/348] loss

[2/50][346/348] loss = 0.385855
[2/50][347/348] loss = 0.161369
[2/50]
Training:

Accuracy = 95.72 %


Test:

Accuracy = 83.50 %


[3/50][0/348] loss = 0.159700
[3/50][1/348] loss = 0.203337
[3/50][2/348] loss = 0.097027
[3/50][3/348] loss = 0.320974
[3/50][4/348] loss = 0.248194
[3/50][5/348] loss = 0.143427
[3/50][6/348] loss = 0.070934
[3/50][7/348] loss = 0.175883
[3/50][8/348] loss = 0.176306
[3/50][9/348] loss = 0.253252
[3/50][10/348] loss = 0.298453
[3/50][11/348] loss = 0.319702
[3/50][12/348] loss = 0.298966
[3/50][13/348] loss = 0.208458
[3/50][14/348] loss = 0.305464
[3/50][15/348] loss = 0.120906
[3/50][16/348] loss = 0.287404
[3/50][17/348] loss = 0.173400
[3/50][18/348] loss = 0.146926
[3/50][19/348] loss = 0.252341
[3/50][20/348] loss = 0.229045
[3/50][21/348] loss = 0.070559
[3/50][22/348] loss = 0.075126
[3/50][23/348] loss = 0.188714
[3/50][24/348] loss = 0.217723
[3/50][25/348] loss = 0.228501
[3/50][26/348] loss = 0.151244
[3/50][27/348] loss = 0.343323
[3/50][28/3

[3/50][256/348] loss = 0.122270
[3/50][257/348] loss = 0.156222
[3/50][258/348] loss = 0.171060
[3/50][259/348] loss = 0.421910
[3/50][260/348] loss = 0.196088
[3/50][261/348] loss = 0.321036
[3/50][262/348] loss = 0.119613
[3/50][263/348] loss = 0.159725
[3/50][264/348] loss = 0.363452
[3/50][265/348] loss = 0.125471
[3/50][266/348] loss = 0.040421
[3/50][267/348] loss = 0.125994
[3/50][268/348] loss = 0.167488
[3/50][269/348] loss = 0.314458
[3/50][270/348] loss = 0.180180
[3/50][271/348] loss = 0.132160
[3/50][272/348] loss = 0.035607
[3/50][273/348] loss = 0.214710
[3/50][274/348] loss = 0.066033
[3/50][275/348] loss = 0.122631
[3/50][276/348] loss = 0.483664
[3/50][277/348] loss = 0.173421
[3/50][278/348] loss = 0.121838
[3/50][279/348] loss = 0.069104
[3/50][280/348] loss = 0.229877
[3/50][281/348] loss = 0.263160
[3/50][282/348] loss = 0.360480
[3/50][283/348] loss = 0.270337
[3/50][284/348] loss = 0.133991
[3/50][285/348] loss = 0.318469
[3/50][286/348] loss = 0.144661
[3/50][2

[4/50][168/348] loss = 0.162337
[4/50][169/348] loss = 0.020798
[4/50][170/348] loss = 0.074372
[4/50][171/348] loss = 0.039806
[4/50][172/348] loss = 0.047194
[4/50][173/348] loss = 0.049869
[4/50][174/348] loss = 0.038764
[4/50][175/348] loss = 0.044904
[4/50][176/348] loss = 0.151385
[4/50][177/348] loss = 0.076548
[4/50][178/348] loss = 0.182602
[4/50][179/348] loss = 0.029093
[4/50][180/348] loss = 0.241600
[4/50][181/348] loss = 0.214104
[4/50][182/348] loss = 0.025405
[4/50][183/348] loss = 0.021319
[4/50][184/348] loss = 0.091478
[4/50][185/348] loss = 0.156092
[4/50][186/348] loss = 0.106816
[4/50][187/348] loss = 0.118004
[4/50][188/348] loss = 0.020913
[4/50][189/348] loss = 0.166298
[4/50][190/348] loss = 0.225859
[4/50][191/348] loss = 0.124357
[4/50][192/348] loss = 0.036560
[4/50][193/348] loss = 0.034419
[4/50][194/348] loss = 0.213247
[4/50][195/348] loss = 0.090883
[4/50][196/348] loss = 0.075958
[4/50][197/348] loss = 0.051722
[4/50][198/348] loss = 0.236960
[4/50][1

[5/50][80/348] loss = 0.164463
[5/50][81/348] loss = 0.205239
[5/50][82/348] loss = 0.112582
[5/50][83/348] loss = 0.251886
[5/50][84/348] loss = 0.308658
[5/50][85/348] loss = 0.227748
[5/50][86/348] loss = 0.076018
[5/50][87/348] loss = 0.091063
[5/50][88/348] loss = 0.082427
[5/50][89/348] loss = 0.056587
[5/50][90/348] loss = 0.019521
[5/50][91/348] loss = 0.071949
[5/50][92/348] loss = 0.122923
[5/50][93/348] loss = 0.068090
[5/50][94/348] loss = 0.174459
[5/50][95/348] loss = 0.030445
[5/50][96/348] loss = 0.093201
[5/50][97/348] loss = 0.051051
[5/50][98/348] loss = 0.160557
[5/50][99/348] loss = 0.119776
[5/50][100/348] loss = 0.230640
[5/50][101/348] loss = 0.082672
[5/50][102/348] loss = 0.070067
[5/50][103/348] loss = 0.048818
[5/50][104/348] loss = 0.103243
[5/50][105/348] loss = 0.289066
[5/50][106/348] loss = 0.075405
[5/50][107/348] loss = 0.024680
[5/50][108/348] loss = 0.047196
[5/50][109/348] loss = 0.195081
[5/50][110/348] loss = 0.102819
[5/50][111/348] loss = 0.202

[5/50][337/348] loss = 0.107153
[5/50][338/348] loss = 0.027441
[5/50][339/348] loss = 0.110935
[5/50][340/348] loss = 0.139759
[5/50][341/348] loss = 0.054007
[5/50][342/348] loss = 0.032128
[5/50][343/348] loss = 0.170382
[5/50][344/348] loss = 0.143894
[5/50][345/348] loss = 0.091443
[5/50][346/348] loss = 0.015203
[5/50][347/348] loss = 0.042676
[5/50]
Training:

Accuracy = 98.52 %


Test:

Accuracy = 88.22 %


[6/50][0/348] loss = 0.087023
[6/50][1/348] loss = 0.036184
[6/50][2/348] loss = 0.067683
[6/50][3/348] loss = 0.080063
[6/50][4/348] loss = 0.012813
[6/50][5/348] loss = 0.041550
[6/50][6/348] loss = 0.036509
[6/50][7/348] loss = 0.220870
[6/50][8/348] loss = 0.128177
[6/50][9/348] loss = 0.199819
[6/50][10/348] loss = 0.021669
[6/50][11/348] loss = 0.192583
[6/50][12/348] loss = 0.124093
[6/50][13/348] loss = 0.136415
[6/50][14/348] loss = 0.125953
[6/50][15/348] loss = 0.034406
[6/50][16/348] loss = 0.064639
[6/50][17/348] loss = 0.181523
[6/50][18/348] loss = 0.145346
[6

[6/50][248/348] loss = 0.190975
[6/50][249/348] loss = 0.146928
[6/50][250/348] loss = 0.090708
[6/50][251/348] loss = 0.061237
[6/50][252/348] loss = 0.025062
[6/50][253/348] loss = 0.120249
[6/50][254/348] loss = 0.089731
[6/50][255/348] loss = 0.045164
[6/50][256/348] loss = 0.064258
[6/50][257/348] loss = 0.162256
[6/50][258/348] loss = 0.061239
[6/50][259/348] loss = 0.116462
[6/50][260/348] loss = 0.050904
[6/50][261/348] loss = 0.067773
[6/50][262/348] loss = 0.038619
[6/50][263/348] loss = 0.116284
[6/50][264/348] loss = 0.047295
[6/50][265/348] loss = 0.119826
[6/50][266/348] loss = 0.283696
[6/50][267/348] loss = 0.087162
[6/50][268/348] loss = 0.161157
[6/50][269/348] loss = 0.049828
[6/50][270/348] loss = 0.024165
[6/50][271/348] loss = 0.039224
[6/50][272/348] loss = 0.050593
[6/50][273/348] loss = 0.038739
[6/50][274/348] loss = 0.061340
[6/50][275/348] loss = 0.114021
[6/50][276/348] loss = 0.094816
[6/50][277/348] loss = 0.042253
[6/50][278/348] loss = 0.119081
[6/50][2

[7/50][160/348] loss = 0.050072
[7/50][161/348] loss = 0.176260
[7/50][162/348] loss = 0.034505
[7/50][163/348] loss = 0.121470
[7/50][164/348] loss = 0.021818
[7/50][165/348] loss = 0.028297
[7/50][166/348] loss = 0.032431
[7/50][167/348] loss = 0.054663
[7/50][168/348] loss = 0.016441
[7/50][169/348] loss = 0.019711
[7/50][170/348] loss = 0.017531
[7/50][171/348] loss = 0.006695
[7/50][172/348] loss = 0.003802
[7/50][173/348] loss = 0.088119
[7/50][174/348] loss = 0.050218
[7/50][175/348] loss = 0.072933
[7/50][176/348] loss = 0.048718


## Results
### Set-up
- Audio features: MFCC
- 5 epochs of training
- 3 second recordings
- Adam optimizer
- lr = 0.001
### Performance
- 95.71 % training accuracy

In [None]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize `state` to `filename` and optionally keep a "best" copy.

    Args:
        state: Object handed to `torch.save` (here, a dict of model and
            optimizer state).
        is_best: When true, the saved file is also copied to
            'model_best.pth.tar' in the current working directory.
        filename: Destination path. Its parent directory is created if it
            does not exist yet.
    """
    # Local import: `shutil` is not imported at the top of the notebook, so
    # the is_best branch used to raise NameError.
    import shutil

    target_dir = os.path.dirname(filename)
    if target_dir:
        # torch.save does not create missing directories.
        os.makedirs(target_dir, exist_ok=True)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

In [None]:
# Persist the model and optimizer state. is_best is False, so no
# 'model_best.pth.tar' copy is made.
# NOTE(review): 'epoch' is hard-coded to 5 -- confirm it matches the number of
# epochs actually trained before relying on it.
save_checkpoint({
            'epoch': 5,
            'arch': 'CNN_voice_classifier',
            'state_dict': classifier.state_dict(),
            'optimizer' : optimizer.state_dict(),
        }, False, filename = 'model_weights/CNN_voice_classifier_5.pth')