## Basic imports

In [1]:
import sys 
import os
import numpy as np 
import matplotlib.pyplot as plt
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Report the interpreter and PyTorch versions this notebook is running with.
print("Python: %s" % sys.version)
print("Pytorch: %s" % torch.__version__)

# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
#audioviz
import librosa as libr
import librosa.display as display
import IPython.display

import pandas as pd

Python: 3.6.5 (default, Jul  6 2018, 19:12:46) 
[GCC 5.4.0 20160609]
Pytorch: 0.4.0


## Hyperparameters

In [2]:
# --- Data ---
all_data = ['train-clean-360']  # LibriSpeech subsets handed to Libri_preload_and_split
n_seconds = 3                   # clip length in seconds, passed to the split helper and the datasets

# --- Audio features ---
sampling_rate = 16000           # presumably the audio sample rate in Hz -- confirm against the dataset
number_of_mels = 128            # number of mel bands requested from the mel spectrogram

# --- Optimisation ---
n_epochs = 50                   # intended number of training epochs
lr = 0.001                      # learning rate given to the Adam optimizer

## Speech preprocessing
Building the tensorToMFCC transformation for learning

In [3]:
class tensorToMFCC:
    """Callable transform that converts a raw waveform into an MFCC feature tensor.

    The waveform is turned into a mel power spectrogram (``number_of_mels`` bands,
    frequencies up to 8000 Hz, sample rate ``sampling_rate``), converted to decibels
    and then to MFCCs. The number of coefficients is left at librosa's default
    (20 according to the librosa documentation).
    """

    def __call__(self, y):
        """Compute the MFCCs of one recording.

        Args:
            y: 2-D array-like whose second dimension holds the samples; it is
               reshaped to ``(y.shape[1],)``, so the first dimension must be 1.
               Presumably a NumPy array of shape (1, n_samples) -- confirm
               against LibriSpeechDataset.

        Returns:
            torch.FloatTensor holding the MFCC matrix returned by librosa.
        """
        # librosa works on a 1-D signal, so drop the leading singleton dimension.
        signal = np.reshape(y, (y.shape[1],))
        # Keyword arguments are accepted by old and new librosa releases alike
        # (recent releases reject positional audio/sample-rate arguments), and the
        # sample rate now comes from the notebook-level `sampling_rate` setting
        # instead of a duplicated literal.
        mel_power = libr.feature.melspectrogram(y=signal, sr=sampling_rate,
                                                n_mels=number_of_mels, fmax=8000)
        mfcc = libr.feature.mfcc(S=libr.power_to_db(mel_power))
        return torch.from_numpy(mfcc).float()

In [4]:
# Single transform instance; it is passed to both LibriSpeechDataset constructors below.
transform  = tensorToMFCC()

## LibriSpeechDataSet
Load the custom dataset, inspired by this [repository](https://github.com/oscarknagg/voicemap/tree/pytorch-python-3.6)

In [5]:
%load_ext autoreload
%autoreload 2
sys.path.insert(0, './../../Utils')
from datasets import LibriSpeechDataset
from datasets import Libri_preload_and_split

In [6]:
path = 'data/'

splits = [0.8, 0.2]  # input fraction of data you want partitioned (train, test)
attacking = False

# Compare with a tolerance (fractions such as 0.7 + 0.2 + 0.1 do not add up to
# exactly 1.0 in floating point) and stop here rather than splitting the data
# with inconsistent fractions.
if not np.isclose(sum(splits), 1.0):
    raise ValueError('error: splits do not sum to 1.')

# Splits data into above defined train:test splits
# NOTE(review): the saved output of this cell ends with
# "UnboundLocalError: local variable 'unique_speakers1' referenced before assignment";
# presumably raised inside one of the project helpers called in this cell -- confirm and fix there.
dfs = Libri_preload_and_split(path, all_data, n_seconds, pad=False, cache=True,
                              splits=splits, attacking=attacking)

# Options shared by the train and test datasets, kept in one place so the two
# cannot drift apart.
dataset_kwargs = dict(seconds=n_seconds, downsampling=1,
                      transform=transform, stochastic=False)

# target train & test
valid_sequence_train = LibriSpeechDataset(path, df=dfs[0], **dataset_kwargs)
valid_sequence_test = LibriSpeechDataset(path, df=dfs[1], **dataset_kwargs)

Initialising LibriSpeechDataset with minimum length = 3s and subsets = ['train-clean-360']
Finished indexing data. 101703 usable files found.


UnboundLocalError: local variable 'unique_speakers1' referenced before assignment

In [None]:
# Loaders for data for baseline model
train_loader = DataLoader(valid_sequence_train,
                      batch_size=32,
                      shuffle=True,
                      num_workers=8
                     # pin_memory=True # CUDA only
                     )

test_loader = DataLoader(valid_sequence_test,
                      batch_size=32,
                      shuffle=True,
                      num_workers=8
                     # pin_memory=True # CUDA only
                     )

In [None]:
# Fetch a single batch for the shape checks below. The built-in next() works with
# every DataLoader iterator, whereas the Python 2-style `.next()` method is not
# available on newer PyTorch iterators.
recording, speaker = next(iter(train_loader))

In [None]:
# Show the shape of one batch of transformed recordings and the `num_speakers`
# attribute of the training dataset.
print(recording.shape)
print(valid_sequence_train.num_speakers)

## Cyphercat utilities

In [None]:
sys.path.insert(0,'../../Utils/')
from train import *
from metrics import * 
import models
from data_downloaders import * 

## Models

In [None]:
class ConvBlock(nn.Module):
    """One 1-D convolutional stage: Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d.

    Args:
        n_input: number of channels fed to the convolution.
        n_out: number of channels produced by the convolution.
        kernel_size: kernel size of the convolution. The padding is fixed at 1
            regardless of this value.

    The max pooling uses a window and stride of 4.
    """

    def __init__(self, n_input, n_out, kernel_size):
        super().__init__()
        convolution = nn.Conv1d(n_input, n_out, kernel_size, padding=1)
        normalisation = nn.BatchNorm1d(n_out)
        activation = nn.ReLU()
        pooling = nn.MaxPool1d(kernel_size=4, stride=4)
        # Layer order inside the Sequential fixes the state_dict keys
        # (cnn_block.0.*, cnn_block.1.*), so it must not change.
        self.cnn_block = nn.Sequential(convolution, normalisation, activation, pooling)

    def forward(self, x):
        """Apply the stage to `x` and return the pooled activations."""
        pooled = self.cnn_block(x)
        return pooled


class CNN_classifier(nn.Module):
    """Classifier made of three ConvBlocks followed by two linear layers.

    Each ConvBlock doubles the channel count (in_size -> 2x -> 4x -> 8x) and
    max-pools the last axis by 4. The remaining positions on that axis are then
    reduced to one by a global max pool, flattened, passed through a hidden
    linear layer with ReLU, and projected to `n_classes` outputs.

    Args:
        in_size: number of input channels.
        n_hidden: width of the hidden linear layer.
        n_classes: number of outputs of the final linear layer.
    """

    def __init__(self, in_size, n_hidden, n_classes):
        super(CNN_classifier, self).__init__()
        self.down_path = nn.ModuleList(
            [ConvBlock(channels, 2 * channels, 3)
             for channels in (in_size, 2 * in_size, 4 * in_size)]
        )
        # `fc` expects exactly 8*in_size features, which previously held only when
        # the last axis had shrunk to length 1 after the three blocks. The global
        # max pool is the identity in that case and makes other input lengths work
        # instead of failing with a size mismatch. It has no parameters, so the
        # state_dict is unchanged.
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(8*in_size, n_hidden),
            nn.ReLU()
        )
        self.out = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        """Return the `n_classes` scores for each item of the batch `x`.

        `x` is fed to Conv1d layers, so it is expected to be
        (batch, in_size, length).
        """
        for down in self.down_path:
            x = down(x)
        x = self.global_pool(x)
        # Flatten to (batch, 8*in_size) for the linear layers.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return self.out(x)
        

In [None]:
# Smoke test: run the sample batch through a single ConvBlock (20 -> 40 channels,
# kernel size 3) and print the shape of the result.
test = ConvBlock(20, 40, 3)
aa = test(recording)
print(aa.shape)

In [None]:
# Display the test dataset's `num_speakers`; the next cell uses it as the classifier's class count.
valid_sequence_test.num_speakers

In [None]:
# Build the classifier with 20 input channels, 512 hidden units and one output per
# speaker (`num_speakers` of the test dataset).
# NOTE(review): 20 presumably matches the number of MFCC coefficients produced by `transform` -- confirm.
# NOTE(review): the class count is taken from the test split rather than the train split -- confirm both agree.
classifier = CNN_classifier(20, 512, valid_sequence_test.num_speakers)
# classifier.apply(models.weights_init)
# Move the model to the selected device; as the cell's last expression this also displays the module.
classifier.to(device)

In [None]:
# Sanity check: forward the sample batch through the classifier on `device` and print the output shape.
test = classifier(recording.to(device))
print(test.shape)

In [None]:
# Cross-entropy loss over the classifier outputs, optimised with Adam at the
# learning rate set in the hyperparameters cell.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=lr)

In [None]:
# Train for the number of epochs configured in the hyperparameters cell, so the
# epoch count is set in one place only (it was duplicated here as a literal 50).
train(classifier, train_loader, test_loader, optimizer, criterion, n_epochs, verbose = False)

## Results
### Set-up
- Audio features: MFCC
- 5 epochs of training
- 3 second recordings
- Adam optimizer
- lr = 0.001
### Performance
- 95.71% training accuracy

In [None]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar',
                    best_filename='model_best.pth.tar'):
    """Serialize a training checkpoint and optionally keep a copy as the best model.

    Args:
        state: object to serialize with `torch.save` (here a dict with the epoch,
            architecture name, model state and optimizer state).
        is_best: when true, the saved file is also copied to `best_filename`.
        filename: destination of the checkpoint. When it is a path, missing
            parent directories are created first.
        best_filename: destination of the copy made when `is_best` is true.
    """
    # Imported locally so the function does not depend on which notebook cells
    # were executed; `shutil` in particular is not imported anywhere else.
    import os
    import shutil

    # torch.save does not create directories; make sure the target one exists.
    # Skipped for non-path targets such as open file objects.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, best_filename)

In [None]:
# Epoch number stored in the checkpoint and embedded in its file name.
epoch = 39

# Everything written to disk: bookkeeping plus model and optimizer state.
checkpoint_state = {
    'epoch': epoch,
    'arch': 'CNN_voice_classifier',
    'state_dict': classifier.state_dict(),
    'optimizer': optimizer.state_dict(),
}
checkpoint_path = 'model_weights/CNN_voice_classifier360all_'+str(epoch)+'.pth'

# is_best=False: only the checkpoint itself is written, no "best model" copy.
save_checkpoint(checkpoint_state, False, filename=checkpoint_path)