In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
from os.path import join
from os import listdir
import os
import math

In [43]:
# Write a `wav.scp` index into every speaker directory of the corpus: one line per
# entry whose name starts with 'p', formatted as "<file name> <full path>".
pt = '/home/selp/Documents/AU/Speech/VCTK-Corpus/wav48'
for speaker_dir in listdir(pt):
    path = join(pt, speaker_dir)
    with open(join(path, 'wav.scp'), 'w+') as scp_file:
        # Sorted so the index order is stable between runs.
        wav_names = [name for name in sorted(listdir(path)) if name[0] == 'p']
        scp_file.writelines('{} {}\n'.format(name, join(path, name)) for name in wav_names)

In [129]:
%time
pt = '/home/selp/Documents/AU/Speech/VCTK-Corpus/wav48'
for path in [join(pt, f) for f in sorted(listdir(pt))]:
    os.system('{} --config={} scp:{} ark:{}'.format('/home/selp/Documents/kaldi/src/featbin/compute-mfcc-feats',
        '/home/selp/Documents/AU/Speech/mfcc.conf', join(path, 'wav.scp'), join(path, 'mfcc.ark')))
    os.system('{} --delta-order=2 ark:{} ark:{}'.format('/home/selp/Documents/kaldi/src/featbin/add-deltas',
        join(path, 'mfcc.ark'), join(path, 'mfcc_delta.ark')))
    print(path)

In [130]:
import kaldi_io
import numpy as np


# Load mfcc_delta.ark of every speaker directory.
# For each directory this produces
#   * one matrix with all utterance frames stacked along axis 0, and
#   * an offsets array `keys_list` (starting at 0) so that utterance j occupies
#     rows keys_list[j]:keys_list[j+1] of that matrix.
# Directories whose path ends in 'y' go to the *_noisy lists, all others to the clean lists.
# Matrices are collected in a list and concatenated once per directory; growing the array
# with np.append inside the loop would recopy all previous frames for every utterance.
keys_lists = list()
keys_lists_noisy = list()
data = list()
data_noisy = list()
pt = '/home/selp/Documents/AU/Speech/VCTK-Corpus/wav48'
for path in [join(pt, f) for f in sorted(listdir(pt))]:
    is_noisy = path[-1] == 'y'
    global_id = 0  # running frame offset inside this directory
    keys_list = [global_id]
    mats = list()
    for key, mat in kaldi_io.read_mat_ark(join(path, 'mfcc_delta.ark')):
        mats.append(mat)
        global_id += mat.shape[0]
        keys_list.append(global_id)
    # An archive without matrices keeps the original placeholder: an empty 1-D array.
    frames = np.concatenate(mats, axis=0) if mats else np.array([])
    if is_noisy:
        data_noisy.append(frames)
        keys_lists_noisy.append(np.array(keys_list))
    else:
        data.append(frames)
        keys_lists.append(np.array(keys_list))
    print(path)


In [58]:
def prepare_speakers_data(data, keys_lists, skip_index=108):
    """Cut per-speaker frame matrices into labelled utterances and split them 80/20.

    Args:
        data: sequence of 2-D arrays, one per speaker, with all frames of that
            speaker stacked along axis 0.
        keys_lists: sequence of offset arrays; utterance ``j`` of speaker ``i``
            is ``data[i][keys_lists[i][j]:keys_lists[i][j + 1]]``.
        skip_index: index of a speaker to leave out entirely (default 108, the
            value that used to be hard-coded). ``None`` or an index outside
            ``range(len(data))`` keeps every speaker.

    Returns:
        ``(data_speakers, data_speakers_test)``: two lists of ``(frames, y)``
        tuples, where ``y`` is a one-hot vector over the kept speakers. For each
        speaker the first ``n * 4 // 5`` utterances go to the first list and the
        rest to the second.
    """
    has_skip = skip_index is not None and 0 <= skip_index < len(data)
    # One class per kept speaker; previously this was always len(data) - 1, which
    # overflowed the one-hot vector whenever index 108 did not exist.
    num_classes = len(data) - 1 if has_skip else len(data)

    data_speakers = list()
    data_speakers_test = list()
    for i in range(len(data)):
        if has_skip and i == skip_index:
            continue
        # Class ids stay contiguous: speakers after the skipped one shift down by one.
        cls = i - 1 if (has_skip and i > skip_index) else i
        keys = keys_lists[i]
        dat = data[i]
        num_utterances = len(keys) - 1
        num_train = num_utterances * 4 // 5
        for j in range(num_utterances):
            y = np.zeros(num_classes)
            y[cls] = 1
            sample = (dat[keys[j]:keys[j + 1]], y)
            if j < num_train:
                data_speakers.append(sample)
            else:
                data_speakers_test.append(sample)
    return data_speakers, data_speakers_test
        

In [30]:
import random

# Build the clean train/test utterance lists here: no other cell of the notebook
# assigns `data_speakers` / `data_speakers_test`, so a fresh kernel would fail below.
data_speakers, data_speakers_test = prepare_speakers_data(data, keys_lists)
# Fixed seed so the batch composition (and the reported accuracies) is reproducible.
random.seed(0)
random.shuffle(data_speakers)

In [32]:
print("train tracks: ", len(data_speakers), "\n test tracks: ", len(data_speakers_test))

train tracks:  35113 
 test tracks:  8837


In [3]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder over 39-dimensional feature vectors.

    Layer widths are 39 -> 30 -> 21 -> 14 -> 21 -> 30 -> 39. ReLU follows every
    layer except the last one, whose output is returned as is.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.layer1 = nn.Linear(39, 30)
        self.layer2 = nn.Linear(30, 21)
        self.layer3 = nn.Linear(21, 14)
        self.layer4 = nn.Linear(14, 21)
        self.layer5 = nn.Linear(21, 30)
        self.layer6 = nn.Linear(30, 39)

    def forward(self, input_data):
        """Return the reconstruction of ``input_data`` (last dimension 39)."""
        hidden = input_data
        for layer in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            hidden = F.relu(layer(hidden))
        return self.layer6(hidden)

In [4]:
# Frame-level train/test matrices for the autoencoder.
# For every speaker except index 108 (the one whose clean and noisy frame counts differ,
# see the shape comparison cell) the first 4/5 of the frames go to the training matrix
# and the remainder to the test matrix. The clean and noisy split points are computed
# from their own lengths, exactly as before.
# Slices are gathered first and concatenated once; appending speaker by speaker with
# np.append recopies the whole accumulated matrix on every iteration.
kept_speakers = [i for i in range(len(data)) if i != 108]


def _head(frames):
    """First 4/5 of the rows of ``frames``."""
    return frames[:frames.shape[0]*4//5]


def _tail(frames):
    """Rows of ``frames`` after the first 4/5."""
    return frames[frames.shape[0]*4//5:]


data_full = np.concatenate([_head(data[i]) for i in kept_speakers], axis=0)
data_noisy_full = np.concatenate([_head(data_noisy[i]) for i in kept_speakers], axis=0)
data_full_test = np.concatenate([_tail(data[i]) for i in kept_speakers], axis=0)
data_noisy_full_test = np.concatenate([_tail(data_noisy[i]) for i in kept_speakers], axis=0)

In [5]:
print(data_full.shape)
print(data_noisy_full.shape)
print(data_full_test.shape)
print(data_noisy_full_test.shape)

(12500841, 39)
(12500841, 39)
(3125262, 39)
(3125262, 39)


In [6]:
# Batches of 1024 noisy training frames. No `shuffle` argument is passed, and the
# autoencoder train() slices the clean targets by position (batch_size*i), so the
# loader has to keep yielding frames in their original order.
dl_noisy = DataLoader(torch.tensor(data_noisy_full), batch_size=1024)

In [7]:
aeMod = AutoEncoder()

In [8]:
def train(model, epoch_num, learning_rate, batch_size, dl_noisy, dl_ob, logging=False, writer=None):
    """Train ``model`` to map noisy frames onto the matching clean frames (MSE loss, Adam).

    Args:
        model: module called as ``model(X)`` on every batch.
        epoch_num: number of passes over ``dl_noisy``.
        learning_rate: learning rate of a freshly created Adam optimizer.
        batch_size: kept for backward compatibility. Targets are now sliced by the
            actual length of each batch, so a value that differs from the loader's
            batch size can no longer misalign inputs and targets.
        dl_noisy: iterable yielding input batches ``X`` in a fixed order.
        dl_ob: array/tensor of clean targets, row-aligned with the concatenation
            of all batches of ``dl_noisy``.
        logging: when true, log every batch loss to ``writer``.
        writer: object with ``add_scalar(tag, value, step)``; required if ``logging``.

    Side effects:
        Increments the module-level ``step`` counter once per batch and prints the
        epoch index after every epoch.
    """
    global step
    loss_function = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epoch_num):
        offset = 0  # row of dl_ob matching the first row of the current batch
        for X in dl_noisy:
            batch_len = X.shape[0]
            y = torch.as_tensor(dl_ob[offset: offset + batch_len], dtype=torch.float32)
            offset += batch_len

            model.zero_grad()
            res = model(X)
            loss = loss_function(res, y)

            if logging:
                # Log a plain float so the writer holds no reference to the graph.
                writer.add_scalar('loss_train/loss', loss.item(), step)

            loss.backward()
            optimizer.step()
            step += 1
        print(epoch)

In [57]:
# model = AutoEncoder()
# checkpoint = torch.load('checkpoint_autoencoder.pth.tar')
# start_epoch = checkpoint['epoch']
# model.load_state_dict(checkpoint['state_dict'])

In [9]:
# `step` is the module-level counter that train() advances (via `global step`) and uses
# as the x-axis value of every logged loss; the writer stores the autoencoder loss curve.
step = 1
writer = SummaryWriter(log_dir=join('save_root', 'loss'))

In [11]:
%%time
# Staged learning-rate schedule: every call keeps training the same aeMod with a fresh
# Adam optimizer and logs to the same writer.
# NOTE(review): the trailing "#N" notes add up to 19, the value stored under 'epoch' in
# the autoencoder checkpoint cell, so they presumably record the epochs actually run per
# stage across re-runs rather than the epoch_num argument shown here - confirm.
train(aeMod, 2, 0.00001, 1024, dl_noisy, data_full, True, writer)#4 epochs
train(aeMod, 2, 0.00005, 1024, dl_noisy, data_full, True, writer)#4 epochs
train(aeMod, 2, 0.00015, 1024, dl_noisy, data_full, True, writer)#2
train(aeMod, 2, 0.0005, 1024, dl_noisy, data_full, True, writer)#4
train(aeMod, 2, 0.001, 1024, dl_noisy, data_full, True, writer)#2
train(aeMod, 1, 0.0005, 1024, dl_noisy, data_full, True, writer)#1
train(aeMod, 1, 0.0002, 1024, dl_noisy, data_full, True, writer)#1
train(aeMod, 1, 0.0005, 1024, dl_noisy, data_full, True, writer)#1
##time on 2 epochs

0
1
CPU times: user 21min 2s, sys: 3.08 s, total: 21min 5s
Wall time: 1min 56s


In [21]:
# Persist the autoencoder weights. 'epoch' is a hand-written bookkeeping value, not
# something computed by the training code.
torch.save({
    'epoch': 19,
    'state_dict': aeMod.state_dict()
}, 'checkpoint_autoencoder.pth.tar')

In [15]:
# Baseline: MSE between the raw noisy training frames and the clean ones, no model applied.
nn.MSELoss()(torch.FloatTensor(data_noisy_full), torch.FloatTensor(data_full))

tensor(124.7916)

In [None]:
# MSE between the autoencoder output for the noisy training frames and the clean frames.
# Evaluation only: torch.no_grad() keeps autograd from recording the forward pass over
# the whole training matrix, which otherwise retains every intermediate activation.
with torch.no_grad():
    ae_train_mse = nn.MSELoss()(aeMod(torch.FloatTensor(data_noisy_full)), torch.FloatTensor(data_full))
ae_train_mse

In [30]:
# Shortest and longest utterance, in frames, over all clean speakers.
# Lengths are differences of consecutive offsets in each keys array. The extremes are
# taken from the data itself; the former start values (500 / 200) produced wrong results
# whenever every utterance was longer than 500 or shorter than 200 frames.
# Raises ValueError if keys_lists holds no utterance at all.
utterance_lengths = np.concatenate([keys[1:] - keys[:-1] for keys in keys_lists])
min_sections = utterance_lengths.min()
max_sections = utterance_lengths.max()
print("min:{}\n max:{}".format(min_sections, max_sections))

min:58
 max:1926


In [22]:
# List every speaker index whose clean and noisy feature matrices differ in shape,
# followed by the clean shape and then the noisy shape.
for speaker_idx in range(len(data)):
    clean_shape = data[speaker_idx].shape
    noisy_shape = data_noisy[speaker_idx].shape
    if clean_shape == noisy_shape:
        continue
    print(speaker_idx)
    print("    {}".format(clean_shape))
    print("    {}".format(noisy_shape))

108
    (139202, 39)
    (139725, 39)


In [26]:
class LSTMClassifier(nn.Module):
    """Sequence classifier: single LSTM, two ReLU linear layers, softmax output layer.

    Args:
        features_num: size of each input frame.
        hidden_dim: LSTM hidden size, also the width of the two hidden linear layers.
        num_classes: number of output probabilities.
    """

    def __init__(self, features_num, hidden_dim, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(features_num, hidden_dim)

        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_data):
        """Return class probabilities (softmax over dim 2) computed from the final hidden state."""
        # nn.LSTM returns (output, (h_n, c_n)); only the final hidden state h_n is used.
        _, (final_hidden, _) = self.lstm(input_data)

        hidden = F.relu(self.linear1(final_hidden))
        hidden = F.relu(self.linear2(hidden))
        return F.softmax(self.linear3(hidden), dim=2)

In [108]:
def train(data_batched, data_y, learning_rate = 0.01, epoches=1, logging=False, writer=None, log_name='loss', classifier=None):
    """Train a sequence classifier on pre-batched data (BCE loss, Adam).

    NOTE: this definition reuses the name ``train`` of the autoencoder training
    function defined earlier in the notebook and replaces it once executed.

    Args:
        data_batched: sequence of input batches accepted by ``classifier``.
        data_y: sequence of one-hot label arrays, ``data_y[i]`` matching
            ``data_batched[i]`` row by row.
        learning_rate: learning rate of a freshly created Adam optimizer.
        epoches: number of passes over ``data_batched``.
        logging: when true, log every batch loss to ``writer``.
        writer: object with ``add_scalar(tag, value, step)``; required if ``logging``.
        log_name: suffix of the logged tag ``'loss_train/<log_name>'``.
        classifier: model to train. ``None`` (default) means the module-level
            ``classifier`` looked up at call time. The former default
            ``classifier=classifier`` was evaluated when the function was defined,
            which needed the global to exist already and kept pointing at the old
            model after ``classifier`` was reassigned.

    Side effects:
        Increments the module-level ``step`` counter once per batch.
    """
    global step
    if classifier is None:
        classifier = globals()['classifier']
    loss_fun = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(classifier.parameters(), lr=learning_rate)
    for epoch in range(epoches):
        for it in range(len(data_batched)):
            X, y = data_batched[it], data_y[it]
            classifier.zero_grad()
            res = classifier(X)
            # Targets get the model's output layout (1, batch, num_classes); the class
            # count comes from the output instead of a hard-coded 108.
            target = torch.as_tensor(y, dtype=torch.float32).reshape(1, -1, res.shape[-1])
            loss = loss_fun(res, target)
            if logging:
                writer.add_scalar('loss_train/' + log_name, loss.item(), step)
            step += 1
            loss.backward()
            optimizer.step()
        

In [28]:
# Restart the global step counter and log the clean-data classifier loss to its own directory.
step = 1
writer = SummaryWriter(log_dir=join('save_root', 'loss_classifier_clear'))

In [27]:
# 39 input features, hidden size 128, and len(data) - 1 output classes
# (prepare_speakers_data leaves one speaker out).
classifier = LSTMClassifier(39, 128, len(data) - 1)

In [None]:
# Pad and pack the clean train/test utterances into batches of 256.
# NOTE(review): prepare_data is defined in a later cell of this notebook, so that cell
# has to be executed before this one.
data_batched, data_y = prepare_data(data_speakers, 256)
data_batched_test, data_y_test = prepare_data(data_speakers_test, 256)

In [43]:
# Train on clean data with a decreasing learning rate (2 + 1 + 1 epochs); no classifier
# is passed, so train() falls back to its default model.
train(data_batched, data_y, 0.01, 2, True, writer)
train(data_batched, data_y, 0.005, 1, True, writer)
train(data_batched, data_y, 0.002, 1, True, writer)

In [46]:
%%time
# One more epoch on clean data at learning rate 0.0008.
train(data_batched, data_y, 0.0008, 1, True, writer)

CPU times: user 13min 3s, sys: 6.53 s, total: 13min 10s
Wall time: 2min 13s


In [52]:
# Accuracy of the classifier trained on clean ("clear") data for 5 epochs.
print(accuracy_classifier(data_batched_test))

0.7344121308136246


In [53]:
# Persist the clean-data classifier weights; 'epoch' is a hand-written bookkeeping value.
torch.save({
    'epoch': 3,
    'state_dict': classifier.state_dict(),
}, 'checkpoint_clsssifier_on_clear_data.pth.tar')



In [55]:
# Run every speaker's noisy feature matrix through the trained autoencoder and keep the
# result as a NumPy array with 39 columns.
# Inference only: torch.no_grad() stops autograd from recording a graph for each forward
# pass, which the original code built and then discarded through .detach().
with torch.no_grad():
    data_autoencoded = [
        aeMod(torch.FloatTensor(noisy_frames)).detach().numpy().reshape(-1, 39)
        for noisy_frames in data_noisy
    ]

In [59]:
# Cut the autoencoded features into labelled utterances (train/test) using the offsets
# that were recorded for the noisy archives.
ae_speakers_train, ae_speakers_test = prepare_speakers_data(data_autoencoded, keys_lists_noisy)

In [60]:
# Shuffle the autoencoded training utterances in place; the test list keeps its order.
random.shuffle(ae_speakers_train)

In [88]:
# Pad and pack the autoencoded train/test utterances into batches of 256.
ae_data_batched, ae_data_y = prepare_data(ae_speakers_train, 256)
ae_data_batched_test, ae_data_y_test = prepare_data(ae_speakers_test, 256)

In [89]:
# Restart the global step counter and log the autoencoded-data classifier loss separately.
step = 1
writer = SummaryWriter(log_dir=join('save_root', 'loss_classifier_ae'))

In [90]:
# Fresh classifier for the autoencoded features; this rebinds the global `classifier`,
# so the model trained on clean data is no longer reachable under that name.
classifier = LSTMClassifier(39, 128, len(data) - 1)

In [91]:
%%time
# Same schedule as for the clean-data classifier (2 + 1 + 1 epochs), on autoencoded batches.
train(ae_data_batched, ae_data_y, 0.01, 2, True, writer)
train(ae_data_batched, ae_data_y, 0.005, 1, True, writer)
train(ae_data_batched, ae_data_y, 0.002, 1, True, writer)

CPU times: user 1h 1min 36s, sys: 32.1 s, total: 1h 2min 8s
Wall time: 10min 37s


In [126]:
%%time
# NOTE(review): the call below requests 3 epochs, while the note underneath says the
# recorded timing covers 1 epoch - confirm which run the timing belongs to.
train(ae_data_batched, ae_data_y, 0.0008, 3, True, writer)
##time for 1 epoch

CPU times: user 14min 8s, sys: 5.44 s, total: 14min 14s
Wall time: 2min 24s


In [127]:
# Accuracy of the classifier trained for 7 epochs on autoencoder-cleaned data, evaluated
# on the autoencoded test batches and on the clean test batches.
# NOTE(review): accuracy_classifier reads its labels from the global data_y_test in both
# calls; ae_data_y_test is never passed - confirm the two label sets are identical.
print("autoencoded test data accuracy: ", accuracy_classifier(ae_data_batched_test))
print("clear test data accuracy: ", accuracy_classifier(data_batched_test))

autoencoded test data accuracy:  0.553807853343895
clear test data accuracy:  0.05216702500848704


In [128]:
# Persist the autoencoded-data classifier weights; 'epoch' is a hand-written bookkeeping value.
torch.save({
    'epoch': 7,
    'state_dict': classifier.state_dict(),
}, 'checkpoint_clsssifier_on_ae_data.pth.tar')

In [102]:
# Experiment: classifier restricted to feature columns 0..12 of the clean data
# (prepare_data slices x[:, 0:13]); own log directory, step counter restarted.
classifier_13 = LSTMClassifier(13, 128, len(data) - 1)
step = 1
writer = SummaryWriter(log_dir=join('save_root', 'loss_mfcc_13'))
data_batched_13, data_y_13 = prepare_data(data_speakers, 256, 0, 13)
data_batched_test_13, data_y_test_13 = prepare_data(data_speakers_test, 256, 0, 13)

In [110]:
# 2 epochs at lr 0.01, then 1 epoch at lr 0.005, on the 13-column batches.
train(data_batched_13, data_y_13, 0.01, 2, True, writer, log_name='loss_mfcc_13', classifier=classifier_13)
train(data_batched_13, data_y_13, 0.005, 1, True, writer, log_name='loss_mfcc_13', classifier=classifier_13)

In [115]:
# Accuracy of the classifier trained on the 13 MFCC features for 3 epochs.
# NOTE(review): labels come from the global data_y_test inside accuracy_classifier,
# not from data_y_test_13 - confirm both hold the same labels in the same order.
print(accuracy_classifier(data_batched_test_13, classifier=classifier_13))

0.6520312323186602


In [119]:
# Experiment: classifier restricted to feature columns 13..25 (prepare_data slices
# x[:, 13:26]). The names classifier_13 / data_batched_13 / data_y_13 are reused, so the
# objects of the previous 13-column experiment are overwritten.
classifier_13 = LSTMClassifier(13, 128, len(data) - 1)
step = 1
writer = SummaryWriter(log_dir=join('save_root', 'loss_mfcc_deltas_1'))
data_batched_13, data_y_13 = prepare_data(data_speakers, 256, 13, 26)
data_batched_test_13, data_y_test_13 = prepare_data(data_speakers_test, 256, 13, 26)

In [121]:
%%time
# 2 epochs at lr 0.01, then 1 epoch at lr 0.005, on columns 13..25.
train(data_batched_13, data_y_13, 0.01, 2, True, writer, log_name='loss_mfcc_deltas_1', classifier=classifier_13)
train(data_batched_13, data_y_13, 0.005, 1, True, writer, log_name='loss_mfcc_deltas_1', classifier=classifier_13)
##the logged loss values of this run are slightly corrupted

CPU times: user 55min 51s, sys: 17.8 s, total: 56min 9s
Wall time: 9min 29s


In [122]:
# Accuracy of the classifier trained on the first-order MFCC deltas (13 columns) for 3 epochs.
print(accuracy_classifier(data_batched_test_13, classifier=classifier_13))

0.08464410999207876


In [123]:
# Experiment: classifier restricted to feature columns 26..38 (prepare_data slices
# x[:, 26:39]); again reuses and overwrites the *_13 names of the previous experiment.
classifier_13 = LSTMClassifier(13, 128, len(data) - 1)
step = 1
writer = SummaryWriter(log_dir=join('save_root', 'loss_mfcc_deltas_2'))
data_batched_13, data_y_13 = prepare_data(data_speakers, 256, 26, 39)
data_batched_test_13, data_y_test_13 = prepare_data(data_speakers_test, 256, 26, 39)

In [124]:
%%time
# 2 epochs at lr 0.01, then 1 epoch at lr 0.005, on columns 26..38.
train(data_batched_13, data_y_13, 0.01, 2, True, writer, log_name='loss_mfcc_deltas_2', classifier=classifier_13)
train(data_batched_13, data_y_13, 0.005, 1, True, writer, log_name='loss_mfcc_deltas_2', classifier=classifier_13)

CPU times: user 1h 2min 17s, sys: 18 s, total: 1h 2min 35s
Wall time: 10min 33s


In [125]:
# Accuracy of the classifier trained on the second-order MFCC deltas (13 columns) for 3 epochs.
print(accuracy_classifier(data_batched_test_13, classifier=classifier_13))

0.04899852891252687


In [114]:
def accuracy_classifier(data_batches_test, classifier=None, data_y=None):
    """Return the fraction of samples whose arg-max prediction equals the arg-max label.

    Args:
        data_batches_test: sequence of input batches accepted by ``classifier``.
        classifier: model to evaluate. ``None`` (default) means the module-level
            ``classifier`` looked up at call time; the former default was bound when
            the function was defined and went stale once the global was reassigned.
        data_y: sequence of one-hot label arrays, ``data_y[i]`` matching
            ``data_batches_test[i]`` row by row. ``None`` (default) keeps the
            previous behaviour of reading the module-level ``data_y_test``; pass the
            labels explicitly whenever the batches come from another data set.

    Returns:
        Number of correct predictions divided by the number of labelled samples
        processed; ``0.0`` when there are no samples.
    """
    if classifier is None:
        classifier = globals()['classifier']
    if data_y is None:
        data_y = globals()['data_y_test']

    correct = 0
    total = 0
    with torch.no_grad():  # evaluation only, no autograd graph needed
        for it in range(len(data_batches_test)):
            X, y = data_batches_test[it], data_y[it]
            probs = classifier(X).detach().numpy()
            # Flatten to (batch, num_classes); the class count comes from the model
            # output instead of a hard-coded 108.
            predicted = np.argmax(probs.reshape(-1, probs.shape[-1]), axis=1)
            correct += np.sum(predicted == np.argmax(y, axis=1))
            # Count evaluated samples here instead of relying on the size of the
            # unrelated global data_speakers_test.
            total += len(y)
    return correct / total if total else 0.0

In [118]:
def prepare_data(shuffled_data, batch_size, begin=0, end=39):
    """Group ``(frames, label)`` samples into padded, packed batches for an LSTM.

    Args:
        shuffled_data: list of ``(x, y)`` tuples; ``x`` is a 2-D array
            (frames x features) and ``y`` a 1-D label vector.
        batch_size: maximum number of samples per batch; the last batch may be smaller.
        begin, end: feature-column range ``x[:, begin:end]`` copied into the batch.

    Returns:
        ``(data_batched, data_y)``: a list of ``PackedSequence`` objects (float32,
        time-major, samples ordered by decreasing length) and a list of label arrays
        whose rows follow the same order.
    """
    data_batched = list()
    data_y = list()
    num_batches = math.ceil(len(shuffled_data) / batch_size)
    for i in range(num_batches):
        chunk = shuffled_data[batch_size*i: batch_size*(i + 1)]
        batch_lengths = np.array([x.shape[0] for x, _ in chunk])
        # Label width is taken from the data instead of being fixed to 108 classes.
        num_classes = np.asarray(chunk[0][1]).shape[0]
        # Zero-padded (time, sample, feature) buffer, allocated directly as float32
        # (the dtype the model consumes) rather than float64 followed by a cast.
        batch = np.zeros((batch_lengths.max(), len(chunk), end - begin), dtype=np.float32)
        dt_y = np.zeros((len(chunk), num_classes))
        for j, (x, y) in enumerate(chunk):
            batch[0:batch_lengths[j], j] = x[:, begin:end]
            dt_y[j] = y
        # pack_padded_sequence expects samples ordered by decreasing length.
        args = np.argsort(-batch_lengths)
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            torch.as_tensor(batch[:, args, :], dtype=torch.float32),
            batch_lengths[args].tolist())
        data_batched.append(packed)
        data_y.append(dt_y[args])
    return data_batched, data_y