In [1]:
import os
import numpy as np
import theano
import lasagne

import common as common
import nn_utilities as nn

In [2]:
# Resolve every project directory relative to the notebook's working directory.
current_dir = os.getcwd()
results_dir = os.path.join(current_dir, '..', 'results')
data_dir = os.path.join(current_dir, '..', 'data')
samples_dir = os.path.join(data_dir, 'segmented')
dm_dir = os.path.join(data_dir, 'distance_matrices', 'dtw')
params_dir = os.path.join(data_dir, 'params')

# Single-argument print() calls behave the same as a Python 2 print statement
# and as the Python 3 function, matching the print(...) style used by the
# experiment cells below.
print('Current directory:\t' + current_dir)
print('Results directory:\t' + results_dir)
print('Data directory:\t\t' + data_dir)
print('Samples directory:\t' + samples_dir)
print('DM directory:\t\t' + dm_dir)
print('Params directory:\t' + params_dir)

# Reference data loaded from the segmented samples directory; it is handed to
# nn.prepare_dataset() by every experiment cell.
references = common.load_references(samples_dir)

# Feature-set names; each one is passed to common.get_data(dm_dir, feature).
features = [
    'ste_10_10_norm',
    'sti_10_10_norm',
    'stzcr_10_10_norm',
    'ste_sti_stzcr_10_10_norm',
    'log_fb_en_25_10_ham_norm',
    'log_fb_en_25_10_ham_deltas_norm',
    'mfcc_25_10_ham_norm',
    'mfcc_25_10_ham_deltas_norm',
]

Current directory:	/Users/martin.majer/PycharmProjects/BP/scripts
Results directory:	/Users/martin.majer/PycharmProjects/BP/scripts/../results
Data directory:		/Users/martin.majer/PycharmProjects/BP/scripts/../data
Samples directory:	/Users/martin.majer/PycharmProjects/BP/scripts/../data/segmented
DM directory:		/Users/martin.majer/PycharmProjects/BP/scripts/../data/distance_matrices/dtw
Params directory:	/Users/martin.majer/PycharmProjects/BP/scripts/../data/params


In [3]:
# Seed both NumPy's global RNG and Lasagne's package-level RNG so that a
# "Restart & Run All" reproduces the same numbers.
# NOTE(review): assumes nn_utilities draws its randomness from one of these
# two generators -- confirm.
random_seed = 42
np.random.seed(random_seed)
lasagne.random.set_rng(np.random.RandomState(random_seed))

# Speaker identifiers '1'..'6', kept as strings because they are concatenated
# into parameter file names and passed to common.get_speaker_keys().
speakers = [str(x) for x in range(1, 7)]

# Network configuration, forwarded unchanged to nn.build_mlp() / nn.train().
num_units = 200
depth = 2
drop_input = None
drop_hidden = None

# Training schedule, forwarded to nn.train().
num_epochs = 1500
batch_count = 10

# When True, newly trained parameters are written under params_dir and the
# model description is dumped alongside them.
save_params = False

if save_params:
    nn.dump_info(os.path.join(params_dir, 'nn_model.txt'), depth, num_units, drop_input, drop_hidden)

## Single speaker

In [4]:
# Single-speaker experiment: for every feature set and every speaker, build the
# dataset from that speaker's reference and test keys, obtain a network (cached
# parameters if present, otherwise train) and record its recognition accuracy.
accuracy_data = {}

for feature in features:
    print('{0: <15}{1}\n'.format('Features used:', feature))
    distance_matrix, ref_keys_all, test_keys_all = common.get_data(dm_dir, feature)

    feature_accuracies = []

    for speaker in speakers:
        print('{0: <12}{1}'.format('Speaker:', speaker))
        ref_keys = common.get_speaker_keys(ref_keys_all, speaker)
        test_keys = common.get_speaker_keys(test_keys_all, speaker)

        # NOTE(review): the same test_keys are passed twice, so the validation
        # and test splits appear to be identical (their printed shapes match).
        # Confirm nn.train() only reports on the validation split.
        X_train, y_train, X_val, y_val, X_test, y_test = nn.prepare_dataset(
            ref_keys, ref_keys, test_keys, test_keys, distance_matrix, references)

        input_dim = X_train.shape[1]
        output_dim = y_train.shape[1]

        params_file = os.path.join(params_dir, feature + '_' + speaker + '.npz')
        try:
            # np.savez() below stores positional arrays as arr_0, arr_1, ...;
            # read them back in that order.
            with np.load(params_file) as fr:
                param_values = [fr['arr_%d' % i] for i in range(len(fr.files))]
        except IOError:
            # No readable cache: train from scratch. Only the cache read is
            # guarded, so an IOError raised elsewhere is not mistaken for a
            # missing cache.
            print('Starting training...')
            network = nn.train(num_epochs, batch_count, X_train, y_train, X_val, y_val,
                               input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            if save_params:
                np.savez(params_file, *lasagne.layers.get_all_param_values(network))
        else:
            print('Loading model parameters...')
            network = nn.build_mlp(input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            lasagne.layers.set_all_param_values(network, param_values)

        print('\nComputing recognition accuracy...\n')
        recognition = nn.recognize(X_test, network)
        accuracy = nn.compute_accuracy(y_test, recognition)
        feature_accuracies.append(accuracy)

    # The trailing entry is the mean over speakers; it lines up with the extra
    # 'Mean' label passed to create_dataframe() below.
    feature_accuracies.append(np.mean(feature_accuracies))
    accuracy_data[feature] = feature_accuracies

nn_single = common.create_dataframe(accuracy_data, speakers + ['Mean'])

Features used: ste_10_10_norm

Speaker:    1
X_train:    (10, 10).float64        y_train:    (10, 10).int32
X_val:      (30, 10).float64        y_val:      (30, 10).int32
X_test:     (30, 10).float64        y_test:     (30, 10).int32

Starting training...
Epoch 50 of 1500 took 0.002s
	training loss:			2.222576
	validation loss:		2.260372
	validation accuracy:		26.67 %
Epoch 100 of 1500 took 0.002s
	training loss:			2.103786
	validation loss:		2.194621
	validation accuracy:		43.33 %
Epoch 150 of 1500 took 0.002s
	training loss:			1.865358
	validation loss:		2.060201
	validation accuracy:		40.00 %
Epoch 200 of 1500 took 0.004s
	training loss:			1.441847
	validation loss:		1.822147
	validation accuracy:		46.67 %
Epoch 250 of 1500 took 0.002s
	training loss:			1.016091
	validation loss:		1.605987
	validation accuracy:		46.67 %
Epoch 300 of 1500 took 0.002s
	training loss:			0.731621
	validation loss:		1.504786
	validation accuracy:		46.67 %
Epoch 350 of 1500 took 0.002s
	training loss:			0

In [5]:
# Display the single-speaker accuracy table (one row per feature set, one
# column per speaker plus 'Mean').
nn_single

Unnamed: 0,1,2,3,4,5,6,Mean
log_fb_en_25_10_ham_deltas_norm,76.666667,80.0,73.333333,73.333333,53.333333,73.333333,71.666667
log_fb_en_25_10_ham_norm,86.666667,93.333333,93.333333,96.666667,93.333333,90.0,92.222222
mfcc_25_10_ham_deltas_norm,80.0,70.0,83.333333,86.666667,70.0,86.666667,79.444444
mfcc_25_10_ham_norm,93.333333,96.666667,96.666667,96.666667,90.0,90.0,93.888889
ste_10_10_norm,50.0,23.333333,46.666667,36.666667,33.333333,53.333333,40.555556
ste_sti_stzcr_10_10_norm,93.333333,63.333333,86.666667,86.666667,63.333333,83.333333,79.444444
sti_10_10_norm,56.666667,46.666667,60.0,63.333333,40.0,50.0,52.777778
stzcr_10_10_norm,63.333333,50.0,46.666667,43.333333,30.0,53.333333,47.777778


In [6]:
# Write the single-speaker accuracy table to the results directory as CSV.
csv_name = os.path.join(results_dir, 'nn_single_speaker.csv')
with open(csv_name, mode='w') as fw:
    nn_single.to_csv(fw)

## All speakers

In [13]:
# All-speakers experiment: for every feature set, pool the reference and test
# keys of all speakers into one dataset, obtain a single network (cached
# parameters if present, otherwise train) and record its recognition accuracy.
accuracy_data = {}

for feature in features:
    print('{0: <15}{1}\n'.format('Features used:', feature))
    distance_matrix, ref_keys_all, test_keys_all = common.get_data(dm_dir, feature)

    # Concatenate the per-speaker key lists in speaker order.
    ref_keys = []
    test_keys = []

    for speaker in speakers:
        ref_keys.extend(common.get_speaker_keys(ref_keys_all, speaker))
        test_keys.extend(common.get_speaker_keys(test_keys_all, speaker))

    # NOTE(review): the same test_keys are passed twice, so the validation and
    # test splits appear to be identical (their printed shapes match). Confirm
    # nn.train() only reports on the validation split.
    X_train, y_train, X_val, y_val, X_test, y_test = nn.prepare_dataset(
        ref_keys, ref_keys, test_keys, test_keys, distance_matrix, references)

    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    params_file = os.path.join(params_dir, feature + '_all.npz')
    try:
        # np.savez() below stores positional arrays as arr_0, arr_1, ...;
        # read them back in that order.
        with np.load(params_file) as fr:
            param_values = [fr['arr_%d' % i] for i in range(len(fr.files))]
    except IOError:
        # No readable cache: train from scratch. Only the cache read is
        # guarded, so an IOError raised elsewhere is not mistaken for a
        # missing cache.
        print('Starting training...')
        network = nn.train(num_epochs, batch_count, X_train, y_train, X_val, y_val,
                           input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
        if save_params:
            np.savez(params_file, *lasagne.layers.get_all_param_values(network))
    else:
        print('Loading model parameters...')
        network = nn.build_mlp(input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
        lasagne.layers.set_all_param_values(network, param_values)

    print('\nComputing recognition accuracy...\n')
    recognition = nn.recognize(X_test, network)
    accuracy = nn.compute_accuracy(y_test, recognition)
    # One scalar per feature set; it fills the single 'All speakers' column.
    accuracy_data[feature] = accuracy

nn_all = common.create_dataframe(accuracy_data, ['All speakers'])

Features used: ste_10_10_norm

X_train:    (60, 60).float64        y_train:    (60, 60).int32
X_val:     (180, 60).float64        y_val:     (180, 60).int32
X_test:    (180, 60).float64        y_test:    (180, 60).int32

Starting training...
Epoch 50 of 1500 took 0.016s
	training loss:			1.372280
	validation loss:		1.725393
	validation accuracy:		33.33 %
Epoch 100 of 1500 took 0.016s
	training loss:			0.973356
	validation loss:		1.714517
	validation accuracy:		37.78 %
Epoch 150 of 1500 took 0.016s
	training loss:			0.517709
	validation loss:		1.790333
	validation accuracy:		45.56 %
Epoch 200 of 1500 took 0.019s
	training loss:			0.247783
	validation loss:		2.127540
	validation accuracy:		43.89 %
Epoch 250 of 1500 took 0.019s
	training loss:			0.126071
	validation loss:		2.243559
	validation accuracy:		47.78 %
Epoch 300 of 1500 took 0.016s
	training loss:			0.071688
	validation loss:		2.406993
	validation accuracy:		47.78 %
Epoch 350 of 1500 took 0.018s
	training loss:			0.037203
	valid

In [14]:
# Display the pooled accuracy table (one row per feature set, a single
# 'All speakers' column).
nn_all

Unnamed: 0,All speakers
log_fb_en_25_10_ham_deltas_norm,10.0
log_fb_en_25_10_ham_norm,87.222222
mfcc_25_10_ham_deltas_norm,10.0
mfcc_25_10_ham_norm,10.0
ste_10_10_norm,51.666667
ste_sti_stzcr_10_10_norm,77.777778
sti_10_10_norm,60.555556
stzcr_10_10_norm,58.333333


In [15]:
# Write the all-speakers accuracy table to the results directory as CSV.
csv_name = os.path.join(results_dir, 'nn_all_speakers.csv')
with open(csv_name, mode='w') as fw:
    nn_all.to_csv(fw)

## All test data against one speaker

In [10]:
# Cross-speaker experiment: for every feature set, build one dataset per
# speaker from that speaker's reference keys and the pooled test keys of ALL
# speakers, obtain a network (cached parameters if present, otherwise train)
# and record its recognition accuracy.
accuracy_data = {}

for feature in features:
    print('{0: <15}{1}\n'.format('Features used:', feature))
    distance_matrix, ref_keys_all, test_keys_all = common.get_data(dm_dir, feature)

    feature_accuracies = []

    # Pool the test keys of every speaker once per feature set; the same pooled
    # list is reused for each per-speaker network below.
    test_keys = []
    for speaker in speakers:
        test_keys.extend(common.get_speaker_keys(test_keys_all, speaker))

    for speaker in speakers:
        print('{0: <12}{1}'.format('Speaker:', speaker))
        ref_keys = common.get_speaker_keys(ref_keys_all, speaker)

        # NOTE(review): the same test_keys are passed twice, so the validation
        # and test splits appear to be identical (their printed shapes match).
        # Confirm nn.train() only reports on the validation split.
        X_train, y_train, X_val, y_val, X_test, y_test = nn.prepare_dataset(
            ref_keys, ref_keys, test_keys, test_keys, distance_matrix, references)

        input_dim = X_train.shape[1]
        output_dim = y_train.shape[1]

        # NOTE(review): this is the same file name the single-speaker section
        # uses, so with save_params enabled the two sections share cached
        # parameters -- confirm that is intended.
        params_file = os.path.join(params_dir, feature + '_' + speaker + '.npz')
        try:
            # np.savez() below stores positional arrays as arr_0, arr_1, ...;
            # read them back in that order.
            with np.load(params_file) as fr:
                param_values = [fr['arr_%d' % i] for i in range(len(fr.files))]
        except IOError:
            # No readable cache: train from scratch. Only the cache read is
            # guarded, so an IOError raised elsewhere is not mistaken for a
            # missing cache.
            print('Starting training...')
            network = nn.train(num_epochs, batch_count, X_train, y_train, X_val, y_val,
                               input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            if save_params:
                np.savez(params_file, *lasagne.layers.get_all_param_values(network))
        else:
            print('Loading model parameters...')
            network = nn.build_mlp(input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            lasagne.layers.set_all_param_values(network, param_values)

        print('\nComputing recognition accuracy...\n')
        recognition = nn.recognize(X_test, network)
        accuracy = nn.compute_accuracy(y_test, recognition)
        feature_accuracies.append(accuracy)

    # The trailing entry is the mean over speakers; it lines up with the extra
    # 'Mean' label passed to create_dataframe() below.
    feature_accuracies.append(np.mean(feature_accuracies))
    accuracy_data[feature] = feature_accuracies

nn_all_test_per_speaker = common.create_dataframe(accuracy_data, speakers + ['Mean'])

Features used: ste_10_10_norm

Speaker:    1
X_train:    (10, 10).float64        y_train:    (10, 10).int32
X_val:     (180, 10).float64        y_val:     (180, 10).int32
X_test:    (180, 10).float64        y_test:    (180, 10).int32

Starting training...
Epoch 50 of 1500 took 0.007s
	training loss:			2.207175
	validation loss:		2.275183
	validation accuracy:		19.44 %
Epoch 100 of 1500 took 0.006s
	training loss:			2.062704
	validation loss:		2.233069
	validation accuracy:		18.89 %
Epoch 150 of 1500 took 0.006s
	training loss:			1.780459
	validation loss:		2.171182
	validation accuracy:		17.78 %
Epoch 200 of 1500 took 0.006s
	training loss:			1.336390
	validation loss:		2.161606
	validation accuracy:		20.00 %
Epoch 250 of 1500 took 0.007s
	training loss:			0.951066
	validation loss:		2.289918
	validation accuracy:		19.44 %
Epoch 300 of 1500 took 0.006s
	training loss:			0.695432
	validation loss:		2.528377
	validation accuracy:		23.89 %
Epoch 350 of 1500 took 0.006s
	training loss:			0

In [11]:
# Display the cross-speaker accuracy table (one row per feature set, one
# column per speaker plus 'Mean').
nn_all_test_per_speaker

Unnamed: 0,1,2,3,4,5,6,Mean
log_fb_en_25_10_ham_deltas_norm,55.0,32.222222,47.222222,36.111111,36.666667,51.666667,43.148148
log_fb_en_25_10_ham_norm,70.0,68.888889,82.222222,70.555556,80.0,85.0,76.111111
mfcc_25_10_ham_deltas_norm,61.666667,63.333333,68.888889,52.777778,60.0,67.777778,62.407407
mfcc_25_10_ham_norm,67.222222,76.111111,68.333333,68.333333,61.111111,75.0,69.351852
ste_10_10_norm,25.555556,23.333333,25.555556,28.333333,21.111111,25.0,24.814815
ste_sti_stzcr_10_10_norm,56.666667,47.222222,65.555556,61.111111,48.333333,63.888889,57.12963
sti_10_10_norm,24.444444,32.222222,31.111111,32.222222,28.888889,35.0,30.648148
stzcr_10_10_norm,45.0,42.222222,44.444444,38.888889,42.777778,46.111111,43.240741


In [12]:
# Write the cross-speaker accuracy table to the results directory as CSV.
csv_name = os.path.join(results_dir, 'nn_all_test_per_speaker.csv')
with open(csv_name, mode='w') as fw:
    nn_all_test_per_speaker.to_csv(fw)