In [1]:
import os
import numpy as np
import theano
import lasagne

import common as common
import nn_utilities as nn

In [2]:
current_dir = os.getcwd()
results_dir = os.path.join(current_dir, '..', 'results')
data_dir = os.path.join(current_dir, '..', 'data')
samples_dir = os.path.join(data_dir, 'segmented')
dm_dir = os.path.join(data_dir, 'distance_matrices', 'dtw')
params_dir = os.path.join(data_dir, 'params')

print 'Current directory:\t', current_dir
print 'Results directory:\t', results_dir
print 'Data directory:\t\t', data_dir
print 'Samples directory:\t', samples_dir
print 'DM directory:\t\t', dm_dir
print 'Params directory:\t', params_dir

references = common.load_references(samples_dir)
features = ['ste_10_10_norm', 'sti_10_10_norm', 'stzcr_10_10_norm', 'ste_sti_stzcr_10_10_norm', 'log_fb_en_25_10_ham_norm', 'log_fb_en_25_10_ham_deltas_norm', 'mfcc_25_10_ham_norm', 'mfcc_25_10_ham_deltas_norm']

Current directory:	/Users/martin.majer/PycharmProjects/BP/scripts
Results directory:	/Users/martin.majer/PycharmProjects/BP/scripts/../results
Data directory:		/Users/martin.majer/PycharmProjects/BP/scripts/../data
Samples directory:	/Users/martin.majer/PycharmProjects/BP/scripts/../data/segmented
DM directory:		/Users/martin.majer/PycharmProjects/BP/scripts/../data/distance_matrices/dtw
Params directory:	/Users/martin.majer/PycharmProjects/BP/scripts/../data/params


In [3]:
speakers = [str(x + 1) for x in xrange(6)]

num_units = 200
depth = 2
drop_input=None
drop_hidden=None

num_epochs = 1500
batch_count = 10

save_params=False

if save_params:
    nn.dump_info(os.path.join(params_dir, 'nn_model.txt'), depth, num_units, drop_input, drop_hidden)

## Single speaker

In [4]:
accuracy_data = {}

for feature in features:
    print('{0: <15}{1}\n'.format('Features used:', feature))
    distance_matrix, ref_keys_all, test_keys_all = common.get_data(dm_dir, feature)
    
    feature_accuracies = []
    
    for speaker in speakers:
        print('{0: <12}{1}'.format('Speaker:', speaker))
        ref_keys = common.get_speaker_keys(ref_keys_all, speaker)
        test_keys = common.get_speaker_keys(test_keys_all, speaker)
        
        X_train, y_train, X_val, y_val, X_test, y_test = nn.prepare_dataset(ref_keys, ref_keys, test_keys, test_keys, distance_matrix, references)
        
        input_dim = X_train.shape[1]
        output_dim = y_train.shape[1]
        
        params_file = os.path.join(params_dir, feature + '_' + speaker + '.npz')
        try:
            with np.load(params_file) as fr:
                param_values = [fr['arr_%d' % i] for i in range(len(fr.files))]
            print 'Loading model parameters...'
            
            network = nn.build_mlp(input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            lasagne.layers.set_all_param_values(network, param_values)
        except IOError:
            print 'Starting training...'
            network = nn.train(num_epochs, batch_count, X_train, y_train, X_val, y_val,
                               input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            if save_params:
                np.savez(params_file, *lasagne.layers.get_all_param_values(network))
        
        print '\nComputing recognition accuracy...\n'
        recognition = nn.recognize(X_test, network)
        accuracy = nn.compute_accuracy(y_test, recognition)
        feature_accuracies.append(accuracy)
        
    feature_accuracies.append(np.mean(feature_accuracies))
    accuracy_data[feature] = feature_accuracies
    
nn_single = common.create_dataframe(accuracy_data, speakers + ['Mean'])

Features used: ste_10_10_norm

Speaker:    1
X_train:    (10, 10).float64        y_train:    (10, 10).int32
X_val:      (30, 10).float64        y_val:      (30, 10).int32
X_test:     (30, 10).float64        y_test:     (30, 10).int32

Starting training...
Epoch 50 of 1500 took 0.002s
	training loss:			2.193079
	validation loss:		2.249938
	validation accuracy:		43.33 %
Epoch 100 of 1500 took 0.002s
	training loss:			2.028868
	validation loss:		2.162551
	validation accuracy:		40.00 %
Epoch 150 of 1500 took 0.003s
	training loss:			1.703952
	validation loss:		1.987834
	validation accuracy:		36.67 %
Epoch 200 of 1500 took 0.002s
	training loss:			1.253736
	validation loss:		1.746005
	validation accuracy:		36.67 %
Epoch 250 of 1500 took 0.002s
	training loss:			0.889375
	validation loss:		1.582684
	validation accuracy:		33.33 %
Epoch 300 of 1500 took 0.007s
	training loss:			0.648846
	validation loss:		1.519761
	validation accuracy:		46.67 %
Epoch 350 of 1500 took 0.002s
	training loss:			0

In [5]:
nn_single

Unnamed: 0,1,2,3,4,5,6,Mean
log_fb_en_25_10_ham_deltas_norm,86.666667,73.333333,80.0,66.666667,66.666667,76.666667,75.0
log_fb_en_25_10_ham_norm,86.666667,93.333333,93.333333,100.0,93.333333,90.0,92.777778
mfcc_25_10_ham_deltas_norm,93.333333,73.333333,76.666667,86.666667,83.333333,86.666667,83.333333
mfcc_25_10_ham_norm,93.333333,96.666667,93.333333,96.666667,86.666667,90.0,92.777778
ste_10_10_norm,53.333333,20.0,50.0,36.666667,36.666667,53.333333,41.666667
ste_sti_stzcr_10_10_norm,96.666667,66.666667,76.666667,86.666667,66.666667,73.333333,77.777778
sti_10_10_norm,53.333333,50.0,56.666667,53.333333,46.666667,56.666667,52.777778
stzcr_10_10_norm,53.333333,50.0,50.0,46.666667,30.0,56.666667,47.777778


In [6]:
csv_name = os.path.join(results_dir, 'nn_single_speaker.csv')

with open(csv_name, 'w') as fw:
    nn_single.to_csv(fw)

## All speakers

In [7]:
accuracy_data = {}

for feature in features:
    print('{0: <15}{1}\n'.format('Features used:', feature))
    distance_matrix, ref_keys_all, test_keys_all = common.get_data(dm_dir, feature)
    
    ref_keys = []
    test_keys = []  
    
    for speaker in speakers:
        ref_keys.extend(common.get_speaker_keys(ref_keys_all, speaker))
        test_keys.extend(common.get_speaker_keys(test_keys_all, speaker))
    
    X_train, y_train, X_val, y_val, X_test, y_test = nn.prepare_dataset(ref_keys, ref_keys, test_keys, test_keys, distance_matrix, references)

    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    params_file = os.path.join(params_dir, feature + '_all.npz')
    try:
        with np.load(params_file) as fr:
            param_values = [fr['arr_%d' % i] for i in range(len(fr.files))]
        print 'Loading model parameters...'

        network = nn.build_mlp(input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
        lasagne.layers.set_all_param_values(network, param_values)
    except IOError:
        print 'Starting training...'
        network = nn.train(num_epochs, batch_count, X_train, y_train, X_val, y_val,
                           input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
        if save_params:
            np.savez(params_file, *lasagne.layers.get_all_param_values(network))

    print '\nComputing recognition accuracy...\n'
    recognition = nn.recognize(X_test, network)
    accuracy = nn.compute_accuracy(y_test, recognition)
    accuracy_data[feature] = accuracy
    
nn_all = common.create_dataframe(accuracy_data, ['All speakers'])

Features used: ste_10_10_norm

X_train:    (60, 60).float64        y_train:    (60, 60).int32
X_val:     (180, 60).float64        y_val:     (180, 60).int32
X_test:    (180, 60).float64        y_test:    (180, 60).int32

Starting training...
Epoch 50 of 1500 took 0.016s
	training loss:			1.494263
	validation loss:		1.707279
	validation accuracy:		30.56 %
Epoch 100 of 1500 took 0.017s
	training loss:			0.999014
	validation loss:		1.786033
	validation accuracy:		35.00 %
Epoch 150 of 1500 took 0.018s
	training loss:			0.558275
	validation loss:		1.742010
	validation accuracy:		43.33 %
Epoch 200 of 1500 took 0.018s
	training loss:			0.288762
	validation loss:		2.017512
	validation accuracy:		46.67 %
Epoch 250 of 1500 took 0.016s
	training loss:			0.126272
	validation loss:		2.154541
	validation accuracy:		48.33 %
Epoch 300 of 1500 took 0.016s
	training loss:			0.069047
	validation loss:		2.356900
	validation accuracy:		48.89 %
Epoch 350 of 1500 took 0.017s
	training loss:			0.037791
	valid

In [8]:
nn_all

Unnamed: 0,All speakers
log_fb_en_25_10_ham_deltas_norm,10.0
log_fb_en_25_10_ham_norm,91.111111
mfcc_25_10_ham_deltas_norm,10.0
mfcc_25_10_ham_norm,65.0
ste_10_10_norm,52.222222
ste_sti_stzcr_10_10_norm,77.222222
sti_10_10_norm,57.222222
stzcr_10_10_norm,58.888889


In [9]:
csv_name = os.path.join(results_dir, 'nn_all_speakers.csv')

with open(csv_name, 'w') as fw:
    nn_all.to_csv(fw)

## All test data against one speaker

In [10]:
accuracy_data = {}

for feature in features:
    print('{0: <15}{1}\n'.format('Features used:', feature))
    distance_matrix, ref_keys_all, test_keys_all = common.get_data(dm_dir, feature)
    
    feature_accuracies = []
    test_keys = []  
    
    for speaker in speakers:
        test_keys.extend(common.get_speaker_keys(test_keys_all, speaker))
    
    for speaker in speakers:
        print('{0: <12}{1}'.format('Speaker:', speaker))
        ref_keys = common.get_speaker_keys(ref_keys_all, speaker)
        
        X_train, y_train, X_val, y_val, X_test, y_test = nn.prepare_dataset(ref_keys, ref_keys, test_keys, test_keys, distance_matrix, references)
        
        input_dim = X_train.shape[1]
        output_dim = y_train.shape[1]
        
        params_file = os.path.join(params_dir, feature + '_' + speaker + '.npz')
        try:
            with np.load(params_file) as fr:
                param_values = [fr['arr_%d' % i] for i in range(len(fr.files))]
            print 'Loading model parameters...'
            
            network = nn.build_mlp(input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            lasagne.layers.set_all_param_values(network, param_values)
        except IOError:
            print 'Starting training...'
            network = nn.train(num_epochs, batch_count, X_train, y_train, X_val, y_val,
                               input_dim, output_dim, depth, num_units, drop_input, drop_hidden)
            if save_params:
                np.savez(params_file, *lasagne.layers.get_all_param_values(network))
        
        print '\nComputing recognition accuracy...\n'
        recognition = nn.recognize(X_test, network)
        accuracy = nn.compute_accuracy(y_test, recognition)
        feature_accuracies.append(accuracy)
        
    feature_accuracies.append(np.mean(feature_accuracies))
    accuracy_data[feature] = feature_accuracies
    
nn_all_test_per_speaker = common.create_dataframe(accuracy_data, speakers + ['Mean'])

Features used: ste_10_10_norm

Speaker:    1
X_train:    (10, 10).float64        y_train:    (10, 10).int32
X_val:     (180, 10).float64        y_val:     (180, 10).int32
X_test:    (180, 10).float64        y_test:    (180, 10).int32

Starting training...
Epoch 50 of 1500 took 0.007s
	training loss:			2.214124
	validation loss:		2.272501
	validation accuracy:		17.22 %
Epoch 100 of 1500 took 0.007s
	training loss:			2.087804
	validation loss:		2.229533
	validation accuracy:		22.22 %
Epoch 150 of 1500 took 0.006s
	training loss:			1.834131
	validation loss:		2.158651
	validation accuracy:		21.11 %
Epoch 200 of 1500 took 0.006s
	training loss:			1.398926
	validation loss:		2.109506
	validation accuracy:		19.44 %
Epoch 250 of 1500 took 0.006s
	training loss:			0.971288
	validation loss:		2.192017
	validation accuracy:		18.33 %
Epoch 300 of 1500 took 0.006s
	training loss:			0.686128
	validation loss:		2.405707
	validation accuracy:		21.11 %
Epoch 350 of 1500 took 0.007s
	training loss:			0

In [11]:
nn_all_test_per_speaker

Unnamed: 0,1,2,3,4,5,6,Mean
log_fb_en_25_10_ham_deltas_norm,57.777778,49.444444,53.333333,57.222222,42.222222,48.888889,51.481481
log_fb_en_25_10_ham_norm,70.0,74.444444,80.0,74.444444,80.0,85.555556,77.407407
mfcc_25_10_ham_deltas_norm,63.888889,58.888889,63.888889,54.444444,62.777778,67.222222,61.851852
mfcc_25_10_ham_norm,68.333333,72.777778,68.333333,66.666667,68.888889,78.333333,70.555556
ste_10_10_norm,23.333333,19.444444,26.111111,28.333333,22.222222,25.555556,24.166667
ste_sti_stzcr_10_10_norm,56.666667,46.666667,64.444444,58.333333,58.333333,66.111111,58.425926
sti_10_10_norm,27.222222,33.888889,30.0,33.888889,27.777778,35.555556,31.388889
stzcr_10_10_norm,45.555556,40.555556,45.555556,45.555556,40.555556,44.444444,43.703704


In [12]:
csv_name = os.path.join(results_dir, 'nn_all_test_per_speaker.csv')

with open(csv_name, 'w') as fw:
    nn_all_test_per_speaker.to_csv(fw)