In [1]:
import os
import glob
from os.path import join
import librosa
import numpy as np
import pyworld as pw
import tensorflow as tf

In [2]:
args = tf.app.flags.FLAGS

In [3]:
name_lists = ['lewis', 'kevin', 'gold']
data_path = join(".", "data")

sampling_rate = 16000
num_mcep = 24
frame_period = 5.0
n_frames = 32 #org_value 128
lambda_cycle = 10
lambda_identity = 5
learning_rate = 0.05

In [4]:
def load_vocab():
    name2id = {char: idx for idx, char in enumerate(name_lists)}
    id2name = {idx: char for idx, char in enumerate(name_lists)}
    
    return name2id, id2name

In [5]:
name2id, id2name = load_vocab()

In [6]:
x = list()
y = list()

In [7]:
def world_decompose(wav, fs, frame_period = 5.0):

    # Decompose speech signal into f0, spectral envelope and aperiodicity using WORLD
    wav = wav.astype(np.float64)
    f0, timeaxis = pw.harvest(wav, fs, frame_period = frame_period, f0_floor = 71.0, f0_ceil = 800.0)
    sp = pw.cheaptrick(wav, f0, timeaxis, fs)
    ap = pw.d4c(wav, f0, timeaxis, fs)

    return f0, timeaxis, sp, ap

In [8]:
def world_encode_spectral_envelop(sp, fs, dim = 24):

    # Get Mel-cepstral coefficients (MCEPs)

    #sp = sp.astype(np.float64)
    coded_sp = pw.code_spectral_envelope(sp, fs, dim)

    return coded_sp

In [9]:
def logf0_statistics(f0s):

    log_f0s_concatenated = np.ma.log(np.concatenate(f0s))
    log_f0s_mean = log_f0s_concatenated.mean()
    log_f0s_std = log_f0s_concatenated.std()

    return log_f0s_mean, log_f0s_std

In [10]:
def coded_sps_normalization_fit_transoform(coded_sps):

    coded_sps_concatenated = np.concatenate(coded_sps, axis = 1)
    coded_sps_mean = np.mean(coded_sps_concatenated, axis = 1, keepdims = True)
    coded_sps_std = np.std(coded_sps_concatenated, axis = 1, keepdims = True)

    coded_sps_normalized = list()
    for coded_sp in coded_sps:
        coded_sps_normalized.append((coded_sp - coded_sps_mean) / coded_sps_std)
    
    return coded_sps_normalized, coded_sps_mean, coded_sps_std

In [11]:
def transpose_in_list(lst):

    transposed_lst = list()
    for array in lst:
        transposed_lst.append(array.T)
    return transposed_lst

In [12]:
for name in name_lists:
    path = join(data_path, name)
    wave_file_names = [f for f in os.listdir(path) if f.endswith(".wav")]
    
    for wave_file_name in wave_file_names:
        path = join(data_path, name, wave_file_name)
        
        wav, _ = librosa.load(path, sr = sampling_rate, mono = True)
        
        f0, timeaxis, sp, ap = world_decompose(wav, sampling_rate)

        tmp_y = np.zeros(len(name_lists))
        
        tmp_y[name2id[name]] = 1
        
        for i in range(len(ap) - n_frames):
            tmp_x = sp[i:i + n_frames + 1, :]
            tmp_x = world_encode_spectral_envelop(tmp_x, sampling_rate).T
            
            x.append(tmp_x)
            y.append(tmp_y)

KeyboardInterrupt: 

In [13]:
from scipy.misc import toimage

for i in range(10):
    toimage(x[i]).show()

print(sp.shape)
print(ap.shape)

`toimage` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use Pillow's ``Image.fromarray`` directly instead.
  after removing the cwd from sys.path.


(735, 513)
(735, 513)


In [19]:
np.save("x.npy", x)

In [20]:
np.save("y.npy", y)

In [12]:
#load cache data
x, y = np.load("x.npy"), np.load("y.npy")

In [13]:
x.shape, y.shape

((747203, 24, 33), (747203, 3))

In [14]:
x = x.reshape(len(x), 24 * 33)

In [15]:
input_X = tf.placeholder(tf.float32, [None, 24 * 33])

In [16]:
input_Y = tf.placeholder(tf.float32, [None, 3])

In [17]:
layer_size_1 = 28
layer_size_2 = 14
layer_size_3 = 3

weight_1 = tf.Variable(tf.truncated_normal([24 * 33, layer_size_1]))
weight_2 = tf.Variable(tf.truncated_normal([layer_size_1, layer_size_2]))
weight_3 = tf.Variable(tf.truncated_normal([layer_size_2, layer_size_3]))

biases_1 = tf.Variable(tf.truncated_normal([layer_size_1]))
biases_2 = tf.Variable(tf.truncated_normal([layer_size_2]))
biases_3 = tf.Variable(tf.truncated_normal([layer_size_3]))

layer_1 = tf.add(tf.matmul(input_X, weight_1), biases_1)
layer_1 = tf.nn.sigmoid(layer_1)

layer_2 = tf.add(tf.matmul(layer_1, weight_2), biases_2)
layer_2 = tf.nn.sigmoid(layer_2)

layer_3 = tf.add(tf.matmul(layer_2, weight_3), biases_3)
layer_3 = tf.nn.sigmoid(layer_3)

In [18]:
_f = layer_3
f = input_Y

In [19]:
#loss = tf.reduce_mean(tf.pow(f - _f, 2))
#optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=_f, labels=f))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

W0603 01:00:02.388820  3780 deprecation.py:323] From <ipython-input-19-be715e9a21ba>:4: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [20]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [21]:
feed_indexs = np.arange(len(x))

In [22]:
x[feed_indexs[0:5000]].shape

(5000, 792)

In [44]:
for i in range(1000):
    np.random.shuffle(feed_indexs)
    sess.run(optimizer, feed_dict={input_X: x[feed_indexs[0:5000]], input_Y: y[feed_indexs[0:5000]]})
    
    if i % 100 == 0:
        print (sess.run(loss, feed_dict={input_X: [x[feed_indexs[0]]], input_Y: [y[feed_indexs[0]]]}))

0.55144656
0.5515969
0.551453
0.55144644
0.55149627
0.55144554
0.5514911
1.5514435
0.5515671
0.55253834


In [45]:
from random import randint

for i in range(10):
    random_pick = randint(0, len(x))

    print (np.argmax(sess.run(layer_3, feed_dict={input_X: [x[random_pick]], input_Y: [y[random_pick]]})) == np.argmax(y[random_pick]))

False
True
True
False
True
True
True
False
True
True
