In [1]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import librosa
import scipy as sp
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras import Model

In [2]:
# --- Audio / spectrogram settings -------------------------------------------
n_mels = 80              # number of mel bands requested from the mel filterbank
fs = 22050               # sampling rate (Hz) used for all frame-size maths
frame_length_ms = 50     # analysis window length in milliseconds
frame_shift_ms = 12.5    # shift between consecutive windows in milliseconds

# Frame sizes in samples. Everything is derived from `fs` so that changing the
# sampling rate in one place keeps window, hop and overlap consistent.
nsc = int(fs * frame_length_ms / 1000)   # samples per window (STFT nperseg)
nhop = int(fs * frame_shift_ms / 1000)   # samples per hop
nov = nsc - nhop                         # samples of overlap (STFT noverlap)

eps = 1e-10      # floor applied to magnitudes before taking log10
db_ref = 100     # offset/scale used as (dB + db_ref) / db_ref when normalising

# --- Text settings ----------------------------------------------------------
# A character's token id is its position in this string.
chars = ' ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;?'

num_tokens = len(chars)

# --- Model sizes ------------------------------------------------------------
embed_size = 256             # character embedding dimension

K = 16                       # number of Conv1D filters in the bank (kernel sizes 1..K)

num_conv1d_filters = 128     # output channels of each filter in the bank

prenet_size = [256, 128]     # units of the two pre-net Dense layers

conv_proj_size = [128, 128]  # filters of the two projection Conv1D layers

attention_size = 256         # units passed to BahdanauAttention


In [4]:
meta_path = "../datasets/metadata.csv"

# Only this many examples are used by the rest of the notebook.
num_examples = 32

# Parse the pipe-separated metadata line by line, keeping only the first two
# fields of each row. Blank lines are skipped and short rows are reported
# explicitly; otherwise np.array would receive ragged rows and the 2-D result
# the later cells index into (metadata[i, 1]) could not be built.
metadata_rows = []
with open(meta_path, encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        fields = line.strip().split('|')
        if fields == ['']:
            continue
        if len(fields) < 2:
            raise ValueError(
                'Malformed metadata line {} in {}: expected at least 2 '
                '"|"-separated fields'.format(line_no, meta_path))
        metadata_rows.append(fields[:2])
        # Stop early: nothing past the first `num_examples` rows is used.
        if len(metadata_rows) == num_examples:
            break

# reshape keeps the array 2-D even when no rows were read.
metadata = np.array(metadata_rows, dtype=str).reshape(-1, 2)

In [5]:
# One "<first field>.wav" file name per metadata row, in metadata order.
wave_name_list = ['{}.wav'.format(row[0]) for row in metadata]

In [1]:
# Scratch cell: a bare attribute lookup of Keras' pad_sequences helper.
# Nothing is assigned or called here, so it has no effect on notebook state.
# NOTE(review): the recorded NameError suggests this cell was run on a kernel
# where `tf` had not been imported yet — confirm and consider removing it.
tf.keras.preprocessing.sequence.pad_sequences

NameError: name 'tf' is not defined

In [6]:
# Preprocessing step
#
# For every wav file listed in `wave_name_list`, compute a normalised
# log-magnitude spectrogram and a normalised log-mel spectrogram and save each
# as a .npy file (same base name as the wav) in its own folder.

data_folder = "../datasets/wavs"
specgram_folder = "../datasets/specgrams"
mel_folder = "../datasets/mels"

# np.save does not create missing directories, so make sure they exist.
for out_dir in (specgram_folder, mel_folder):
    os.makedirs(out_dir, exist_ok=True)

# The mel filterbank depends only on the configuration, not on the clip, so it
# is built once instead of once per file.
mel_filters = librosa.filters.mel(sr=fs, n_fft=nsc, n_mels=n_mels)

for wav_name in tqdm(wave_name_list):
    wav_path = os.path.join(data_folder, wav_name)

    # Swap only the extension (str.replace would touch every '.wav' occurrence).
    npy_name = os.path.splitext(wav_name)[0] + '.npy'

    specgram_path = os.path.join(specgram_folder, npy_name)
    mel_path = os.path.join(mel_folder, npy_name)

    # Load at the configured rate so the STFT and the mel filterbank (built
    # with sr=fs) are guaranteed to describe the same frequency axis.
    y, sr = librosa.load(wav_path, sr=fs)

    freqs, times, Zxx = sp.signal.stft(y, fs=sr, nperseg=nsc, noverlap=nov)

    # Magnitude spectrogram, floored at eps so log10 never sees zero.
    Sxx = np.maximum(np.abs(Zxx), eps)

    # To inspect a clip: plt.imshow(20 * np.log10(Sxx), origin='lower')

    mel_specgram = np.matmul(mel_filters, Sxx)

    # Convert to dB and map with (dB + db_ref) / db_ref.
    log_specgram = 20 * np.log10(Sxx)
    norm_log_specgram = (log_specgram + db_ref) / db_ref

    log_mel_specgram = 20 * np.log10(np.maximum(mel_specgram, eps))
    norm_log_mel_specgram = (log_mel_specgram + db_ref) / db_ref

    np.save(specgram_path, norm_log_specgram)
    np.save(mel_path, norm_log_mel_specgram)
    
#     print(norm_log_mel_specgram.shape[1])
    

#     plt.figure(figsize=(16,9))
#     plt.imshow(norm_log_specgram, origin='lower', aspect='auto')
#     plt.colorbar()
#     plt.show()

#     plt.figure(figsize=(16,9))
#     plt.imshow(norm_log_mel_specgram, origin='lower', aspect='auto')
#     plt.colorbar()
#     plt.show()    

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))




In [7]:
# Number of metadata rows tokenised and stacked into `batched_token` below.
batch_size = 4

In [8]:
def text2token(text, vocab=None):
    """Convert a string into an array of character token ids.

    Each character is mapped to its position in `vocab`. Characters that are
    not part of the vocabulary are mapped to -1 (the result of `str.find`).

    Args:
        text: The string to tokenise.
        vocab: String whose character positions define the token ids.
            Defaults to the notebook-level `chars` vocabulary.

    Returns:
        A 1-D integer numpy array with one id per character of `text`
        (shape `(len(text),)`, empty for an empty string).
    """
    if vocab is None:
        vocab = chars

    # Token ids are indices, so build them as integers rather than floats.
    return np.fromiter((vocab.find(char) for char in text),
                       dtype=np.int64,
                       count=len(text))

In [9]:
# Tokenise the text column (index 1) of the first `batch_size` metadata rows.
tokenized_list = [text2token(metadata[row, 1]) for row in range(batch_size)]

# Right-pad every sequence with zeros up to the longest one in the batch.
max_token_len = max(len(tokens) for tokens in tokenized_list)
batched_token = np.zeros((batch_size, max_token_len))

for row, tokens in enumerate(tokenized_list):
    batched_token[row, :len(tokens)] = tokens

In [10]:
input_length = max_token_len

In [11]:
batched_token.shape

(4, 155)

In [12]:
inputs = tf.keras.Input(shape=(None, ))
print(inputs.shape)

(None, None)


In [13]:
# Character embedding: one embed_size-dimensional vector per token id,
# with num_tokens rows in the lookup table.
embedding_layer = tf.keras.layers.Embedding(num_tokens, embed_size)

In [14]:
# Encoder pre-net: two ReLU Dense layers (prenet_size[0] then prenet_size[1]
# units), each followed by Dropout with rate 0.5.
encoder_prenet_layer_list = [
    tf.keras.layers.Dense(units=prenet_size[0],
                          activation='relu',
                          input_shape=(None, embed_size)),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(units=prenet_size[1], activation='relu'),
    tf.keras.layers.Dropout(rate=0.5),
]

encoder_prenet = tf.keras.Sequential(layers=encoder_prenet_layer_list,
                                     name='encoder_prenet')

In [15]:
# conv1d_filter_list = [tf.keras.layers.Conv1D(kernel_size=k+1,
#                                 filters=128,
#                                 activation='relu', 
#                                 padding='same',
#                                 input_shape=(None, 128)) for k in range(K)]

In [16]:
# intermediate_results = []

# for conv_k in conv1d_filter_list:
#     conv_k_result = conv_k(x)
#     intermediate_results.append(conv_k_result)
    
# outputs = tf.concat(intermediate_results, axis=-1)

In [17]:
# model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [18]:
# model(batched_token).shape

In [19]:
class ConvFilterBank(tf.keras.layers.Layer):
    """Bank of parallel Conv1D filters with kernel sizes 1..num_banks.

    Every filter is applied to the same input with 'same' padding and ReLU
    activation, followed by its own BatchNormalization. The per-filter results
    are concatenated along the last axis, giving
    `num_filters * num_banks` output channels.

    Args:
        num_banks: Number of filters in the bank; filter k uses kernel size
            k (1-based). Defaults to the notebook-level `K`.
        num_filters: Output channels of each filter. Defaults to the
            notebook-level `num_conv1d_filters`.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, num_banks=None, num_filters=None, **kwargs):
        super(ConvFilterBank, self).__init__(**kwargs)
        self.num_banks = K if num_banks is None else num_banks
        self.num_filters = (num_conv1d_filters if num_filters is None
                            else num_filters)
        self.num_outputs = self.num_filters * self.num_banks

    def build(self, input_shape):
        # No hard-coded input_shape: each Conv1D takes its input channels from
        # the tensor it is first called on.
        self.conv1d_filter_list = [
            tf.keras.layers.Conv1D(kernel_size=kernel_width,
                                   filters=self.num_filters,
                                   activation='relu',
                                   padding='same')
            for kernel_width in range(1, self.num_banks + 1)]
        self.batch_norm_list = [
            tf.keras.layers.BatchNormalization(trainable=True)
            for _ in range(self.num_banks)]
        super(ConvFilterBank, self).build(input_shape)

    def call(self, input_, training=None):
        """Apply every filter + batch norm to `input_` and concatenate.

        Args:
            input_: Tensor fed to every Conv1D in the bank.
            training: Forwarded to the BatchNormalization layers so they use
                batch statistics in training and moving averages in inference.

        Returns:
            Tensor with the filter outputs concatenated on the last axis.
        """
        intermediate_results = []

        for conv, batch_norm in zip(self.conv1d_filter_list,
                                    self.batch_norm_list):
            conv_result = conv(input_)
            intermediate_results.append(batch_norm(conv_result,
                                                   training=training))

        return tf.concat(intermediate_results, axis=-1)

In [20]:
conv1d_filter_bank = ConvFilterBank()

In [21]:
# Chain embedding -> pre-net -> conv filter bank so the bank can be run
# directly on token ids (see the call on batched_token below).
stacked_conv1d = tf.keras.Sequential([embedding_layer, encoder_prenet, conv1d_filter_bank])

In [22]:
stacked_conv1d.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         16128     
_________________________________________________________________
encoder_prenet (Sequential)  (None, None, 128)         98688     
_________________________________________________________________
conv_filter_bank (ConvFilter (None, None, 2048)        2238464   
Total params: 2,353,280
Trainable params: 2,349,184
Non-trainable params: 4,096
_________________________________________________________________


In [23]:
embedded_batch = embedding_layer(batched_token)
print(embedded_batch.shape)
prenet_result = encoder_prenet(embedded_batch)
print(prenet_result.shape)

(4, 155, 256)
(4, 155, 128)


In [24]:
filter_bank_result = stacked_conv1d(batched_token)

In [25]:
filter_bank_result.shape

TensorShape([4, 155, 2048])

In [26]:
tf.keras.layers.MaxPool1D()

<tensorflow.python.keras.layers.pooling.MaxPooling1D at 0x1d92fbc9898>

In [27]:
x = embedding_layer(inputs)
print(x.shape)
x = encoder_prenet(x)
print(x.shape)
x = conv1d_filter_bank(x)
print(x)

(None, None, 256)
(None, None, 128)
Tensor("conv_filter_bank_1/Identity:0", shape=(None, None, 2048), dtype=float32)


In [28]:
# Max-pool over 2 neighbouring steps with stride 1 and 'same' padding; the
# recorded output below shows the sequence length is unchanged (155 -> 155).
pooling_layer = tf.keras.layers.MaxPool1D(pool_size=2, strides=1, padding='same')

In [29]:
pooling_result = pooling_layer(filter_bank_result)

In [30]:
pooling_result.shape

TensorShape([4, 155, 2048])

In [31]:
# Projection stack applied after pooling: Conv1D(ReLU) -> BatchNorm -> Conv1D
# (no activation), all with kernel size 3 and 'same' padding.
filter_bank_channels = num_conv1d_filters * K

conv_proj_layer_list = [
    tf.keras.layers.Conv1D(filters=conv_proj_size[0],
                           kernel_size=3,
                           padding='same',
                           activation='relu',
                           input_shape=(None, filter_bank_channels)),
    tf.keras.layers.BatchNormalization(trainable=True),
    tf.keras.layers.Conv1D(filters=conv_proj_size[1],
                           kernel_size=3,
                           padding='same'),
]

conv_proj_layer = tf.keras.Sequential(layers=conv_proj_layer_list,
                                      name='conv_proj_layer')

In [32]:
conv_proj_layer(pooling_result).shape

TensorShape([4, 155, 128])

In [33]:
conv_proj_layer.summary()

Model: "conv_proj_layer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, None, 128)         786560    
_________________________________________________________________
batch_normalization (BatchNo (None, None, 128)         512       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         49280     
Total params: 836,352
Trainable params: 836,096
Non-trainable params: 256
_________________________________________________________________


In [34]:
# Wire the encoder front-end symbolically, printing the shape after each stage.
inputs = tf.keras.Input(shape=(None, ))
print(inputs.shape)

embedded_tokens = embedding_layer(inputs)
print(embedded_tokens.shape)

# Pre-net output; kept under its own name because it feeds the residual add.
encoder_outputs = encoder_prenet(embedded_tokens)
print(encoder_outputs.shape)

filter_bank_features = conv1d_filter_bank(encoder_outputs)
print(filter_bank_features.shape)

pooled_features = pooling_layer(filter_bank_features)
print(pooled_features.shape)

conv_proj_outputs = conv_proj_layer(pooled_features)
print(conv_proj_outputs.shape)

(None, None)
(None, None, 256)
(None, None, 128)
(None, None, 2048)
(None, None, 2048)
(None, None, 128)


In [35]:
# Residual connection: add the pre-net output back onto the conv projection
# output (both are printed above with a last dimension of 128).
residual_result = tf.add(encoder_outputs, conv_proj_outputs, name='residual_connection')

In [36]:
class HighwayLayer(tf.keras.layers.Layer):
    """Highway layer: output = H(x) * T(x) + x * (1 - T(x)).

    `H` is a ReLU Dense transform and `T` is a sigmoid Dense gate whose bias
    starts at -1.0. Both have `units` outputs, which must equal the last
    dimension of the input for the element-wise mix to be valid.

    Args:
        units: Width of the `H` and `T` Dense layers. Defaults to the
            notebook-level `conv_proj_size[-1]`.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, units=None, **kwargs):
        super(HighwayLayer, self).__init__(**kwargs)
        self.units = conv_proj_size[-1] if units is None else units

    def build(self, input_shape):
        # Dense infers its input width on first call, so no input_shape kwarg
        # is needed on either sub-layer.
        self.T = tf.keras.layers.Dense(units=self.units,
                                       activation='sigmoid',
                                       name='T',
                                       bias_initializer=tf.constant_initializer(-1.0))
        self.H = tf.keras.layers.Dense(units=self.units,
                                       activation='relu',
                                       name='H')
        super(HighwayLayer, self).build(input_shape)

    def call(self, input_):
        """Mix the transformed input with the raw input using the gate `T`.

        Args:
            input_: Tensor whose last dimension equals `units`.

        Returns:
            Tensor of the same shape as `input_`.
        """
        transformed = self.H(input_)
        # Evaluate the gate once and reuse it for both terms.
        gate = self.T(input_)
        return transformed * gate + input_ * (1 - gate)

In [37]:
# Stack of four independent highway layers applied one after another.
num_highway_layers = 4
highway_layer_list = [HighwayLayer() for _ in range(num_highway_layers)]

highway_network = tf.keras.Sequential(layers=highway_layer_list,
                                      name='highway_layer')

In [38]:
highway_result = highway_network(residual_result)

In [39]:
model = tf.keras.Model(inputs=inputs, outputs=highway_result)

In [40]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    16128       input_2[0][0]                    
__________________________________________________________________________________________________
encoder_prenet (Sequential)     (None, None, 128)    98688       embedding[2][0]                  
__________________________________________________________________________________________________
conv_filter_bank (ConvFilterBan (None, None, 2048)   2238464     encoder_prenet[3][0]             
______________________________________________________________________________________________

In [45]:
model(batched_token).shape

TensorShape([4, 155, 128])

In [80]:
# Bidirectional GRU (128 units per direction) returning the full output
# sequence; forward and backward outputs are concatenated on the last axis.
encoder_gru = tf.keras.layers.GRU(units=128,
                                  return_sequences=True,
                                  return_state=False)

bidirec_rnn_layer = tf.keras.layers.Bidirectional(layer=encoder_gru,
                                                  merge_mode="concat")

In [53]:
# rnn_outputs = bidirec_rnn_layer(highway_result)

In [81]:
def initialize_hidden_state(batch_size, enc_units):
    """Build a pair of all-zero state tensors.

    Args:
        batch_size: Size of the first dimension of each tensor.
        enc_units: Size of the second dimension of each tensor.

    Returns:
        A list with two `tf.zeros` tensors of shape `(batch_size, enc_units)`.
    """
    state_shape = (batch_size, enc_units)
    return [tf.zeros(state_shape) for _ in range(2)]

In [82]:
h = initialize_hidden_state(4, 128)

In [83]:
# Run the bidirectional GRU over the highway output, seeding it with the two
# zero tensors in `h`.
# NOTE(review): `h` is built for a batch of 4, and the recorded output shape is
# (4, None, 256), so this graph appears tied to that batch size — confirm
# whether the explicit initial_state is needed at all.
rnn_outputs = bidirec_rnn_layer(highway_result, initial_state = h)

In [84]:
rnn_outputs

<tf.Tensor 'bidirectional_5/Identity:0' shape=(4, None, 256) dtype=float32>

In [87]:
last_hidden = rnn_outputs[:, -1, :]

In [75]:
model = tf.keras.Model(inputs=inputs, outputs=rnn_outputs)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    16128       input_2[0][0]                    
__________________________________________________________________________________________________
encoder_prenet (Sequential)     (None, None, 128)    98688       embedding[2][0]                  
__________________________________________________________________________________________________
conv_filter_bank (ConvFilterBan (None, None, 2048)   2238464     encoder_prenet[3][0]             
____________________________________________________________________________________________

In [56]:
result = model(batched_token)

In [58]:
result.shape

TensorShape([4, 155, 256])

In [88]:
tf.expand_dims(result[1], 1).shape

TensorShape([155, 1, 256])

In [93]:
"""
https://www.tensorflow.org/beta/tutorials/text/nmt_with_attention

"""

class BahdanauAttention(tf.keras.Model):
    """Additive attention: score = V(tanh(W1(values) + W2(query))).

    Args:
        units: Output width of the `W1` and `W2` Dense projections.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute a context vector and the attention weights.

        Assumes `query` is (batch, features) and `values` is
        (batch, time, features) — TODO confirm against callers.

        Returns:
            Tuple `(context_vector, attention_weights)`: the weighted sum of
            `values` over axis 1, and the softmax (over axis 1) of the scores.
        """
        # Insert an axis at position 1 so the query broadcasts against values.
        query_with_time_axis = tf.expand_dims(query, 1)

        projected_values = self.W1(values)
        projected_query = self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected_values + projected_query))

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [None]:
class Decoder(tf.keras.Model):
    """Placeholder for the decoder model; not implemented yet."""


class DecoderPrenet(tf.keras.Model):
    """Placeholder for the decoder pre-net; not implemented yet."""

In [None]:
rnn_results = model(batched_token)

In [None]:
attention_layer = BahdanauAttention(attention_size)

In [None]:
# Apply attention with `states` as the query and `rnn_outputs` as the values.
# NOTE(review): `states` is not assigned in any visible cell, and the
# Bidirectional layer above is built with return_state=False, so this raises
# NameError on a fresh kernel — confirm the intended query tensor.
outputs = attention_layer(states, rnn_outputs)

In [None]:
# Same attention call on the concrete `rnn_results` batch; overwrites `outputs`.
# NOTE(review): `rnn_states` is not assigned in any visible cell, so this
# raises NameError on a fresh kernel — confirm the intended query tensor.
outputs = attention_layer(rnn_states, rnn_results)

In [None]:
outputs[0].shape

In [None]:
outputs[1].shape

In [None]:
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
model(batched_token)

In [None]:
model(batched_token)[2]