In [1]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import librosa
import scipy as sp
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras import Model

In [2]:
# --- Audio / spectrogram settings -------------------------------------------
n_mels = 80              # number of mel bands in the mel filter bank
fs = 22050               # sampling rate in Hz
frame_length_ms = 50     # STFT window length in milliseconds
frame_shift_ms = 12.5    # STFT hop in milliseconds

# Window, hop and overlap in samples. All three are derived from `fs` so that
# changing the sampling rate in one place keeps them consistent.
nsc = int(fs * frame_length_ms / 1000)   # samples per window
nhop = int(fs * frame_shift_ms / 1000)   # samples per hop
nov = nsc - nhop                         # samples shared by adjacent windows

eps = 1e-10      # floor applied to magnitudes before taking the log
db_ref = 100     # offset/scale used to normalize dB values

# --- Text settings ------------------------------------------------------------
# A character's position in this string is its token id.
chars = ' ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;?'

num_tokens = len(chars)

# --- Model settings -----------------------------------------------------------
embed_size = 256             # embedding dimension per token

K = 16                       # number of filters in the conv bank (kernel sizes 1..K)

num_conv1d_filters = 128     # output channels of each conv bank filter

prenet_size = [256, 128]     # units of the two pre-net Dense layers


In [3]:
meta_path = "../datasets/metadata.csv"
num_examples = 32  # only this many leading rows are used in the notebook

# Each metadata line is '|'-separated. Only the first two fields are kept
# (the first is used below as the wav file stem, the second as the text).
# Trimming every row to two fields and skipping blank lines keeps the array
# rectangular even if a row carries extra '|' separators.
with open(meta_path, encoding='utf-8') as f:
    rows = [line.strip().split('|')[:2] for line in f if line.strip()]

metadata = np.array(rows[:num_examples])

In [4]:
# Build the wav file name of every metadata row from its first field.
wave_name_list = ['{}.wav'.format(row[0]) for row in metadata]

In [5]:
"""

Preprocessing Step

"""

# For every wav file: compute a normalized log-magnitude spectrogram and a
# normalized log-mel spectrogram, and save each as a .npy file.
data_folder = "../datasets/wavs"
specgram_folder = "../datasets/specgrams"
mel_folder = "../datasets/mels"

# np.save does not create missing directories, so make sure the targets exist.
os.makedirs(specgram_folder, exist_ok=True)
os.makedirs(mel_folder, exist_ok=True)

# The mel filter bank depends only on fixed settings, so it is built once
# rather than once per file.
mel_filters = librosa.filters.mel(sr=fs, n_fft=nsc, n_mels=n_mels)

for wav_name in tqdm(wave_name_list):
    wav_path = os.path.join(data_folder, wav_name)

    npy_name = wav_name.replace('.wav', '.npy')
    specgram_path = os.path.join(specgram_folder, npy_name)
    mel_path = os.path.join(mel_folder, npy_name)

    # Load at `fs` explicitly so the STFT and the mel filter bank (built for
    # `fs`) always agree on the sampling rate.
    y, sr = librosa.load(wav_path, sr=fs)

    f, t, Zxx = sp.signal.stft(y, fs=sr, nperseg=nsc, noverlap=nov)

    # Magnitude spectrogram, floored at eps so the logarithm stays finite.
    Sxx = np.maximum(np.abs(Zxx), eps)

    mel_specgram = np.matmul(mel_filters, Sxx)

    log_specgram = 20 * np.log10(Sxx)
    log_mel_specgram = 20 * np.log10(np.maximum(mel_specgram, eps))

    # Shift and scale the dB values by db_ref.
    norm_log_specgram = (log_specgram + db_ref) / db_ref
    norm_log_mel_specgram = (log_mel_specgram + db_ref) / db_ref

    np.save(specgram_path, norm_log_specgram)
    np.save(mel_path, norm_log_mel_specgram)

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))




In [6]:
# Number of transcripts that are tokenized and padded into one batch.
batch_size = 4

In [7]:
def text2token(text, vocab=None):
    """Convert a string into an array of per-character token ids.

    Each character is mapped to its position in ``vocab``. Characters that do
    not occur in ``vocab`` are mapped to -1.

    Args:
        text: The string to tokenize.
        vocab: String whose character positions define the token ids.
            Defaults to the module-level ``chars``.

    Returns:
        A 1-D float64 numpy array with one entry per character of ``text``
        (empty for an empty string).
    """
    if vocab is None:
        vocab = chars

    # str.find yields -1 for a missing character, which is the sentinel here.
    return np.array([vocab.find(symbol) for symbol in text], dtype=np.float64)

In [8]:
# Tokenize the text of the first batch_size metadata rows.
tokenized_list = [text2token(metadata[row, 1]) for row in range(batch_size)]

# Right-pad every sequence with zeros up to the longest one in the batch.
# NOTE(review): 0 is also the id of ' ' (chars[0]), so padding cannot be told
# apart from real spaces.
max_token_len = max(len(tokens) for tokens in tokenized_list)
batched_token = np.zeros((batch_size, max_token_len))

for row, tokens in enumerate(tokenized_list):
    batched_token[row, :len(tokens)] = tokens

In [9]:
# Shape of the padded batch: (batch_size, max_token_len).
batched_token.shape

(4, 155)

In [10]:
# Symbolic model input; shape=(None,) leaves the sequence length unspecified.
inputs = tf.keras.Input(shape=(None, ))
print(inputs.shape)

(None, None)


In [11]:
# Embedding table with num_tokens rows, each an embed_size-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(num_tokens, embed_size)

In [12]:
# Encoder pre-net: Dense(ReLU) -> Dropout(0.5) -> Dense(ReLU) -> Dropout(0.5).
# The layers are instantiated in the same order in which they are stacked.
prenet_dense_in = tf.keras.layers.Dense(
    prenet_size[0],
    activation='relu',
    input_shape=(None, embed_size),
)
prenet_dropout_in = tf.keras.layers.Dropout(0.5)
prenet_dense_out = tf.keras.layers.Dense(prenet_size[1], activation='relu')
prenet_dropout_out = tf.keras.layers.Dropout(0.5)

encoder_prenet_layer_list = [
    prenet_dense_in,
    prenet_dropout_in,
    prenet_dense_out,
    prenet_dropout_out,
]

encoder_prenet = tf.keras.Sequential(encoder_prenet_layer_list, name='Encoder_Prenet')

In [13]:
# Trace the symbolic input through the embedding and the pre-net, printing the
# shape after each stage.
x = embedding_layer(inputs)
print(x.shape)
x = encoder_prenet(x)
print(x.shape)

(None, None, 256)
(None, None, 128)


In [19]:
# Bank of K 1-D convolutions with kernel sizes 1..K. padding='same' keeps the
# time dimension unchanged, so the outputs can be concatenated channel-wise.
# The filter count comes from num_conv1d_filters instead of a repeated literal,
# and no input_shape is passed because the layers are applied directly to a
# tensor, which already carries its shape.
conv1d_filter_list = [
    tf.keras.layers.Conv1D(filters=num_conv1d_filters,
                           kernel_size=k + 1,
                           activation='relu',
                           padding='same')
    for k in range(K)
]

In [20]:
# Apply every bank filter to the same pre-net output, then join the results
# along the channel (last) axis.
intermediate_results = [conv_k(x) for conv_k in conv1d_filter_list]

outputs = tf.concat(intermediate_results, axis=-1)

In [22]:
# Functional model mapping token ids to the concatenated filter-bank outputs.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [23]:
# Run the padded token batch through the model and show the output shape.
model(batched_token).shape

TensorShape([4, 155, 2048])

In [21]:
# NOTE(review): this cell's execution count (21) is lower than that of the cell
# that builds `model` (22), and the recorded summary lists only the embedding
# and pre-net layers. The output looks stale; re-run after a kernel restart.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 256)         16128     
_________________________________________________________________
Encoder_Prenet (Sequential)  (None, None, 128)         98688     
Total params: 114,816
Trainable params: 114,816
Non-trainable params: 0
_________________________________________________________________


In [24]:
class ConvFilterBank(tf.keras.layers.Layer):
    """Bank of parallel 1-D convolutions with kernel sizes 1..num_banks.

    Every convolution uses ReLU and ``padding='same'``, so all outputs keep the
    input's time dimension and are concatenated along the channel axis.

    Args:
        num_banks: Number of convolutions in the bank. Defaults to the
            module-level ``K``.
        num_filters: Output channels of each convolution. Defaults to the
            module-level ``num_conv1d_filters``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, num_banks=None, num_filters=None, **kwargs):
        super(ConvFilterBank, self).__init__(**kwargs)
        self.num_banks = K if num_banks is None else num_banks
        self.num_filters = num_conv1d_filters if num_filters is None else num_filters

        # Sub-layers are created here; their weights are still created lazily
        # on the first call.
        self.conv1d_filter_list = [
            tf.keras.layers.Conv1D(filters=self.num_filters,
                                   kernel_size=kernel_size,
                                   activation='relu',
                                   padding='same')
            for kernel_size in range(1, self.num_banks + 1)
        ]

    def call(self, input_):
        """Apply every convolution to ``input_`` and concatenate on the last axis."""
        # Iterate over the stored layers rather than a global count, so the
        # loop always matches the layers that actually exist.
        intermediate_results = [conv(input_) for conv in self.conv1d_filter_list]
        return tf.concat(intermediate_results, axis=-1)

In [25]:
# Instantiate the custom filter-bank layer.
conv1d_filter_bank = ConvFilterBank()

In [26]:
# Chain the existing embedding, pre-net and filter-bank layer objects into one
# Sequential model (the layer instances are shared, not copied).
stacked_conv1d = tf.keras.Sequential([embedding_layer, encoder_prenet, conv1d_filter_bank])

In [27]:
# Apply the embedding and the pre-net to the concrete token batch, printing
# the shape after each stage.
embedded_batch = embedding_layer(batched_token)
print(embedded_batch.shape)
prenet_result = encoder_prenet(embedded_batch)
print(prenet_result.shape)

(4, 155, 256)
(4, 155, 128)


In [28]:
# Feed the pre-net output through the filter-bank layer.
filter_bank_result = conv1d_filter_bank(prenet_result)

In [29]:
# Shape of the filter-bank output.
filter_bank_result.shape

TensorShape([4, 155, 2048])

In [None]:
# summary() needs a built model. stacked_conv1d was created without an input
# shape, so run one batch through it first to build it.
stacked_conv1d(batched_token)
stacked_conv1d.summary()

In [None]:
# Run the whole stack (embedding -> pre-net -> filter bank) on the token batch.
# This rebinds filter_bank_result.
filter_bank_result = stacked_conv1d(batched_token)

In [None]:
# Display the filter-bank layer object. The instance created above is named
# `conv1d_filter_bank` (singular).
conv1d_filter_bank

In [None]:
# NOTE(review): in top-to-bottom order `conv1d` is only assigned in a later
# cell, and `encoded_result` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. Confirm the intended
# layer/input or drop the cell.
conv_result = conv1d(encoded_result)
conv_result.shape

In [None]:
class ConvFilterBanks(Model):
    """Model wrapping a bank of K parallel 1-D convolutions.

    The convolutions use kernel sizes 1..K, ReLU and ``padding='same'``. Their
    outputs are concatenated along the channel (last) axis, matching the other
    filter-bank implementations in this notebook.
    """

    def __init__(self):
        super(ConvFilterBanks, self).__init__()
        # Kernel sizes start at 1 (a kernel size of 0 is not a valid
        # convolution). padding='same' keeps every branch at the input length
        # so the branches can be concatenated. No input_shape is given: the
        # layers infer it from whatever tensor they are first called on.
        self.filters = [tf.keras.layers.Conv1D(kernel_size=k + 1,
                                               filters=num_conv1d_filters,
                                               activation='relu',
                                               padding='same')
                        for k in range(K)]

    def call(self, x):
        """Apply every convolution to ``x`` and concatenate the feature maps."""
        outputs = [conv(x) for conv in self.filters]

        # Join along channels, not along the batch axis.
        output = tf.concat(outputs, axis=-1)

        return output
    
# Instantiate the filter-bank model and stack it after the shared pre-net.
conv1d_banks = ConvFilterBanks()

# NOTE: this rebinds `model`, which earlier referred to the functional model.
model = tf.keras.Sequential([encoder_prenet, conv1d_banks])

In [None]:
# Run the already-embedded batch through pre-net + filter bank and display it.
model(embedded_batch)

In [None]:
class ConvFilterBanks(tf.keras.layers.Layer):
    """Layer applying K parallel 1-D convolutions and concatenating the results.

    The convolutions use kernel sizes 1..K, ReLU and ``padding='same'``, so
    every branch keeps the input's time dimension. The branches are joined
    along the channel (last) axis.

    NOTE: this class shares its name with the ``Model`` subclass defined
    earlier in the notebook; whichever cell ran last determines what
    ``ConvFilterBanks`` refers to.
    """

    def __init__(self):
        super(ConvFilterBanks, self).__init__()
        # Kernel sizes start at 1 because 0 is not a valid kernel size.
        self.layers = [tf.keras.layers.Conv1D(kernel_size=k + 1,
                                              filters=num_conv1d_filters,
                                              activation='relu',
                                              padding='same') for k in range(K)]
        # Output shape: K branches of num_conv1d_filters channels each; batch
        # and time dimensions are left unspecified.
        self.num_outputs = [None, None, K * num_conv1d_filters]

    # No build() override: this layer owns no weights besides those of its
    # Conv1D sub-layers, which create their own on first call.

    def call(self, input):
        """Return the channel-wise concatenation of all branch outputs."""
        # `input` shadows the builtin; the name is kept so the signature is
        # unchanged.
        return tf.concat([conv(input) for conv in self.layers], axis=-1)

In [None]:
# Instantiate whichever ConvFilterBanks definition was executed most recently.
layer = ConvFilterBanks()

In [None]:
# NOTE(review): `conv_bank_list` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Confirm the
# intended name or remove the cell.
conv_bank_list 

In [None]:
# def create_model():
#     IMG_SHAPE = (256, 256, 3)
#     img_inputs = tf.keras.Input(shape=IMG_SHAPE)
#     conv_1 = tf.keras.layers.Conv2D(32, (3, 3), activation='relu')(img_inputs)
#     maxpool_1 = tf.keras.layers.MaxPooling2D((2, 2))(conv_1)
#     conv_2 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu')(maxpool_1)
#     maxpool_2 = tf.keras.layers.MaxPooling2D((2, 2))(conv_2)
#     conv_3 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu')(maxpool_2)
#     flatten = tf.keras.layers.Flatten()(conv_3)
#     dense_1 = tf.keras.layers.Dense(64, activation='relu')(flatten)
#     output = tf.keras.layers.Dense(10, activation='softmax')(dense_1)

#     model = tf.keras.Model(inputs=img_inputs, outputs=output)
    
#     return model

In [None]:
# Stack of ten dilated 1-D convolutions; the dilation rate doubles with every
# layer (2**0, 2**1, ..., 2**9).
# NOTE(review): padding='same' pads on both sides of the sequence. If causal
# convolutions are intended, Keras offers padding='causal'. Confirm.
# NOTE(review): with filters=1 the softmax normalizes over a single channel,
# so every output value equals 1. Confirm the intended activation.
casual_conv_layers = []

for i in range(0, 10):
    # Only the first layer of a Sequential needs to declare the input shape.
    first_layer_kwargs = {'input_shape': [2**10, 1]} if i == 0 else {}

    conv1d = tf.keras.layers.Conv1D(filters=1, kernel_size=2, dilation_rate=2**i, padding='same',
                                    activation="softmax", use_bias=True, **first_layer_kwargs)

    casual_conv_layers.append(conv1d)


In [None]:
# Stack the dilated convolutions into a Sequential model and print its layers.
# NOTE: this rebinds `model` again, replacing the model assigned earlier.
model = tf.keras.Sequential(casual_conv_layers)
# model.add(casual_conv_layers)
model.summary()

In [None]:
# Random float32 batch of shape (4, 1024, 1), pushed through the dilated
# convolution stack.
x_train = np.random.random((4, 1024, 1)).astype(np.float32)
y = model(x_train)

In [None]:
# Shape of the model output for the random batch.
y.shape