# Requirements

Please download the TIMIT dataset from http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the `TIMIT.zip` file in the same directory as this notebook.

The following Python packages are required:
- tensorflow and keras (the model below is built from Keras layers and TensorFlow ops)
- numpy
- matplotlib
- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)
- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate mfcc features)

In [None]:
%load_ext autoreload
% autoreload 2
#%matplotlib inline
# %env CUDA_VISIBLE_DEVICES="1"
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
from zipfile import ZipFile
from sphfile import SPHFile
from python_speech_features import mfcc
import tensorflow as tf
import keras as K
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Concatenate, Layer

# Prepare dataset

In [None]:
# Unpack the TIMIT archive into the working directory, unless the corpus
# directory is already there from an earlier run.
timit_ready = os.path.isdir("data/lisa/data/timit/raw/TIMIT")
if not timit_ready:
    archive_found = os.path.exists("TIMIT.zip")
    assert archive_found, "Missing data archive"
    with ZipFile("TIMIT.zip", mode='r') as archive:
        archive.extractall(".")

In [None]:
def list_timit_recordings(root):
    """Collect every TIMIT recording found below ``root``.

    Walks ``root`` recursively and picks up each file whose name ends in
    ``"WAV"``. The audio itself is not opened here, only the paths are listed.

    Args:
        root: Directory that contains the corpus (its sub-directories are
            searched recursively).

    Returns:
        A pair ``(files, train_subset)`` of equally long NumPy arrays.
        ``files`` holds each recording's path with the trailing four
        characters (``.WAV``) removed; ``train_subset`` is True where the
        recording lies inside the ``TRAIN`` directory directly below ``root``.
    """
    paths = []
    in_train = []
    for dirpath, _, filenames in os.walk(root):
        # First path component relative to the corpus root, e.g. "TRAIN";
        # this replaces a fixed character slice that only worked for one root.
        top_level = os.path.relpath(dirpath, root).split(os.sep)[0]
        for name in filenames:
            if name.endswith("WAV"):
                paths.append(dirpath + "/" + name[:-4])
                in_train.append(top_level == "TRAIN")
    # The builtin `bool` is used as dtype: the `np.bool` alias is not
    # available in every NumPy release.
    return np.array(paths), np.array(in_train, dtype=bool)


files, train_subset = list_timit_recordings("data/lisa/data/timit/raw/TIMIT")

# Preprocessing

In [None]:
# Build the feature/label cache once; later runs only reload it from disk.
if not os.path.exists("preprocessed_dataset.pkl"):
    features = []
    labels = []

    for f in files:
        recording = SPHFile(f + ".WAV")
        signal = recording.content
        samplerate = recording.format['sample_rate']

        # 13 cepstral coefficients from 10 ms windows taken every 5 ms.
        mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005,
                         numcep=13, nfilt=26, appendEnergy=True)
        # Time derivatives: one-sided differences for the first and last frame,
        # central differences for every frame in between.
        derivatives = np.concatenate([
            mfccfeats[1, None] - mfccfeats[0, None],
            .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],
            mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)

        features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))

        # The third whitespace-separated field of each .PHN line is kept as the label.
        with open(f + ".PHN") as phonem_file:
            labels.append([line.split()[2] for line in phonem_file])

    # Normalise with a single global mean / standard deviation computed over
    # every frame and every coefficient. The stacked matrix is built once
    # and released right after use.
    all_frames = np.concatenate(features, axis=0)
    m = np.mean(all_frames)
    s = np.std(all_frames)
    del all_frames

    features = [(feat - m) / s for feat in features]

    # Sorted so the symbol -> index mapping is the same on every run
    # (iteration order of a set of strings is not stable across interpreter runs).
    vocabulary = sorted({symbol for lseq in labels for symbol in lseq})
    # 'h#' is moved to the last position; `blank` below refers to that index.
    vocabulary.remove('h#')
    vocabulary.append('h#')

    # Dictionary lookup instead of a linear `list.index` search per label.
    symbol_index = {symbol: idx for idx, symbol in enumerate(vocabulary)}
    labels = [np.array([symbol_index[symbol] for symbol in lseq], dtype=np.int32)
              for lseq in labels]

    # Index of the last vocabulary entry; equals the previously hard-coded 60
    # whenever the vocabulary holds 61 symbols.
    blank = len(vocabulary) - 1

    with open("preprocessed_dataset.pkl", 'wb') as f:
        pkl.dump((features, labels, vocabulary, blank), f)


# NOTE: unpickling can execute arbitrary code; only load a cache file that was
# written by this notebook.
with open("preprocessed_dataset.pkl", 'rb') as f:
    features, labels, vocabulary, blank = pkl.load(f)

In [None]:
# Show the feature matrix of the utterance at index 1 as an image (transposed),
# with the colour range limited to [-4, 4].
plt.figure(figsize=(20, 9))
plt.imshow(features[1].T, clim=(-4, 4))
plt.show()

# Model

In [None]:
def zero_loss(y_true, y_pred):
    """Return a tensor of zeros shaped like ``y_pred``; ``y_true`` is ignored."""
    zeros = K.backend.zeros_like(y_pred)
    return zeros

def dense_to_sparse(x):
    """Build a ``tf.SparseTensor`` from the entries of ``x`` that are >= 0.

    Entries below zero are left out of the sparse result; the dense shape of
    the result is the (int64) shape of ``x``.
    """
    kept_idx = tf.where(tf.greater_equal(x, 0))
    kept_values = tf.gather_nd(x, kept_idx)
    dense_shape = tf.shape(x, out_type=tf.int64)
    return tf.SparseTensor(kept_idx, kept_values, dense_shape)

class CTCLossLayer(Layer):
    """Keras layer that computes the CTC loss and registers it as a model loss.

    The layer is called on a list ``[linout, targets, durations]``:

    * ``linout``    -- unnormalised class scores, batch-major
      (``time_major=False`` is passed to ``tf.nn.ctc_loss``);
    * ``targets``   -- dense label matrix, converted with ``dense_to_sparse``
      (entries below zero are dropped by that helper);
    * ``durations`` -- 2-D tensor whose first column is used as the sequence
      length of each batch entry.
    """

    def __init__(self, **kwargs):
        super(CTCLossLayer, self).__init__(**kwargs)

    def call(self, x, mask=None):
        """Return the ``tf.nn.ctc_loss`` result and add its sum as a layer loss."""
        linout = x[0]
        targets = x[1]
        durations = x[2]
        loss = tf.nn.ctc_loss(
            dense_to_sparse(targets), linout,
            sequence_length=durations[:, 0],
            time_major=False)
        # Registering the summed loss here is what makes the model train on it.
        self.add_loss(tf.reduce_sum(loss), x)
        return loss

    def compute_output_shape(self, input_shape):
        """Return the output shape as a 1-tuple holding the batch dimension.

        Keras expects a shape tuple here; a bare batch dimension (often
        ``None``) is not a shape.
        """
        return (input_shape[0][0],)

# Model inputs: the acoustic features plus the two extra tensors consumed by
# the CTC loss layer (label sequence and number of frames).
a = Input(shape=(None, features[0].shape[1]), name="features")
targets = Input(shape=[None], dtype='int32', name="targets")
durations = Input(shape=[1], dtype='int32', name="durations")
# Bidirectional puts the backward pass back into forward time order before
# concatenating, so both directions describe the same frame at each step.
# A bare LSTM(go_backwards=True) returns its output sequence reversed, which
# misaligns it with the forward LSTM when the two are concatenated directly.
c = K.layers.Bidirectional(LSTM(100, return_sequences=True), merge_mode='concat')(a)
# One linear score per vocabulary entry and frame.
d = Dense(len(vocabulary), activation=None)(c)
l = CTCLossLayer()([d, targets, durations])
model = Model(inputs=[a, targets, durations], outputs=[d, l])
sgd = K.optimizers.SGD(lr=1e-4, momentum=0.9, nesterov=True)

model.summary()

# Both outputs are compiled with zero_loss: the training signal is the loss
# that CTCLossLayer registers through add_loss.
model.compile(
    target_tensors=[targets, targets],
    loss=[zero_loss, zero_loss],
    optimizer=sgd)

# Training

In [None]:
params_backup = []
running_loss = None

# Train on 300 randomly chosen utterances, one utterance per batch.
# NOTE(review): indices are drawn from all utterances; `train_subset` is not consulted here.
for i in np.random.permutation(len(labels))[:300]:
    # Named so that the loop does not overwrite the model tensor `l`.
    # The first and last label of each sequence are dropped.
    batch_features = features[i][None, :, :]
    batch_labels = labels[i][None, 1:-1]
    n_frames = np.array([batch_features.shape[1]], np.int32)

    # Element 0 of the returned list is taken as the batch loss.
    batch_loss = model.train_on_batch(
        x=[batch_features, batch_labels, n_frames],
        y=[batch_labels, batch_labels])[0]

    # The weight update has already been applied at this point; "skipped" only
    # means the batch is left out of the running average. Non-finite losses are
    # excluded too, otherwise a single NaN would stick in the average forever.
    if not np.isfinite(batch_loss) or batch_loss > 10000:
        print("\nskipped i = {}".format(i))
        continue

    # Exponential moving average of the batch loss.
    running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss
    print("\rloss = {:>5.0f}".format(running_loss), end='', flush=True)

# Evaluate model

In [None]:
def argmax_decode(preds, blank=None):
    """Collapse runs of identical consecutive labels in a 1-D label sequence.

    Args:
        preds: 1-D sequence of integer labels (e.g. a per-frame argmax).
        blank: Optional label to remove *after* collapsing repeats, as in CTC
            best-path decoding. With the default ``None`` nothing is removed.

    Returns:
        ``np.int32`` array of the collapsed labels; empty for empty input.
    """
    preds = np.asarray(preds)
    if preds.size == 0:
        # Indexing the first element of an empty sequence would raise.
        return np.array([], dtype=np.int32)

    # Keep the first label and every label that differs from its predecessor.
    keep = np.ones(len(preds), dtype=bool)
    keep[1:] = preds[1:] != preds[:-1]
    decoded = preds[keep]

    if blank is not None:
        decoded = decoded[decoded != blank]

    return decoded.astype(np.int32)

In [None]:
# Shape of the feature matrix at index `i`; `i` must already hold a value from an earlier cell.
features[i].shape

In [None]:
i = 0
# The model has three inputs, so the label sequence and the frame count are
# supplied even though only the scores are read back.
inputs = [features[i][None, :, :], labels[i][None, 1:-1], np.array([features[i].shape[0]], np.int32)]
# First model output, first (only) batch entry: the per-frame class scores.
logits = model.predict(inputs)[0][0]
# No softmax is needed before the argmax: normalising the scores does not
# change which class is largest in each frame.
lbl_preds = argmax_decode(np.argmax(logits, axis=-1))

In [None]:
# Plot the score of each of the first 60 classes over time, ordered by ascending
# mean score, then the last class as a dotted line.
frames = np.arange(len(logits))
order = np.argsort(np.mean(logits[:, :60], axis=0))
plt.figure(figsize=(10, 10))
for cls in order:
    plt.plot(frames, logits[:, cls]);

plt.plot(frames, logits[:, -1], linestyle=":");
legend_names = [vocabulary[cls] for cls in order] + [vocabulary[-1]]
plt.legend(legend_names, bbox_to_anchor=(.6, 0, 1, 1), ncol=5)
plt.show()

In [None]:
# Per-frame score of the last class, read from the `logits` array.
logits[:, -1]

In [None]:
# Weights of the layer at position 4 of the model.
# NOTE(review): positional indexing depends on how Keras orders `model.layers` — confirm this selects the intended layer.
model.layers[4].get_weights()

In [None]:
# NOTE(review): `weights` is not assigned in any visible cell; presumably it needs a prior assignment — confirm.
weights