# Requirements

Please download the TIMIT dataset from http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 and place the `TIMIT.zip` file in the same directory as this notebook.

The following Python packages are required:
- numpy
- theano
- lasagne
- matplotlib
- [sphfile](https://pypi.python.org/pypi/sphfile) (to read the sound files)
- [python_speech_features](https://github.com/jameslyons/python_speech_features) (to generate MFCC features)

In [None]:
# Notebook setup: enable IPython's autoreload extension (mode 2) and inline
# matplotlib rendering.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
# Set before `import theano` runs in the next cell; requests the CPU device.
os.environ['THEANO_FLAGS'] = "device=cpu"

In [None]:
import pickle as pkl
import numpy as np
from zipfile import ZipFile
from sphfile import SPHFile
from python_speech_features import mfcc
import lasagne
from lasagne.layers import InputLayer, GaussianNoiseLayer, LSTMLayer, DenseLayer, ConcatLayer, ReshapeLayer
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode
import matplotlib.pyplot as plt
from ctc import ctc_loss, log_softmax, insert_alternating_blanks, ctc_backward

# Prepare dataset

In [None]:
# Unpack TIMIT.zip into the working directory unless the corpus directory is
# already present; the archive must exist when extraction is needed.
if not os.path.isdir("data/lisa/data/timit/raw/TIMIT"):
    archive_present = os.path.exists("TIMIT.zip")
    assert archive_present, "Missing data archive"
    with ZipFile("TIMIT.zip", mode='r') as archive:
        archive.extractall(path=".")

In [None]:
# Collect every utterance of the extracted corpus.
#   files        -- path of each recording without its ".WAV" extension
#   train_subset -- True for utterances found below the "TRAIN" directory
# Only paths are gathered here; the audio itself is decoded in the
# preprocessing cell, so the recordings are not read a second time.
timit_root = "data/lisa/data/timit/raw/TIMIT"
files = []
train_subset = []

for dirpath, _, filenames in os.walk(timit_root):
    # First path component below the corpus root (e.g. "TRAIN"); "." for the
    # root directory itself.
    split_name = os.path.relpath(dirpath, timit_root).split(os.sep)[0]
    for f in filenames:
        if f.endswith("WAV"):
            files.append(dirpath + "/" + f[:-4])
            train_subset.append(split_name == "TRAIN")

files = np.array(files)
# The `np.bool` alias was removed in NumPy 1.24; the builtin `bool` selects
# the same dtype on every NumPy version.
train_subset = np.array(train_subset, dtype=bool)

# Preprocessing

In [None]:
# Build the feature/label cache once; later runs only load the pickle below.
# NOTE: a "preprocessed_dataset.pkl" written before the `blank` fix further
# down stores an out-of-range blank index -- delete the file to regenerate it.
if not os.path.exists("preprocessed_dataset.pkl"):
    features = []
    labels = []

    for f in files:
        recording = SPHFile(f + ".WAV")
        signal = recording.content
        samplerate = recording.format['sample_rate']

        # 13 cepstral coefficients per frame, 10 ms window with 5 ms step.
        mfccfeats = mfcc(signal, samplerate=samplerate, winlen=0.01, winstep=0.005,
                         numcep=13, nfilt=26, appendEnergy=True)
        # Frame-to-frame derivatives: one-sided differences for the first and
        # last frame, halved central differences everywhere else.
        derivatives = np.concatenate([
            mfccfeats[1, None] - mfccfeats[0, None],
            .5 * mfccfeats[2:] - .5 * mfccfeats[0:-2],
            mfccfeats[-1, None] - mfccfeats[-2, None]], axis=0)

        # Coefficients and derivatives side by side -> 26 values per frame.
        features.append(np.concatenate([mfccfeats, derivatives], axis=1).astype(np.float32))

        with open(f + ".PHN") as phonem_file:
            # The third whitespace-separated field of each line is the label;
            # blank lines are skipped instead of raising IndexError.
            labels.append([l.split()[2] for l in phonem_file if l.strip()])

    # Normalise with one global mean / std taken over all frames and all
    # feature dimensions (concatenate once instead of twice).
    all_frames = np.concatenate(features, axis=0)
    m = np.mean(all_frames)
    s = np.std(all_frames)
    del all_frames

    features = [(feat - m) / s for feat in features]

    vocabulary = set()
    for lseq in labels:
        vocabulary |= set(lseq)

    # Sort so that the symbol -> index mapping is reproducible between runs,
    # then move 'h#' to the last position, which `blank` refers to below.
    # `remove` raises ValueError if 'h#' never occurs, as the original
    # `index` lookup did.
    vocabulary = sorted(vocabulary)
    vocabulary.remove('h#')
    vocabulary.append('h#')

    # Dictionary lookup instead of a linear `list.index` scan per label.
    symbol_index = {symbol: index for index, symbol in enumerate(vocabulary)}
    labels = [np.array([symbol_index[l] for l in lseq], dtype=np.int32) for lseq in labels]

    # Fix: the blank is the LAST VOCABULARY ENTRY. The previous
    # `len(labels) - 1` used the number of utterances, which lies outside the
    # `len(vocabulary)` outputs of the network.
    blank = len(vocabulary) - 1

    with open("preprocessed_dataset.pkl", 'wb') as f:
        pkl.dump((features, labels, vocabulary, blank), f, -1)


# Always read the data back from the cache so both code paths yield the same
# objects. The pickle is a locally generated file; do not load one obtained
# from an untrusted source.
with open("preprocessed_dataset.pkl", 'rb') as f:
    features, labels, vocabulary, blank = pkl.load(f)

# Model

In [None]:
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

class SmallGaussianNoiseLayer(lasagne.layers.Layer):
    """Additive Gaussian noise layer whose noise is clipped to +/- 3 sigma.

    The layer returns its input unchanged when ``deterministic`` is true or
    when ``sigma`` equals 0.
    """

    def __init__(self, incoming, sigma=0.1, **kwargs):
        super().__init__(incoming, **kwargs)
        # Seed the symbolic random stream from Lasagne's global RNG.
        seed = lasagne.random.get_rng().randint(1, 2147462579)
        self._srng = RandomStreams(seed)
        self.sigma = sigma

    def get_output_for(self, input, deterministic=False, **kwargs):
        noise_disabled = deterministic or self.sigma == 0
        if noise_disabled:
            return input

        bound = 3 * self.sigma
        raw_noise = self._srng.normal(input.shape, avg=0.0, std=self.sigma)
        return input + T.clip(raw_noise, -bound, bound)

In [None]:
# Network: padded feature sequence -> clipped Gaussian noise -> forward and
# backward LSTM (masked by utterance duration) -> per-frame linear scores.
max_frames, n_features = 1557, 26


def _frame_mask(d):
    """Return a (len(d), max_frames) mask that is true for frame indices below each duration."""
    return T.arange(max_frames)[None, :] < d[:, None]


l_in = InputLayer(shape=(None, max_frames, n_features))
l_duration = InputLayer(
    input_var=T.ivector(name="duration"),
    shape=(1,),
)
l_mask = lasagne.layers.ExpressionLayer(l_duration, _frame_mask)
l_noise = SmallGaussianNoiseLayer(l_in, sigma=0.6)
l_fwlstm = LSTMLayer(l_noise, 100, mask_input=l_mask)
l_bwlstm = LSTMLayer(l_noise, 100, mask_input=l_mask, backwards=True)
l_cat = ConcatLayer([l_fwlstm, l_bwlstm], axis=2)
# One linear score per vocabulary entry and frame (no nonlinearity here).
l_linout = DenseLayer(
    l_cat,
    len(vocabulary),
    nonlinearity=None,
    num_leading_axes=2,
)

# Symbolic inputs shared by all compiled functions.
input_var, duration_var = l_in.input_var, l_duration.input_var
labels_var = T.imatrix()

# Training

In [None]:
# Symbolic training graph: noisy forward pass, CTC loss, Nesterov-momentum
# updates, compiled into `update_fn`.
train_output = lasagne.layers.get_output(l_linout, deterministic=False)
# Swap the first two axes of the network output before handing it to the loss.
train_output = train_output.dimshuffle(1, 0, 2)

# Single-element int32 vector holding the label-sequence length.
label_sizes_var = T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32')

loss = ctc_loss(
    linout=train_output,
    durations=duration_var,
    labels=labels_var,
    label_sizes=label_sizes_var,
    blank=blank,
)

params = lasagne.layers.get_all_params(l_linout, trainable=True)
total_loss = loss.sum()
grads = theano.grad(total_loss, params)
updates = lasagne.updates.nesterov_momentum(grads, params, learning_rate=1e-4)

# Returns the loss and applies one parameter update per call.
update_fn = theano.function(
    inputs=[input_var, duration_var, labels_var],
    outputs=loss,
    updates=updates,
    # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
)

In [None]:
# Bookkeeping for the training loop: parameter snapshots, loss curve, the
# exponentially smoothed loss (unset until the first update) and a list for
# failed samples.
params_history, loss_history, failed = [], [], []
running_loss = None

In [None]:
# Train for 10 passes over the data, one utterance per parameter update.
#
# History layout: `loss_history` gets one entry per update, and a parameter
# snapshot is stored every 25th update, so `params_history[k]` belongs to
# `loss_history[25 * k]`. This is the layout the later
# `loss_history[::25]` / `params_history[6000//25]` cells rely on.
# (The previous `if i % 25:` test was inverted and keyed on the shuffled
# sample index, so it copied all parameters on ~96% of the updates.)
#
# NOTE(review): this iterates over every utterance in `labels`;
# `train_subset` is not consulted -- confirm that training on all splits is
# intended.
for e in range(10):
    for i in np.random.permutation(len(labels)):
        # Batch of one utterance; the first and last label of the
        # transcription are dropped from the target sequence.
        f, l = features[i][None, :, :], labels[i][None, 1:-1]
        # True number of frames, taken before padding.
        d = np.array([f.shape[1]], dtype=np.int32)
        # Zero-pad along the time axis to the fixed network length of 1557.
        f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)

        batch_loss = float(update_fn(f, d, l))

        # Abort on divergence instead of continuing with broken parameters.
        if batch_loss > 10000 or np.isnan(batch_loss):
            print("\nskipped i = {} because loss was {}".format(i, batch_loss))
            raise RuntimeError(
                "training diverged at sample {} (loss = {})".format(i, batch_loss))

        # Exponential moving average of the per-utterance loss.
        running_loss = batch_loss if running_loss is None else .99 * running_loss + .01 * batch_loss
        print("\r{:4d} loss = {:>5.0f} -> {:>5.0f}".format(i, batch_loss, running_loss), end='', flush=True)

        # Keyed on the number of recorded updates, so the alignment survives
        # re-running this cell without resetting the history lists.
        if len(loss_history) % 25 == 0:
            params_history.append(lasagne.layers.get_all_param_values(l_linout))
        loss_history.append(running_loss)

In [None]:
# Plot the recorded running loss on a logarithmic y axis.
plt.plot(loss_history)
plt.yscale('log')

In [None]:
# Position of the smallest value among every 25th entry of `loss_history`.
np.argmin(loss_history[::25])

In [None]:
# Load the parameter snapshot stored at index 6000 // 25 (= 240) of
# `params_history` back into the network; requires at least 241 snapshots.
lasagne.layers.set_all_param_values(l_linout, params_history[6000//25])

# Evaluate model

In [None]:
# Pick utterance 0 for inspection and prepare it exactly like a training
# batch: add a batch axis, drop the first and last target label, and zero-pad
# the frames to the fixed network length of 1557.
i = 0
f, l = features[i][None, :, :], labels[i][None, 1:-1]
# Fix: recompute the duration for THIS utterance. Without it, the cells below
# reuse the `d` left over from the last training step, i.e. the frame count
# of a different utterance, and mask the wrong number of frames.
d = np.array([f.shape[1]], dtype=np.int32)
f = np.concatenate([f, np.zeros((1557 - f.shape[1], f.shape[2]), dtype=np.float32)[None, :, :]], axis=1)

In [None]:
# Build symbolic expressions and compiled functions used for inspection.
blanked_labels = insert_alternating_blanks(labels_var, blank)
# True where an entry differs from the entry two positions before it.
not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])
# NOTE(review): `train_output` was already dim-shuffled with (1, 0, 2) when it
# was created; this shuffles it back, whereas the later "beta" cell passes
# `train_output` unchanged. Confirm which axis order `ctc_backward` expects.
betas = ctc_backward(
    log_softmax(T.unbroadcast(train_output.dimshuffle(1, 0, 2), 1)),
    T.cast(T.reshape(train_output.shape[0], (1,)), 'int32'), 
    blanked_labels,
    T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32'),
    not_repeated)
# Forward pass with deterministic=True (the noise layer passes its input through).
test_output = lasagne.layers.get_output(l_linout, deterministic=True)

# NOTE(review): `loss` and `grads` derive from the non-deterministic
# `train_output`, so loss_fn / grads_fn / beta_fn still include input noise.
loss_fn = theano.function([input_var, duration_var, labels_var], loss)
beta_fn = theano.function([input_var, duration_var, labels_var], betas)
grads_fn = theano.function([input_var, duration_var, labels_var], grads)
# NOTE(review): `test_output` is not dim-shuffled, so `[:, 0, :]` selects
# index 0 of its second axis -- presumably the first frame rather than the
# first batch element; confirm this is intended.
predict_fn = theano.function([input_var, duration_var], T.exp(log_softmax(test_output[:, 0, :])))
logits_fn = theano.function([input_var, duration_var], test_output)

In [None]:
# Network scores for the first (only) batch element.
# `d` must hold the frame count of the utterance currently stored in `f`.
logits = logits_fn(f, d)[0]

In [None]:
# Plot the score of every class over the frame index. The first 60 classes
# are drawn in order of ascending mean score; the last class is drawn dotted.
frame_axis = np.arange(len(logits))
o = np.argsort(logits[:, :60].mean(axis=0))

plt.figure(figsize=(10, 10))
for class_index in o:
    plt.plot(frame_axis, logits[:, class_index])

plt.plot(frame_axis, logits[:, -1], linestyle=":")

legend_names = [vocabulary[k] for k in o]
legend_names.append(vocabulary[-1])
plt.legend(legend_names, bbox_to_anchor=(.6, 0, 1, 1), ncol=5)
plt.show()

# beta

In [None]:
# Length of the first axis (frames) of the selected utterance's features.
features[i].shape[0]

In [None]:
# Rebuild the backward variables directly from `train_output`, evaluate them
# on the selected utterance and show them as an image.
n_steps_var = T.cast(T.reshape(train_output.shape[0], (1,)), 'int32')
n_labels_var = T.cast(T.reshape(labels_var.shape[1], (1,)), 'int32')
betas = ctc_backward(
    log_softmax(train_output),
    n_steps_var,
    blanked_labels,
    n_labels_var,
    not_repeated,
)
beta_fn = theano.function([input_var, duration_var, labels_var], betas)

b = beta_fn(f, d, l)

# Colour range runs from -5000 up to the largest value (at least 0).
colour_max = max(0, np.max(b))
plt.figure(figsize=(10, 6))
plt.imshow(b[:, 0, :], clim=(-5000, colour_max))
plt.gca().set_aspect(0.1)
plt.colorbar()
plt.show()

In [None]:
# Print the (min, max) pair of every trainable parameter array.
p = lasagne.layers.get_all_param_values(l_linout, trainable=True)
for values in p:
    value_range = (values.min(), values.max())
    print(value_range)

In [None]:
# Gradient of the summed loss with respect to `train_output`, evaluated on
# the selected utterance.
loss_grad_expr = theano.grad(loss.sum(), wrt=train_output)
sample_inputs = {input_var: f, duration_var: d, labels_var: l}
g = loss_grad_expr.eval(sample_inputs)

In [None]:
# Top: gradient averaged over the first axis, one bar per class (classes in
# the order `o`, followed by class 60). Bottom: gradient averaged over the
# classes, plotted along the first axis.
g_first = g[:, 0, :]
class_order = np.concatenate((o, [60]))

plt.subplot(2, 1, 1)
plt.bar(np.arange(len(vocabulary)), g_first[:, class_order].mean(axis=0))
plt.subplot(2, 1, 2)
plt.plot(g_first.mean(axis=1))

In [None]:
# Number of distinct labels, which is also the width of the output layer.
len(vocabulary)

In [None]:
def argmax_decode(preds, blank=None):
    """Collapse runs of identical consecutive predictions.

    Parameters
    ----------
    preds : 1-D sequence of int
        Per-frame class indices, e.g. the argmax over the class axis.
    blank : int, optional
        When given, every occurrence of this index is removed AFTER the runs
        have been collapsed (greedy CTC decoding). With the default ``None``
        no symbol is removed, which is the original behaviour.

    Returns
    -------
    numpy.ndarray of int32
        The collapsed sequence; empty when `preds` is empty (the previous
        implementation raised IndexError on empty input).
    """
    preds = np.asarray(preds)
    if preds.size == 0:
        return np.zeros(0, dtype=np.int32)

    # Keep the first element and every element that differs from its
    # predecessor.
    keep = np.ones(len(preds), dtype=bool)
    keep[1:] = preds[1:] != preds[:-1]
    decoded = preds[keep]

    if blank is not None:
        decoded = decoded[decoded != blank]

    return decoded.astype(np.int32)

# Decode the best class per frame and fetch the full reference label sequence
# of the selected utterance.
best_class_per_frame = logits.argmax(axis=-1)
lbl_preds = argmax_decode(best_class_per_frame)
lbl_tgt = labels[i]