Use Keras model, use TCN, reuse haiku output from attention tests

In [1]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip install keras-tcn




In [2]:
from __future__ import absolute_import, division, print_function

# Import TensorFlow >= 1.10 and enable eager execution
import tensorflow as tf


from keras.utils import plot_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import sys
import time

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import keras.layers as layers
from keras.models import Model
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Nadam, Adam
from tcn import TCN

Using TensorFlow backend.
W0613 03:14:36.423183 139994818779008 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [3]:
!pip uninstall -y deepmeter
!pip install git+https://github.com/LanceNorskog/deep_meter_2#egg=deepmeter
from cmu.syllables_cmu import syllables
print(syllables['therefore'])


Uninstalling DeepMeter-0.0.1:
  Successfully uninstalled DeepMeter-0.0.1
Collecting deepmeter from git+https://github.com/LanceNorskog/deep_meter_2#egg=deepmeter
  Cloning https://github.com/LanceNorskog/deep_meter_2 to /tmp/pip-install-7e9r2gha/deepmeter
  Running command git clone -q https://github.com/LanceNorskog/deep_meter_2 /tmp/pip-install-7e9r2gha/deepmeter
Building wheels for collected packages: deepmeter
  Building wheel for deepmeter (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-fg6p_zfw/wheels/80/6c/4f/1c43367a928a82b45cb3d36b4f23720b249748714869e28f13
Successfully built deepmeter
Installing collected packages: deepmeter
Successfully installed deepmeter-0.0.1
['DH EH R', 'F AO R']


In [4]:
# Mine
!wget -nc https://raw.githubusercontent.com/LanceNorskog/test_data/master/haiku_5.txt
path_to_file = 'haiku_5.txt'
!mkdir -p training_checkpoints


File ‘haiku_5.txt’ already there; not retrieving.



Load Haiku data. Filter for bogus inputs. Create 2-directional map of known syllables, and matching arrays of ['A sentence for you',...] and syllables indexes of [['AH', 'S EH N', 'T EH NS', 'F OR', 'YU']]. Use CMU Pronunciation Dictionary for syllables (some guy's syllabized version).

In [0]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [0]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [Long clause, Haiku line]
def create_dataset(path, num_examples):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    
    return word_pairs

In [0]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()
    
    self.create_index()
    
  def create_index(self):
    for word in self.lang:
        #print(type(word))
        #print(word)
        if type(word) == type(''):
            #if not word in self.vocab:
             #   print('. adding: {}'.format(word))
            #print(word)
            self.vocab.update([word])
        else:
            for part in word:
               #if not part in self.vocab:
               #    print('. adding: {}'.format(part))
               self.vocab.update([part])
    
    self.vocab = sorted(self.vocab)
    
    self.word2idx['<pad>'] = 0
    for index, word in enumerate(self.vocab):
      #if index < 20:
      #    print('Vocab[{}]: "{}"'.format(index, word))
      if word in ['<start>', 'B AE K', 'big']:
           print("Adding syllable: {} as index {}".format(word, index))
      self.word2idx[word] = index + 1
    
    for word, index in self.word2idx.items():
      self.idx2word[index] = word

In [0]:
def load_dataset(path, num_examples):
    # creating cleaned input, output pairs
    pairs = create_dataset(path, num_examples)
    print(type(pairs))
    print(len(pairs))
    print(len(pairs[0]))
    print(len(pairs[0][0]))
    print(len(pairs[1][1]))
    
    # index language using the class defined above    
    sylls = []
    for parts in (hk.split(' ') for en, hk in pairs):
        for word in parts:
            for syll in syllables[word]:
                sylls.append(syll)
    print('sylls[0:3] {}'.format(sylls[0:3]))
    print('# sylls: {}'.format(len(sylls)))
    #targ_lang = LanguageIndex(enumerate(syllables[word] for word in (parts for parts in (hk.split(' ') for en, hk in pairs))))
    targ_lang = LanguageIndex(sylls)
    
    print('Back[<start>]: {}'.format('<start>' in targ_lang.word2idx))
    print('Back[B AE K]: {}'.format('B AE K' in targ_lang.word2idx))
    
    # Vectorize the input and target languages
    
    # Raw input text, since that's what USE wants.
    input_tensor = []
    # Haiku lines
    target_tensor = []
    for i in range(len(pairs)):
        hk = pairs[i][1]
        syll_indexes = []
        if i < 5:
            print('haiku[{}]: {}'.format(i, hk))
        syll_count = 0
        for word in hk.split(' '):
            #print('word[{}][0]: {}: '.format(i, word))
            if word in syllables:
                for syll in syllables[word]:
                    # print('.  syll: ' + syll)
                    if syll not in ['<start>','<end>', ',', '.']:
                        syll_indexes.append(targ_lang.word2idx[syll])
                        syll_count += 1
            else:
                print('text[{}]: word had no syllables: {}'.format(i, word))
        if syll_count == 5:
            input_tensor.append(pairs[i][0])
            target_tensor.append(syll_indexes)
        else:
            print('wrong[{}] {}'.format(i, str(pairs[i][1])))
            print('.....: ' + str(syll_indexes))
                                                            
    print(target_tensor[0])
    
    return input_tensor, target_tensor, targ_lang, 5

In [0]:
syllables['<start>'] = ['<start>']
syllables['<end>'] = ['<end>']
syllables[','] = [',']
syllables['.'] = ['.']

In [10]:
# Try experimenting with the size of that dataset
num_examples = 1000
input_tensor, target_tensor, targ_lang, max_length_targ = load_dataset(path_to_file, num_examples)
num_syllables = len(targ_lang.idx2word)

<class 'list'>
1000
2
37
28
sylls[0:3] ['<start>', 'B AE K', 'Y AA R D Z']
# sylls: 7006
Adding syllable: <start> as index 2
Adding syllable: B AE K as index 39
Back[<start>]: True
Back[B AE K]: True
haiku[0]: <start> backyards to stockyards <end>
haiku[1]: <start> alameda street <end>
haiku[2]: <start> alameda street <end>
haiku[3]: <start> waiting for their ride <end>
haiku[4]: <start> of toilet paper <end>
wrong[575] <start> ready for use <end>
.....: [462, 141, 170, 640]
[40, 637, 595, 524, 637]


In [11]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print('{}, {}, {}, {}'.format(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)))
print(target_tensor_train[0])
# (x, 5, num_syllables)
def fromdict_sparse(syllables, indexes, size=5):
    tensor = np.zeros((len(indexes), size, num_syllables)) + 0.01
    for i in range(len(indexes)):
        for j in range(5):
                tensor[i][j][indexes[i]] = 0.99
    print(indexes[0])
    print(tensor[0])
    return tensor

def fromdict(syllables, indexes, size=5):
    tensor = np.zeros((len(indexes), size, 1)) + 0.01
    for i in range(len(indexes)):
        for j in range(5):
            tensor[i][j][0] = indexes[i][j]
    print(indexes[0])
    print(tensor[0])
    return tensor

799, 799, 200, 200
[16, 105, 557, 166, 103]


![alt text]### Create **model**

In [0]:
#BUFFER_SIZE = len(input_tensor_train)
#BATCH_SIZE = 64
#N_BATCH = BUFFER_SIZE//BATCH_SIZE
tcn_embedding_dim = 256
#units = 1024
vocab_tar_size = len(targ_lang.word2idx)

In [0]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [14]:
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)
# size of embedding passed between Encoder and Decoder
embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
print('USE embed size: {}'.format(embed_size))

def UniversalEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

Instructions for updating:
Colocations handled automatically by placer.


W0613 03:14:52.325236 139994818779008 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


USE embed size: 512


In [0]:
def gru(units):
  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)
  # the code automatically does that.
  # keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, return_sequences=False, return_state=False, stateful=False)
  if tf.test.is_gpu_available():
    return layers.CuDNNGRU(units, 
                               return_sequences=True, 
                               return_state=True, 
                               recurrent_initializer='glorot_uniform')
  else:
    return layers.GRU(units, 
                               return_sequences=True, 
                               return_state=True, 
                               recurrent_activation='sigmoid', 
                               recurrent_initializer='glorot_uniform')

In [16]:

# slow
num_epochs = 10
adam_lr = 0.001
adam_opt = Adam(lr=adam_lr)
nadam_opt = tf.contrib.opt.NadamOptimizer(adam_lr)
output_activation='sigmoid'
dropout=0.5


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [0]:
# from compiled_tcn

# https://github.com/keras-team/keras/pull/11373
# It's now in Keras@master but still not available with pip.
# TODO remove later.
def accuracy(y_true, y_pred):
            # reshape in case it's in shape (num_samples, 1) instead of (num_samples,)
            if K.ndim(y_true) == K.ndim(y_pred):
                y_true = K.squeeze(y_true, -1)
            # convert dense predictions to labels
            y_pred_labels = K.argmax(y_pred, axis=-1)
            y_pred_labels = K.cast(y_pred_labels, K.floatx())
            return K.cast(K.equal(y_true, y_pred_labels), K.floatx())



In [18]:

# changed accuracy from 'choose your own accuracy'
input_text = layers.Input(shape=(1,), dtype=tf.string)
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,), name='TF-Hub')(input_text)
embedding = layers.RepeatVector(5)(embedding)
tcn = TCN(nb_filters=64, kernel_size=2, nb_stacks=1, dilations=[1,2,4,8], padding='same',
            use_skip_connections=False, dropout_rate=0.05, return_sequences=True, name='tcn')(embedding)

print('tcn.shape=', tcn.shape)
output_layer = layers.Dense(vocab_tar_size)(tcn)
output_layer = layers.Activation('softmax')(output_layer)

model = Model(inputs=[input_text], outputs=output_layer)

model.compile(nadam_opt, loss='sparse_categorical_crossentropy', metrics=[accuracy])

model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0613 03:14:54.837409 139994818779008 saver.py:1483] Saver not created because there are no variables in the graph to restore


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0613 03:14:55.018860 139994818779008 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


tcn.shape= (?, 5, 64)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
TF-Hub (Lambda)                 (None, 512)          0           input_1[0][0]                    
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 5, 512)       0           TF-Hub[0][0]                     
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 5, 64)        32832       repeat_vector_1[0][0]            
_______________________________________________________________________________________

In [20]:
history = None
use_saved_model=False
if not use_saved_model or not os.path.exists('./model.h5'):
  with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    history = model.fit(np.array(input_tensor_train), 
            fromdict(syllables, target_tensor_train, 5),
            #np.array(target_tensor_train),
            validation_data=(np.array(input_tensor_val), fromdict(syllables, target_tensor_val, 5)),
            #validation_data=(np.array(input_tensor_val), np.array(target_tensor_val)),
            epochs=num_epochs,
            callbacks = [EarlyStopping(patience=5)],
            batch_size=32,
            verbose=2
    )
    model.save_weights('./model.h5')


[16, 105, 557, 166, 103]
[[ 16.]
 [105.]
 [557.]
 [166.]
 [103.]]
[407, 394, 16, 78, 140]
[[407.]
 [394.]
 [ 16.]
 [ 78.]
 [140.]]
Instructions for updating:
Use tf.cast instead.


W0613 03:16:12.119290 139994818779008 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


W0613 03:16:12.204053 139994818779008 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:102: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Train on 799 samples, validate on 200 samples
Epoch 1/10
 - 8s - loss: 5.4189 - accuracy: 0.1670 - val_loss: 4.9491 - val_accuracy: 0.1800
Epoch 2/10
 - 1s - loss: 4.5511 - accuracy: 0.2133 - val_loss: 4.6903 - val_accuracy: 0.2110
Epoch 3/10
 - 1s - loss: 4.1999 - accuracy: 0.2468 - val_loss: 4.5672 - val_accuracy: 0.2310
Epoch 4/10
 - 1s - loss: 3.9591 - accuracy: 0.2578 - val_loss: 4.4128 - val_accuracy: 0.2340
Epoch 5/10
 - 1s - loss: 3.7483 - accuracy: 0.2723 - val_loss: 4.2941 - val_accuracy: 0.2440
Epoch 6/10
 - 1s - loss: 3.5765 - accuracy: 0.2826 - val_loss: 4.2394 - val_accuracy: 0.2650
Epoch 7/10
 - 1s - loss: 3.4104 - accuracy: 0.2996 - val_loss: 4.2748 - val_accuracy: 0.2680
Epoch 8/10
 - 1s - loss: 3.2616 - accuracy: 0.3169 - val_loss: 4.2040 - val_accuracy: 0.2820
Epoch 9/10
 - 1s - loss: 3.1340 - accuracy: 0.3327 - val_loss: 4.1924 - val_accuracy: 0.3000
Epoch 10/10
 - 1s - loss: 3.0361 - accuracy: 0.3379 - val_loss: 4.2040 - val_accuracy: 0.3160


In [25]:

with tf.Session() as session:
  K.set_session(session)
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  #model.load_weights('./model.h5')  
  predicts = model.predict(np.array(input_tensor_train), batch_size=32)
  print('shape: {}'.format(predicts.shape))

print(len(predicts[0]))
print(len(predicts[0][0]))
print(predicts[0])

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7f527e2bf940>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1455, in __del__
    self._session._session, self._handle, status)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.CancelledError: Session has been closed.


shape: (799, 5, 649)
5
649
[[0.00150616 0.00147162 0.00150401 ... 0.00160234 0.0015857  0.00154726]
 [0.00151585 0.00146712 0.00151706 ... 0.00158246 0.00158248 0.00154761]
 [0.00151213 0.00149036 0.00151312 ... 0.00159387 0.00155391 0.00157698]
 [0.00147679 0.00148687 0.00152369 ... 0.00156343 0.0015447  0.00154435]
 [0.00148117 0.00151662 0.00153047 ... 0.00157927 0.001538   0.00156293]]


In [24]:
def maxindx(pred):
    maxi=0
    maxv=pred[0]
    for x in range(len(pred)-1):
        if pred[x + 1] > maxv:
            maxi = x + 1
    return (maxi, maxv)

for i in range(5):
    (ind, v) = maxindx(predicts[0][i])
    print('syll[{}] = {}, value={}'.format(i, ind, v))
    print('... "{}"'.format(targ_lang.idx2word[ind]))

syll[0] = 648, value=0.0015061587328091264
... "ZH ER"
syll[1] = 648, value=0.0015158526366576552
... "ZH ER"
syll[2] = 648, value=0.0015121296746656299
... "ZH ER"
syll[3] = 648, value=0.0014767881948500872
... "ZH ER"
syll[4] = 648, value=0.0014811678556725383
... "ZH ER"
