Use Keras model, use TCN, reuse haiku output from attention tests

In [0]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub


In [0]:
from __future__ import absolute_import, division, print_function

# Import TensorFlow >= 1.10 and enable eager execution
import tensorflow as tf


from keras.utils import plot_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import sys
import time

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import keras.layers as layers
from keras.models import Model
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Nadam, Adam


In [44]:
!pip uninstall -y deepmeter
!pip install git+https://github.com/LanceNorskog/deep_meter_2#egg=deepmeter
from cmu.syllables_cmu import syllables
print(syllables['therefore'])


Uninstalling DeepMeter-0.0.1:
  Successfully uninstalled DeepMeter-0.0.1
Collecting deepmeter from git+https://github.com/LanceNorskog/deep_meter_2#egg=deepmeter
  Cloning https://github.com/LanceNorskog/deep_meter_2 to /tmp/pip-install-_jyh5gbn/deepmeter
  Running command git clone -q https://github.com/LanceNorskog/deep_meter_2 /tmp/pip-install-_jyh5gbn/deepmeter
Building wheels for collected packages: deepmeter
  Building wheel for deepmeter (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-c7hll74b/wheels/80/6c/4f/1c43367a928a82b45cb3d36b4f23720b249748714869e28f13
Successfully built deepmeter
Installing collected packages: deepmeter
Successfully installed deepmeter-0.0.1


['DH EH R', 'F AO R']


In [45]:
# Mine
!wget -nc https://raw.githubusercontent.com/LanceNorskog/test_data/master/haiku_5.txt
path_to_file = 'haiku_5.txt'
!mkdir -p training_checkpoints


File ‘haiku_5.txt’ already there; not retrieving.



Load Haiku data. Filter for bogus inputs. Create 2-directional map of known syllables, and matching arrays of ['A sentence for you',...] and syllables indexes of [['AH', 'S EH N', 'T EH NS', 'F OR', 'YU']]. Use CMU Pronunciation Dictionary for syllables (some guy's syllabized version).

In [0]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [0]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [Long clause, Haiku line]
def create_dataset(path, num_examples):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    
    return word_pairs

In [0]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()
    
    self.create_index()
    
  def create_index(self):
    for word in self.lang:
        #print(type(word))
        #print(word)
        if type(word) == type(''):
            #if not word in self.vocab:
             #   print('. adding: {}'.format(word))
            #print(word)
            self.vocab.update([word])
        else:
            for part in word:
               #if not part in self.vocab:
               #    print('. adding: {}'.format(part))
               self.vocab.update([part])
    
    self.vocab = sorted(self.vocab)
    
    self.word2idx['<pad>'] = 0
    for index, word in enumerate(self.vocab):
      #if index < 20:
      #    print('Vocab[{}]: "{}"'.format(index, word))
      if word in ['<start>', 'B AE K', 'big']:
           print("Adding syllable: {} as index {}".format(word, index))
      self.word2idx[word] = index + 1
    
    for word, index in self.word2idx.items():
      self.idx2word[index] = word

In [0]:
def load_dataset(path, num_examples):
    # creating cleaned input, output pairs
    pairs = create_dataset(path, num_examples)
    print(type(pairs))
    print(len(pairs))
    print(len(pairs[0]))
    print(len(pairs[0][0]))
    print(len(pairs[1][1]))
    
    # index language using the class defined above    
    sylls = []
    for parts in (hk.split(' ') for en, hk in pairs):
        for word in parts:
            for syll in syllables[word]:
                sylls.append(syll)
    print('sylls[0:3] {}'.format(sylls[0:3]))
    print('# sylls: {}'.format(len(sylls)))
    #targ_lang = LanguageIndex(enumerate(syllables[word] for word in (parts for parts in (hk.split(' ') for en, hk in pairs))))
    targ_lang = LanguageIndex(sylls)
    
    print('Back[<start>]: {}'.format('<start>' in targ_lang.word2idx))
    print('Back[B AE K]: {}'.format('B AE K' in targ_lang.word2idx))
    
    # Vectorize the input and target languages
    
    # Raw input text, since that's what USE wants.
    input_tensor = []
    # Haiku lines
    target_tensor = []
    for i in range(len(pairs)):
        hk = pairs[i][1]
        syll_indexes = []
        if i < 5:
            print('haiku[{}]: {}'.format(i, hk))
        syll_count = 0
        for word in hk.split(' '):
            #print('word[{}][0]: {}: '.format(i, word))
            if word in syllables:
                for syll in syllables[word]:
                    # print('.  syll: ' + syll)
                    syll_indexes.append(targ_lang.word2idx[syll])
                    if syll not in ['<start>','<end>', ',', '.']:
                        syll_count += 1
            else:
                print('text[{}]: word had no syllables: {}'.format(i, word))
        if syll_count == 5:
            input_tensor.append(pairs[i][0])
            target_tensor.append(syll_indexes)
        else:
            print('wrong[{}] {}'.format(i, str(pairs[i][1])))
            print('.....: ' + str(syll_indexes))
                                                            
    print(target_tensor[0])
    
    return input_tensor, target_tensor, targ_lang, 5

In [0]:
syllables['<start>'] = ['<start>']
syllables['<end>'] = ['<end>']
syllables[','] = [',']
syllables['.'] = ['.']

In [51]:
# Try experimenting with the size of that dataset
num_examples = 1000
input_tensor, target_tensor, targ_lang, max_length_targ = load_dataset(path_to_file, num_examples)
num_syllables = len(targ_lang.idx2word)

<class 'list'>
1000
2
37
28
sylls[0:3] ['<start>', 'B AE K', 'Y AA R D Z']
# sylls: 7006
Adding syllable: <start> as index 2
Adding syllable: B AE K as index 39
Back[<start>]: True
Back[B AE K]: True
haiku[0]: <start> backyards to stockyards <end>
haiku[1]: <start> alameda street <end>
haiku[2]: <start> alameda street <end>
haiku[3]: <start> waiting for their ride <end>
haiku[4]: <start> of toilet paper <end>
wrong[575] <start> ready for use <end>
.....: [3, 462, 141, 170, 640, 2]
[3, 40, 637, 595, 524, 637, 2]


In [0]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)
def fromdict(syllables, indexes, size=5):
    tensor = np.zeros((len(indexes), num_syllables * size)) + 0.01
    for i in range(len(indexes)):
        for j in range(size):
            tensor[i][indexes[j]] = 0.99
    print(indexes[0])
    print(tensor[0])
    return tensor

![alt text]### Create **model**

In [0]:
#BUFFER_SIZE = len(input_tensor_train)
#BATCH_SIZE = 64
#N_BATCH = BUFFER_SIZE//BATCH_SIZE
tcn_embedding_dim = 256
#units = 1024
vocab_tar_size = len(targ_lang.word2idx)

In [0]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [55]:
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)
# size of embedding passed between Encoder and Decoder
embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
print('USE embed size: {}'.format(embed_size))

def UniversalEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

USE embed size: 512


In [0]:
def gru(units):
  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)
  # the code automatically does that.
  # keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, return_sequences=False, return_state=False, stateful=False)
  if tf.test.is_gpu_available():
    return layers.CuDNNGRU(units, 
                                    return_sequences=True, 
                                    return_state=True, 
                                    recurrent_initializer='glorot_uniform')
  else:
    return layers.GRU(units, 
                               return_sequences=True, 
                               return_state=True, 
                               recurrent_activation='sigmoid', 
                               recurrent_initializer='glorot_uniform')

In [0]:

# slow
num_epochs = 10
adam_lr = 0.001
adam_opt = Adam(lr=adam_lr)
nadam_opt = tf.contrib.opt.NadamOptimizer(adam_lr)
output_activation='sigmoid'
dropout=0.5

In [58]:

# changed accuracy from 'choose your own accuracy'
input_text = layers.Input(shape=(1,), dtype=tf.string)
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,), name='TF-Hub')(input_text)
embedding = layers.Dropout(0.5)(embedding)
dense = layers.Dense(1024, activation='relu', name='Convoluted')(embedding)
dense = layers.Dropout(0.5)(dense)
pred = layers.Dense(5 * vocab_tar_size, activation=output_activation, name='Flatout')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='binary_crossentropy', 
              optimizer=nadam_opt, 
              metrics=['binary_crossentropy']
             )
model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0612 02:29:48.620183 139867708565376 saver.py:1483] Saver not created because there are no variables in the graph to restore


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
TF-Hub (Lambda)              (None, 512)               0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 512)               0         
_________________________________________________________________
Convoluted (Dense)           (None, 1024)              525312    
_________________________________________________________________
dropout_12 (Dropout)         (None, 1024)              0         
_________________________________________________________________
Flatout (Dense)              (None, 3245)              3326125   
Total params: 3,851,437
Trainable params: 3,851,437
Non-trainable params: 0
_________________________________________________________________


In [59]:
history = None
use_saved_model=False
if not use_saved_model or not os.path.exists('./model.h5'):
  with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    history = model.fit(np.array(input_tensor_train), 
            fromdict(syllables, target_tensor_train, 5),
            validation_data=(np.array(input_tensor_val), fromdict(syllables, target_tensor_val, 5)),
            epochs=num_epochs,
            callbacks = [EarlyStopping(patience=5)],
            batch_size=32,
            verbose=2
    )
    model.save_weights('./model.h5')


[3, 24, 16, 497, 583, 334, 2]
[0.01 0.01 0.99 ... 0.01 0.01 0.01]
[3, 16, 58, 494, 286, 530, 2]
[0.01 0.01 0.99 ... 0.01 0.01 0.01]
Instructions for updating:
Use tf.cast instead.


W0612 02:29:59.328920 139867708565376 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Train on 799 samples, validate on 200 samples
Epoch 1/10
 - 11s - loss: 0.2716 - binary_crossentropy: 0.2716 - val_loss: 0.1072 - val_binary_crossentropy: 0.1072
Epoch 2/10
 - 1s - loss: 0.0645 - binary_crossentropy: 0.0645 - val_loss: 0.1120 - val_binary_crossentropy: 0.1120
Epoch 3/10
 - 1s - loss: 0.0596 - binary_crossentropy: 0.0596 - val_loss: 0.0977 - val_binary_crossentropy: 0.0977
Epoch 4/10
 - 1s - loss: 0.0582 - binary_crossentropy: 0.0582 - val_loss: 0.0979 - val_binary_crossentropy: 0.0979
Epoch 5/10
 - 1s - loss: 0.0581 - binary_crossentropy: 0.0581 - val_loss: 0.0978 - val_binary_crossentropy: 0.0978
Epoch 6/10
 - 1s - loss: 0.0576 - binary_crossentropy: 0.0576 - val_loss: 0.0977 - val_binary_crossentropy: 0.0977
Epoch 7/10
 - 1s - loss: 0.0578 - binary_crossentropy: 0.0578 - val_loss: 0.0977 - val_binary_crossentropy: 0.0977
Epoch 8/10
 - 1s - loss: 0.0575 - binary_crossentropy: 0.0575 - val_loss: 0.0975 - val_binary_crossentropy: 0.0975
Epoch 9/10
 - 1s - loss: 0.0575 -