Use CyberZHG's Keras add-on layers for NLP (attention, multi-head attention, etc.).

In [1]:
# Mount Google Drive; the model checkpoint path configured later lives under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Environment setup: the numpy pin below is disabled, Keras is pinned to 2.2.3.
#!pip install numpy==1.16.1
!pip install keras==2.2.3
# Fetch the corpus once (-nc skips the download when the file already exists).
!wget -nc https://raw.githubusercontent.com/LanceNorskog/deep_meter_2/master/haiku_5.txt
# Keep only the second tab-separated column, sorted and de-duplicated, as the "short" file.
!cut -f2 < haiku_5.txt | sort | uniq > haiku_5_short.txt
# Line counts of both files as a quick sanity check.
!wc -l haiku_5*.txt

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
File ‘haiku_5.txt’ already there; not retrieving.

   95631 haiku_5_short.txt
  673680 haiku_5.txt
  769311 total


In [2]:
# Remove any previously installed copy, then reinstall deepmeter from GitHub so the
# kernel picks up the current state of the repository.
# NOTE(review): presumably this package supplies the cmu.* and keras_stuff.* modules
# imported in the next cell - confirm.
!pip uninstall -qy git+https://github.com/LanceNorskog/deep_meter_2#egg=deepmeter
!pip install -q git+https://github.com/LanceNorskog/deep_meter_2#egg=deepmeter


  Building wheel for deepmeter (setup.py) ... [?25l[?25hdone


In [3]:

from __future__ import print_function
import math
import pickle
import json
import os
import glob
import time

import numpy as np
import tensorflow as tf
import keras.backend as K
import matplotlib.pyplot as plt

from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model, Sequential, load_model
from keras import layers 
from keras import metrics
from keras.preprocessing import text
from sklearn.model_selection import train_test_split

from cmu.syllables_cmu import syllables as word2sylls
from cmu.mappers import Decoder, trim_homynyms
from cmu.full import FullSearch
from cmu.topk import get_top_k, decodem, short_sentences
from cmu.wordmap import Wordmap
from cmu.readhaiku import Reader

#from cmu.report import find_top_k_match, report
from keras_stuff.loss import sparse_categorical_crossentropy as scc
#from keras_stuff.loss import sparse_categorical_crossentropy_temporal as scct
import keras_stuff.metrics as my_metrics

# Smoke test: show the syllable table entry for one word.
print(word2sylls['therefore'])

# number of total samples to use (forwarded to reader.readfile as max_data)
max_data = 100000
# modulus for the hashing trick: forwarded to reader.gethash and used as the
# input dimension of the Embedding layer
hash_mole = 20000
# size of the softmax output layer, i.e. the number of syllable classes the model can emit
max_features = 17000
# longest output sentence, in syllables; also the RepeatVector length of the decoder
num_sylls = 5
# longest input sentence, in words; also the length of the model's input layer
max_words = 12
# mini-batch size for training; the train/test sets are trimmed to a multiple of it
batch_size = 32
# do not output the same haiku twice (forwarded to reader.readfile)
deduplicate_haiku=False
# emit output as input (forwarded to reader.readfile)
# NOTE(review): presumably adds haiku -> haiku training pairs - confirm in Reader.
duplicate_haiku=True
# use the long source text (big_text) rather than the haiku text (big_haiku) as model input
use_big_text=True

# Checkpoint location on the mounted Google Drive; the weights file is model_base + ".h5".
model_base="/content/gdrive/My Drive/Colab Notebooks/haiku_zhg_5"
model_file=model_base + ".h5"
print(model_file)


Using TensorFlow backend.


['DH EH R', 'F AO R']
/content/gdrive/My Drive/Colab Notebooks/haiku_zhg_5.h5


In [4]:
# Timestamp the run.
!date
# Print the same entry before and after the (currently disabled) homonym trimming;
# with the trim commented out both prints show the same value.
print(word2sylls['door'])
#word2sylls = trim_homynyms(word2sylls)
print(word2sylls['door'])
# The decoder holds the syllable <-> index tables built from the syllable dictionary.
decoder = Decoder(word2sylls)
syll2idx = decoder.syll2idx
idx2syll = decoder.idx2syll

print(syll2idx['DH EH R'], idx2syll[1])
print('# features: ', len(idx2syll))

# Give the first `wordoff` word slots and the first `sylloff` syllable slots printable
# placeholder names.
# NOTE(review): these look like reserved/special indices - confirm in Decoder.
for i in range(decoder.wordoff):
    decoder.wordlist[i] = 'word{}'.format(i)
    decoder.wordlength[i] = 1
for i in range(decoder.sylloff):
    decoder.idx2syll[i] = 'syll{}'.format(i)

# Read the corpus. readfile returns three parallel collections: the long source text,
# the haiku text, and the encoded targets (printed below for the first sample).
big_haiku_file = "haiku_5.txt"
wordmap = Wordmap(len(decoder.wordlist))
reader = Reader(word2sylls, decoder, wordmap)
(big_text, big_haiku, big_data) = reader.readfile(big_haiku_file, max_words=max_words, 
    deduplicate_haiku=deduplicate_haiku, duplicate_haiku=duplicate_haiku, max_data=max_data)
# Choose which text is hashed into the model input.
if use_big_text:
    input_text = big_text
else:
    input_text = big_haiku
big_hash = reader.gethash(input_text, max_words=max_words, hash_mole=hash_mole)
# Kept for the decoding cell, which passes it to decodem.
haikuwordset = reader.haikuwordset
print('{} -> {} : {}'.format(big_text[0], big_haiku[0], big_data[0]))

print('Full length clauses: ', len(big_text))
print('Wordmap total entries: ', wordmap.count())
print('Wordmap length: ', wordmap.length())

Mon Jul 29 02:10:16 UTC 2019
['D AO R']
['D AO R']
2443 0
# features:  15098
a white sink and door -> a white sink and door : [[  156]
 [14238]
 [10115]
 [  125]
 [ 1844]]
Full length clauses:  100001
Wordmap total entries:  12388
Wordmap length:  229463


In [5]:
# Split multiple datasets across same index
(train_i, test_i, _, _) = train_test_split(np.arange(len(big_data)), np.arange(len(big_data)))

train_len=(len(train_i)//batch_size) * batch_size
test_len=(len(test_i)//batch_size) * batch_size
x_train = big_hash[train_i][:train_len]
y_train = big_data[train_i][:train_len]
x_test = big_hash[test_i][-test_len:]
y_test = big_data[test_i][-test_len:]

print(input_text[train_i[0]], x_train[0], str(y_test[0]))

def get_lstm(size, return_sequences=True):
    """Create the recurrent layer used throughout the model.

    size: number of units in the layer.
    return_sequences: forwarded unchanged to the layer constructor.

    Returns a layers.CuDNNLSTM instance.
    """
    # Plain-LSTM alternative: layers.LSTM(size, return_sequences=return_sequences)
    recurrent = layers.CuDNNLSTM(size, return_sequences=return_sequences)
    return recurrent

# Report the shape of every split, then the first target step of the first test sample.
for _name, _arr in (('x_train', x_train), ('x_test', x_test),
                    ('y_train', y_train), ('y_test', y_test)):
    print('{} shape:'.format(_name), _arr.shape)
print(y_test[0][0])



her hands on her hips [18127. 13619.   815. 18127.  3193.     0.     0.     0.     0.     0.
     0.     0.] [[2425]
 [ 394]
 [ 208]
 [4724]
 [ 575]]
x_train shape: (74976, 12)
x_test shape: (24992, 12)
y_train shape: (74976, 5, 1)
y_test shape: (24992, 5, 1)
[2425]


In [6]:


# --- Hyperparameters ---
embed_size=512
# units_k / units_v are not referenced by the model built in this cell; they are kept
# because other cells may rely on them.
units_k=embed_size
units_v=embed_size//3
units=512
dropout=0.5
# Training length and early-stopping patience. Patience has to be smaller than the
# number of epochs, otherwise EarlyStopping can never trigger.
epochs=5
early_stop_patience=2

metric_list = [my_metrics.sparse, my_metrics.perfect]
# Names under which Keras reports the metrics above ('val_' + name on validation data).
metric_names = ['sparse', 'perfect']

# --- Model: hashed words -> embedding -> BiLSTM summary vector -> repeated num_sylls
# times -> LSTM -> per-step softmax over max_features classes ---
hash_input = layers.Input(shape=(max_words,), dtype='int32')
x = layers.Embedding(hash_mole, embed_size, input_length=max_words)(hash_input)
x = layers.Dropout(dropout)(x)
# return_sequences=False: the encoder emits a single vector per input sentence.
x = layers.Bidirectional(get_lstm(units//2, return_sequences=False))(x)
x = layers.RepeatVector(num_sylls)(x)
x = get_lstm(units, return_sequences=True)(x)
x = layers.Dropout(dropout)(x)
output_layer = layers.Dense(max_features, activation='softmax')(x)
#output_layer = layers.TimeDistributed(layers.Dense(max_features, activation='softmax'))(x)


model = Model(inputs=[hash_input], outputs=[output_layer])
model.compile('adam', loss='sparse_categorical_crossentropy', metrics=metric_list)
model.summary()

# try using different optimizers and different optimizer configs

print('Train...')
# history stays None when training is skipped because saved weights already exist.
history = None
use_saved_model=True
if not use_saved_model or not os.path.exists(model_file):
  with tf.Session() as session:
    K.manual_variable_initialization(False)
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    # The session is closed when this block exits, so the best weights are persisted
    # by ModelCheckpoint and reloaded from model_file by the later cells.
    history = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[EarlyStopping(monitor='val_perfect', mode='max', verbose=1, patience=early_stop_patience),
            ModelCheckpoint(model_file, monitor='val_perfect', save_best_only=True, save_weights_only=True, mode='max', verbose=1)],
          verbose=2,
          validation_data=(x_test, y_test))


W0729 02:10:22.017467 140112299325312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0729 02:10:22.032557 140112299325312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0729 02:10:22.035873 140112299325312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0729 02:10:22.048638 140112299325312 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0729 02:10:22.057490 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 12)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 12, 512)           10240000  
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 512)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               1576960   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 512)            0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 5, 512)            2101248   
_________________________________________________________________
dropout_2 (Dropout)          (None, 5, 512)            0         
__________

In [7]:

# Plot the validation metrics per epoch. history is None when training was skipped in
# favour of saved weights; the figure is only created when there is something to draw,
# so no empty figure is emitted in that case.
if history is not None:
  plt.figure()
  legend_labels = []
  for m in metric_names:
      val_curve = history.history['val_' + m]
      plt.plot(val_curve)
      # The legend shows the value reached in the last epoch.
      legend_labels.append('{}={:01.3f}'.format(m, val_curve[-1]))
  plt.title('model accuracy (dropout={})'.format(dropout))
  plt.xlabel('epoch')
  plt.ylabel('validation metric')
  plt.legend(legend_labels, loc='lower right')
  plt.show()

<Figure size 432x288 with 0 Axes>

In [8]:
# Evaluate the saved weights on the held-out split.
with tf.Session() as session:
  K.set_session(session)
  # Initialise the variables of the fresh session, then overwrite them with the checkpoint.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights(model_file)
  print('x_test.shape ', x_test.shape)
  print('y_test.shape ', y_test.shape)
  eval_small = model.evaluate(x_test, y_test)
  print('model.evaluate on test data: ' ,model.metrics_names, eval_small)
  # None when training was skipped in this kernel session.
  print('history: ', history)


x_test.shape  (24992, 12)
y_text.shape  (24992, 5, 1)
model.evaluate on test data:  ['loss', 'sparse', 'perfect'] [0.5310620646661436, 0.8898607555027961, 0.7236715749039693]
history:  None


In [0]:
def find_top_k_match(data, prediction, top_k=5):
    """Find, per row, the true label among the top_k highest-scoring classes.

    data: indexable of rows; data[i][0] is the true class index of row i.
    prediction: indexable of 1-D score arrays; prediction[i] holds one score per class.
    top_k: how many of the highest-scoring classes to consider.

    Returns a list with one entry per row: the matching class index when the true
    label is among the top_k candidates, otherwise -1.
    """
    out = [-1] * len(data)
    for i in range(len(data)):
        order = np.argsort(prediction[i])
        # Iterate over the candidates that actually exist: when top_k exceeds the
        # number of classes the slice is shorter than top_k, and indexing it with
        # range(top_k) would raise IndexError. A non-positive top_k yields no
        # candidates (a bare [-0:] slice would select every class).
        candidates = order[-top_k:] if top_k > 0 else order[:0]
        target = data[i][0]
        for candidate in candidates:
            if target == candidate:
                out[i] = candidate
    return out
    
def report(data, prediction):
    """Score predictions against targets, sample by sample.

    data: indexable of samples; data[n] is a 2-D array whose rows hold the true
        class index in column 0 (one row per output step).
    prediction: indexable of samples; prediction[n] is a 2-D array with one row of
        class scores per output step.

    Returns a dict with four averages over the scored samples:
        'sparse'   - mean fraction of steps whose top-1 class is correct
        'perfect'  - fraction of samples with every step correct at top-1
        'sparse5'  - mean fraction of steps whose true class is in the top 5
        'perfect5' - fraction of samples with every step in the top 5
    All four are 0.0 when there is no sample with at least one step.
    """
    def match(data, prediction):
        """Count top-1 hits, top-5 hits and steps for a single sample."""
        assert len(data.shape) == 2
        assert len(prediction.shape) == 2
        good = 0
        top5 = 0
        count = 0
        for i in range(len(data)):
            topind = np.argsort(prediction[i])
            target = data[i][0]
            if target == topind[-1]:
                good += 1
            # The slice is simply shorter when there are fewer than five classes.
            if target in topind[-5:]:
                top5 += 1
            count += 1
        return (good, top5, count)

    _sparse = 0.0
    _perfect = 0.0
    _sparse5 = 0.0
    _perfect5 = 0.0
    _total = 0
    for n in range(len(data)):
        # Score against the `prediction` argument, not a notebook-level global.
        (good, top5, count) = match(data[n], prediction[n])
        if count == 0:
            continue
        _sparse += good / float(count)
        _sparse5 += top5 / float(count)
        if good == count:
            _perfect += 1
        if top5 == count:
            _perfect5 += 1
        _total += 1
    if _total == 0:
        # Nothing to average over; avoid dividing by zero.
        return {'sparse': 0.0, 'perfect': 0.0, 'sparse5': 0.0, 'perfect5': 0.0}
    return {'sparse':_sparse/_total, 'perfect': _perfect/_total, 'sparse5': _sparse5/_total, 'perfect5': _perfect5/_total}


In [10]:
top_k=2

bigbatch = batch_size * 32
# Samples to decode and their source text, taken from the same split so the loop
# bound, the model input and the printed text always refer to the same rows.
# (Swap in x_test / input_text[test_i][-test_len:] to decode held-out data instead.)
sample_x = x_train
# Materialise the reordered text once instead of once per printed sentence.
sample_text = input_text[train_i]
with tf.Session() as session:
  K.set_session(session)
  # Initialise the variables of the fresh session, then overwrite them with the checkpoint.
  session.run(tf.global_variables_initializer())
  session.run(tf.tables_initializer())
  model.load_weights(model_file)
  biglen = len(sample_x)
  for i in range(0, biglen, bigbatch):
      predicts = model.predict(sample_x[i:i + bigbatch], batch_size=bigbatch)
      for j in range(0, len(predicts)):
          # A new search object is built for every sample, as in the original loop.
          fs = FullSearch(num_sylls * 5, num_sylls, top_k)
          (top_vals, top_paths) = get_top_k(predicts[j], top_k=top_k)
          fs.mainloop(top_paths)
          sentences = decodem(fs.scorepaths, top_paths, decoder, haikuwordset, wordmap)
          if len(sentences) > 0:
              for s in short_sentences(sentences, num_sylls):
                    print('{} -> {}'.format(sample_text[i + j], s))
    

(5, 17000)
predict.shape:  (5, 2)
her hands on her hips -> his hands on her hips
(5, 17000)
predict.shape:  (5, 2)
a safari truck -> a safari truck
(5, 17000)
predict.shape:  (5, 2)
sitting on a motorcycle, wearing a jacket -> a motorcycle
sitting on a motorcycle, wearing a jacket -> a motor cycle
(5, 17000)
predict.shape:  (5, 2)
tending to goats on top of a van -> a top of a van
tending to goats on top of a van -> on top of a van
(5, 17000)
predict.shape:  (5, 2)
(5, 17000)
predict.shape:  (5, 2)
top of a block wall -> top of a stone wall
top of a block wall -> front of a stone wall
top of a block wall -> top of a block wall
(5, 17000)
predict.shape:  (5, 2)
above an intersection -> a intersection
(5, 17000)
predict.shape:  (5, 2)
a motorcycle -> his motorcycle
a motorcycle -> a motorcycle
a motorcycle -> his motor scooter
a motorcycle -> a motor scooter
a motorcycle -> his motor cycle
a motorcycle -> a motor cycle
(5, 17000)
predict.shape:  (5, 2)
a huge docked sail boat -> a huge d

KeyboardInterrupt: ignored