# [Keras + Universal Sentence Encoder = Deep Meter](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/)

This notebook creates an autoencoder using the Universal Sentence Encoder. The autoencoder output is CMUdict syllables. The dataset is that subset of Allison Parrish's Project Gutenberg poetry archive which happens to scan in iambic pentameter.

The notebook is based on Chengwei Zhang's example of wrapping the USE inside a larger TensorFlow model that is saved as a Keras model (without saving the USE itself in the TF model).

The Universal Sentence Encoder makes getting sentence level embeddings as easy as it has historically been to lookup the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.

Since there are 10 one-hot values (one per syllable position), each over a set of about 6k syllables, this is "multi-label classification".

Changes for multi-label classification:
- sigmoid activation instead of softmax
- binary_crossentropy loss

Text format is tab-separated with 2 columns: first the text, second a multi-level array of syllables, e.g. `text<TAB>[['syll', 'la', 'ble'], ...]`.

Multi-output version

# Getting Started

This section sets up the environment for access to the Universal Sentence Encoder on TF Hub and provides examples of applying the encoder to words, sentences, and paragraphs.

In [1]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install pygtrie
#%cd /content
!git clone https://github.com/LanceNorskog/deep_meter || true
%cd /content/deep_meter
!git pull
# could not figure out how to read gzipped files as text!
!gunzip -qf blobs/*.gz || true
!gunzip -qf prepped_data/*.gz || true

Collecting pygtrie
  Downloading https://files.pythonhosted.org/packages/9d/42/f70a09ce102fa2fc1c54df26f71ecbf0a38e78c1da0b1b58bcf539cf2e94/pygtrie-2.3.tar.gz
Building wheels for collected packages: pygtrie
  Running setup.py bdist_wheel for pygtrie ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/3c/d0/b1/c8f2bbb9dc1fd0e25acde4d81972055b426430630f99395b8d
Successfully built pygtrie
Installing collected packages: pygtrie
Successfully installed pygtrie-2.3
Cloning into 'deep_meter'...
remote: Enumerating objects: 122, done.[K
remote: Counting objects: 100% (122/122), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 365 (delta 64), reused 88 (delta 33), pack-reused 243[K
Receiving objects: 100% (365/365), 24.34 MiB | 24.01 MiB/s, done.
Resolving deltas: 100% (195/195), done.
/content/deep_meter
Already up to date.


In [2]:
# boilerplate from base notebook
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Nadam, Adam
import gc
from google.colab import files
from google.colab import drive

import pickle
np.random.seed(10)

Using TensorFlow backend.


In [0]:
# github deep_meter code
import utils
# should not need this to use utils.flatten but is true anyway?
from itertools import chain, product
import subprocess
import syllables
import decodesyllables
import cmudict
# misc for this notebook
from ast import literal_eval

import scipy



In [0]:
# read classified poetry lines: text tab [['syll', 'la', 'ble'], ...]
# clip to only most common syllables with syllable manager
# ['words', ...], [[[0,0,1,0], ...]]
def get_data(filename, syll_mgr, num_symbols, max_lines=55000):
    """Load classified poetry lines and one-hot encode their syllables.

    Each line of the file is ``text<TAB>[['syll', 'la', 'ble'], ...]``: the
    text of a line of verse, a tab, and a Python literal holding a nested list
    of syllables. The nested list is flattened with ``utils.flatten`` and the
    line is kept only if it has exactly ``num_symbols`` syllables.

    Args:
        filename: path of the tab-separated text file to read.
        syll_mgr: syllable manager; must provide ``get_size()`` and
            ``get_encoding(syllable)`` (the index set to 1 in the one-hot row).
        num_symbols: required number of syllables per line.
        max_lines: maximum number of lines read from the start of the file
            (counted before filtering on syllable count).

    Returns:
        A tuple ``(text_lines, label_array)`` where ``text_lines`` is the list
        of kept texts and ``label_array`` is an ``int8`` array of shape
        ``(num_symbols, len(text_lines), syll_mgr.get_size())`` with
        ``label_array[slot, line, encoding] == 1``.
    """
    num_syllables = syll_mgr.get_size()
    # Use a context manager so the file handle is closed even if reading fails
    # (the handle was previously left open).
    with open(filename, 'r') as infile:
        lines = infile.read().splitlines()
    num_lines = min(max_lines, len(lines))
    text_lines = []
    text_sylls = []
    for i, line in enumerate(lines):
        if i == num_lines:
            break
        parts = line.split("\t")
        label = utils.flatten(literal_eval(parts[1]))
        if len(label) == num_symbols:
            text_lines.append(str(parts[0]))
            text_sylls.append(label)
    # One one-hot plane per syllable slot: [slot][line][syllable encoding].
    label_array = np.zeros((num_symbols, len(text_lines), num_syllables), dtype=np.int8)
    for i, sylls in enumerate(text_sylls):
        for j in range(num_symbols):
            label_array[j, i, syll_mgr.get_encoding(sylls[j])] = 1

    return (text_lines, label_array)


In [0]:
# Author's note: syllables are in descending order of occurrence, about 6k in
# gutenberg.iambic_pentameter and 15k in total.
# Iambic pentameter: ten syllable slots per line.
num_symbols = 10
# The syllable manager is built with its default size here; the earlier
# variant `syllables.syllables(num_syllables)` was used to restrict it.
syll_mgr = syllables.syllables()
num_syllables = syll_mgr.get_size()
counts = syll_mgr.get_counts()
maxim = np.max(counts)
# Weight per syllable index: the inverse of its count relative to the largest
# count, so the entry with the largest count gets weight 1. Entries with a
# zero count get weight 0.
syll_weights = {
    idx: (1/(counts[idx]/maxim) if counts[idx] > 0 else 0)
    for idx in range(len(counts))
}



In [6]:
# Load the train and test splits as (texts, one-hot label array) pairs.
# For a quick debugging run, slice the returned values down to a small subset.
train_text, train_label = get_data(
    'prepped_data/gutenberg.iambic_pentameter.train', syll_mgr, num_symbols)
test_text, test_label = get_data(
    'prepped_data/gutenberg.iambic_pentameter.test', syll_mgr, num_symbols)

num_training = len(train_text)
num_testing = len(test_text)

# Report how many lines were kept and the shape of each label array.
for summary in (num_training, num_testing, train_label.shape, test_label.shape):
    print(summary)

51638
4200
(10, 51638, 6635)
(10, 4200, 6635)


## Train Keras model and save weights
This only trains and saves our Keras layers, not the embed module's weights.

## Make predictions

In [7]:
# load pre-saved predictions
!cp /content/gdrive/My\ Drive/Colab\ Notebooks/predictions.pkl ./predictions.pkl 
with open("./predictions_syllables.pkl", "rb") as f:
    predicts = pickle.load(f)
print("Number of predictions: " + str(len(predicts[0])))

cp: cannot stat '/content/gdrive/My Drive/Colab Notebooks/predictions.pkl': No such file or directory


FileNotFoundError: ignored

In [0]:
num_tests = len(predicts[0])
# Minimum score for a syllable to count as a candidate for a slot.
SCORE_THRESHOLD = 0.1
# Collect the candidate syllables from each output model:
# index_arrays[line][slot] is the ascending list of syllable indexes whose
# predicted score exceeds SCORE_THRESHOLD. predicts is indexed as
# predicts[slot][line][syllable].
# A vectorised scan replaces a Python loop over every syllable of every slot
# of every line, and building fresh lists avoids the `[[]] * n` aliasing trap.
index_arrays = [
    [
        np.flatnonzero(
            np.asarray(predicts[i][j][:num_syllables]) > SCORE_THRESHOLD
        ).tolist()
        for i in range(num_symbols)
    ]
    for j in range(num_tests)
]

print(index_arrays[0])

NameError: ignored

In [0]:
# For every test line, pick the best-scoring candidate syllable in each slot,
# print the resulting syllable sequence, and print what the decoder makes of it.
decoder = decodesyllables.Decoder(cmudict.CMUDict())
for j in range(num_tests):
    slot_candidates = index_arrays[j]
    # A line cannot be decoded if any slot has no candidate syllable.
    if any(len(candidates) == 0 for candidates in slot_candidates):
        continue

    # Each slot is scored independently, so the best combination is simply the
    # best candidate of every slot; enumerating the full cartesian product of
    # candidates (exponential in the number of slots) is unnecessary.
    best_indexes = [
        max(slot_candidates[i], key=lambda k: predicts[i][j][k])
        for i in range(num_symbols)
    ]
    # NOTE(review): the line score is taken as the sum of the per-slot scores.
    # The earlier code called `decodewords.sum_scores`, which is not imported
    # in this notebook -- confirm this is the intended scoring.
    top_score = sum(predicts[i][j][k] for i, k in enumerate(best_indexes))
    print("Top score = {0}".format(top_score))

    # Map syllable indexes back to syllables.
    # NOTE(review): assumes get_syll takes the index produced by get_encoding.
    syll_test = [syll_mgr.get_syll(k) for k in best_indexes]

    print(syll_test)
    for s in decoder.decode_sentence(syll_test, 12):
        print(s)


In [0]:
# NOTE(review): this cell cannot run as written. `df_train` is not defined in
# any cell of this notebook (see the NameError recorded in the cell output),
# and `predicts` is indexed elsewhere as predicts[slot][line][syllable], so
# `predicts.argmax(axis=1)` presumably does not apply to it either.
# It looks like a leftover from the single-label base notebook -- confirm and
# delete, or adapt it to the per-slot predictions.
categories = df_train.label.cat.categories.tolist()
# Index of the highest-scoring category for each prediction row.
predict_logits = predicts.argmax(axis=1)
print("Categorie: {0}".format(categories))
# Translate category indexes back to category names.
predict_labels = [categories[logit] for logit in predict_logits]
predict_labels

NameError: ignored

In [0]:

# Clean up the saved model weights. Skip quietly when the file does not exist
# (e.g. on a fresh kernel where no model was saved) instead of raising.
if os.path.exists('./model_syllables.h5'):
    os.remove('./model_syllables.h5')

In [0]:
# Clean up the local copy of the predictions. Skip quietly when the file does
# not exist (e.g. the copy from Drive failed) instead of raising.
if os.path.exists('./predictions_syllables.pkl'):
    os.remove('./predictions_syllables.pkl')