In [10]:
# Mount Google Drive; a later cell copies the saved predictions pickle from it.
from google.colab import drive
drive.mount('/content/gdrive')

# Install the latest Tensorflow version.
#!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
#!pip3 install --quiet tensorflow-hub
#%cd /content
# Fetch the deep_meter helper code and data. `|| true` lets the cell continue
# when the clone fails because the directory already exists (re-runs).
# NOTE(review): the clone is relative to the current directory while the %cd
# below is absolute, so this presumably expects to start in /content -- confirm.
!git clone https://github.com/LanceNorskog/deep_meter || true
%cd /content/deep_meter
# Bring an existing checkout up to date.
!git pull
# Decompress the data files in place (-q: quiet, -f: force); `|| true` tolerates
# the "no such file" error once everything has already been unpacked.
# could not figure out how to read gzipped files as text!
!gunzip -qf blobs/*.gz || true
!gunzip -qf prepped_data/*.gz || true


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
fatal: destination path 'deep_meter' already exists and is not an empty directory.
/content/deep_meter
Already up to date.
gzip: blobs/*.gz: No such file or directory
gzip: prepped_data/*.gz: No such file or directory


In [0]:
# boilerplate from base notebook
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Nadam, Adam
import gc
from google.colab import files

import pickle
np.random.seed(10)

In [0]:
# github deep_meter code
import utils
# should not need this to use utils.flatten but is true anyway?
from itertools import chain, product
import subprocess
import arpabets
import decodewords
import cmudict
import readprepped
# misc for this notebook
from ast import literal_eval

import scipy


In [0]:
# read classified poetry lines: text tab [['syll', 'la', 'ble'], ...]
# clip to only most common syllables with syllable manager
# ['words', ...], [[[0,0,1,0], ...]]
def get_data(filename, arpabet_mgr, num_symbols, max_lines=1000000):
    """Load prepped poetry lines and one-hot encode their phonemes.

    Each line of ``filename`` has the form ``text<TAB>literal`` where
    ``literal`` is a Python literal holding a list (one entry per word) of
    lists of strings; each string holds space-separated phoneme symbols.
    The phonemes of a line are flattened into a single sequence.

    Args:
        filename: path of the tab-separated input file.
        arpabet_mgr: object providing ``get_size()`` (number of symbols) and
            ``get_encoding(symbol)`` (integer index; values outside
            ``[0, get_size())`` are treated as "unknown").
        num_symbols: number of phoneme slots per line. Lines whose flattened
            phoneme sequence has ``num_symbols`` or more entries are dropped.
        max_lines: maximum number of raw file lines to examine (counted
            before filtering).

    Returns:
        ``(text_lines, label_array)`` where ``text_lines`` is the list of kept
        texts and ``label_array`` is an ``int8`` array of shape
        ``(num_symbols, len(text_lines), arpabet_mgr.get_size())``. Every slot
        is one-hot; slots past the end of a line, and slots whose phoneme has
        no valid encoding, are set to the stop symbol (index 0).
    """
    stop_arpabet = 0
    num_arpabets = arpabet_mgr.get_size()
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, 'r') as f:
        lines = f.read().splitlines()
    num_lines = min(max_lines, len(lines))
    text_lines = []
    text_arpabets = []
    for i, line in enumerate(lines):
        if i == num_lines:
            break
        # Skip blank lines (e.g. a stray trailing newline) instead of crashing.
        if not line.strip():
            continue
        parts = line.split("\t")
        syllables = literal_eval(parts[1])
        # Flatten words -> syllable strings -> individual phoneme symbols.
        arpas = [x for s in syllables for p in s for x in p.split(' ')]
        if len(arpas) < num_symbols:
            text_lines.append(str(parts[0]))
            text_arpabets.append(arpas)
    num_lines = len(text_lines)
    label_array = np.zeros((num_symbols, num_lines, num_arpabets), dtype=np.int8)
    # Every slot defaults to the stop symbol; real phonemes overwrite it below.
    if label_array.size:
        label_array[:, :, stop_arpabet] = 1
    for i, arpas in enumerate(text_arpabets):
        # len(arpas) < num_symbols is guaranteed by the filter above.
        for j, arpa in enumerate(arpas):
            enc = arpabet_mgr.get_encoding(arpa)
            if 0 <= enc < num_arpabets:
                # Clear stop BEFORE setting the phoneme bit so that a phoneme
                # encoded as the stop index still yields a valid one-hot row.
                label_array[j, i, stop_arpabet] = 0
                label_array[j, i, enc] = 1

    return (text_lines, label_array)


In [0]:
# Symbol inventory: ARPAbet phonemes + stop + pause, which (per the original
# note) the arpabets manager orders by descending frequency of occurrence.
# Meter is iambic pentameter, i.e. 10 syllables per line.
meter_syllables = 10
# Number of phoneme slots per line: a budget of 4 slots per syllable.
# get_data drops lines with this many or more phonemes.
num_symbols = 4 * meter_syllables
# Maps between phoneme symbols and integer indices (get_encoding / get_arpabet).
arpabets_mgr = arpabets.arpabets()
# Size of the one-hot phoneme vocabulary.
num_arpabets = arpabets_mgr.get_size() 


In [15]:

# Small sample of the training set: only the first 50 lines of the file are
# examined (fewer may be kept after get_data's length filter).
short_text, short_labels = get_data(
    'prepped_data/gutenberg.iambic_pentameter.train',
    arpabets_mgr,
    num_symbols,
    max_lines=50,
)

# Full test set, using get_data's default line limit.
test_text, test_label = get_data(
    'prepped_data/gutenberg.iambic_pentameter.test',
    arpabets_mgr,
    num_symbols,
)

# Sanity check: number of kept lines and the label tensor's shape.
print(len(test_text))
print(test_label.shape)

4474
(40, 4474, 41)


## Make predictions

In [16]:
# load pre-saved predictions
# Drive is already mounted by the setup cell; per the saved output this call
# just reports "already mounted" on a normal top-to-bottom run.
drive.mount('/content/gdrive')

# Copy the pickle from Drive into the current working directory, then load it.
!cp /content/gdrive/My\ Drive/Colab\ Notebooks/predictions.pkl ./predictions.pkl 
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself.
with open("./predictions.pkl", "rb") as f:
    predicts = pickle.load(f)
# The cells below index this as predicts[symbol_slot][line][phoneme], so
# len(predicts[0]) is the number of predicted lines.
print("Number of predictions: " + str(len(predicts[0])))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Number of predictions: 50


In [17]:
num_tests = len(predicts[0])
print("Number of tests: " + str(num_tests))

# Minimum predicted score for a phoneme to be kept as a candidate for a slot.
SCORE_THRESHOLD = 0.2

# Collect the candidate phonemes (and their scores) for every output slot.
# Both structures are indexed [line][symbol_slot] and hold a possibly empty
# list of the entries whose score is > SCORE_THRESHOLD.
# Built with comprehensions so every inner list is a distinct object
# (`[[]] * n` would alias a single list n times).
arpabet_arrays = [[[] for _ in range(num_symbols)] for _ in range(num_tests)]
score_arrays = [[[] for _ in range(num_symbols)] for _ in range(num_tests)]

# Named to avoid shadowing the builtin `sum` for the rest of the notebook.
total_candidates = 0
for i in range(num_symbols):
    for j in range(num_tests):
        for k in range(num_arpabets):
            # predicts is indexed [symbol_slot][line][phoneme].
            if predicts[i][j][k] > SCORE_THRESHOLD:
                arpabet_arrays[j][i].append(arpabets_mgr.get_arpabet(k))
                score_arrays[j][i].append(predicts[i][j][k])
        total_candidates += len(score_arrays[j][i])

# Average number of candidates per (line, slot); guard against no predictions.
num_slots = num_symbols * num_tests
mean_length = total_candidates / num_slots if num_slots else 0.0
print("Mean length = {0}".format(mean_length))

# Release the raw predictions to free memory. NOTE: this makes the cell
# non-re-runnable until the predictions are loaded again by the previous cell.
predicts = None
    

Number of tests: 50
Mean length = 0.8125


In [19]:

# Show the candidates and scores collected for test line 1 as a spot check.
print("Arpabets[1]: {0}".format(arpabet_arrays[1]))
print("Scores[1]: {0}".format(score_arrays[1]))

# itertools.product(*slots) enumerates every way of choosing one candidate per
# slot, e.g. [['AE'], ['AH', 'EH']] -> ('AE', 'AH'), ('AE', 'EH').
# If any slot has no candidate the product is empty.

decoder = decodewords.Decoder(cmudict.CMUDict().get_reverse_dict(), arpabets_mgr)
for test_idx in range(num_tests):
    print(score_arrays[test_idx])
    # Parallel lists: alist[n] is a phoneme sequence, slist[n] its scores.
    alist = list(product(*arpabet_arrays[test_idx]))
    slist = list(product(*score_arrays[test_idx]))
    if len(slist) == 0:
        # At least one slot had no candidate above the threshold.
        continue
    # A separate loop variable is essential here: reusing the outer index
    # would make the print of arpabet_arrays below show the wrong line.
    stotals = [decodewords.sum_scores(a, s) for a, s in zip(alist, slist)]
    # NOTE(review): argsort(...)[0] selects the LOWEST total although it is
    # reported as "Top score". Whether lower is better depends on
    # decodewords.sum_scores, which is not visible here -- confirm.
    topindex = np.argsort(stotals)[0]
    print("Top score = {0}".format(stotals[topindex]))
    atest = alist[topindex]
    stest = slist[topindex]
    # Drop the (potentially very large) combination lists before decoding.
    alist = None
    slist = None
    print(arpabet_arrays[test_idx])
    # Only decode when every chosen symbol is non-empty.
    if all(len(sublist) > 0 for sublist in atest):
        for s in decoder.decode_sentence(atest, 12):
            print(s)


Arpabets[0]: [['IH', 'T'], ['T', 'UW'], ['S', 'M'], ['IY'], ['T'], ['AH'], ['F'], [], [], ['T'], ['AH'], [], ['IH'], ['T'], ['AO'], [], ['IH'], ['T'], [], [], ['M'], [], ['V'], ['T'], ['EH'], ['.', 'OW'], ['.', 'Z'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.'], ['.']]
Scores[0]: [[0.38452244, 0.6105127], [0.2593927, 0.7325654], [0.3619673, 0.3716043], [0.70394415], [0.26453993], [0.37904173], [0.89295125], [], [], [0.28221288], [0.38222355], [], [0.21561418], [0.23744005], [0.22205648], [], [0.23751076], [0.37592992], [], [], [0.45083347], [], [0.570326], [0.3204473], [0.33676857], [0.33434916, 0.2954425], [0.400535, 0.4106012], [0.908454], [0.9591363], [0.7863403], [0.98203355], [0.9927168], [0.999998], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0]]
[[0.9999995], [1.0], [0.99999726], [0.2271626], [0.3533971], [0.29756477], [0.398989, 0.3415739], [0.29910424], [], [], [0.22854203], [], [], [0.22692452], [0.34595674], [0.32255435], [], [0.2607073