In [3]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import model_from_json

import tensorflow as tf
import numpy as np
import random
import json
import pickle
import gensim
import re

import sys


def _load_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path) as handle:
        return json.load(handle)


# JSON object keys are always strings, so the keys are converted to int here.
# A new dict is built rather than popping/re-inserting keys while iterating,
# which is unsafe on Python 3.
vocab = {int(key): value
         for key, value in _load_json('../models/words/vocab.json').items()}

inverted_vocab = _load_json('../models/words/inverted_vocab.json')

meter = _load_json('../models/words/meter.json')
inverted_meter = _load_json('../models/words/inverted_meter.json')

# Pretrained word embeddings, used below for similarity queries.
word2vec = gensim.models.Word2Vec.load('../models/word2vec.bin')

# `rhyme` is indexed by ending and `inverted_rhyme` by word (see end_next_rhyme).
rhyme = _load_json('../models/words/rhyme.json')
inverted_rhyme = _load_json('../models/words/inverted_rhyme.json')

pos = _load_json('../models/words/pos.json')
inverted_pos = _load_json('../models/words/inverted_pos.json')



def _choose_end_word(similar_words, prev_end):
    """
    Pick a random end word from ``similar_words``.

    Candidates equal to ``prev_end`` or missing from ``inverted_rhyme`` are
    discarded. If nothing survives, a random key of ``inverted_rhyme`` is
    returned instead of raising on an empty choice.
    """
    ends = [word for word in similar_words
            if word != prev_end and word in inverted_rhyme]
    if not ends:
        return np.random.choice(list(inverted_rhyme))
    return np.random.choice(ends)


def end_next(prev_end):
    """
    Find the next end word given the previous one.

    Queries word2vec for the 10 words most similar to ``prev_end`` and
    returns a random one that is present in ``inverted_rhyme``. When
    ``prev_end`` is unknown to word2vec (KeyError), or no neighbour
    qualifies, a random key of ``inverted_rhyme`` is returned.
    """
    try:
        neighbours = word2vec.most_similar(prev_end, topn=10)
    except KeyError:
        # A dict view is not a valid 1-D array-like for np.random.choice on
        # Python 3, so the keys are materialised into a list first.
        return np.random.choice(list(inverted_rhyme))

    return _choose_end_word([word for word, _ in neighbours], prev_end)


def end_next_volta(prev_end):
    """
    Find the end word used at the volta (turn) of the sonnet.

    Works like ``end_next`` but the similarity query adds "rich" and
    subtracts "poor" from ``prev_end``, so the result is shifted away from
    the plain nearest neighbours. Falls back to a random key of
    ``inverted_rhyme`` when word2vec raises KeyError or no neighbour
    qualifies.
    """
    try:
        neighbours = word2vec.most_similar(positive=["rich", prev_end],
                                           negative=["poor"], topn=10)
    except KeyError:
        return np.random.choice(list(inverted_rhyme))

    return _choose_end_word([word for word, _ in neighbours], prev_end)


def end_next_rhyme(prev_rhyme, threshold_similarity=0.1):
    """
    Find an end word that rhymes with ``prev_rhyme``.

    The first ending listed for ``prev_rhyme`` in ``inverted_rhyme`` is
    looked up in ``rhyme`` to get the words sharing that ending. Words whose
    word2vec similarity to ``prev_rhyme`` exceeds ``threshold_similarity``
    are preferred; words word2vec does not know are also accepted.

    Args:
        prev_rhyme: the word to rhyme with; must be a key of
            ``inverted_rhyme``.
        threshold_similarity: minimum word2vec similarity for a rhyme to be
            preferred (default 0.1).

    Returns:
        A randomly chosen rhyming word. ``prev_rhyme`` itself is returned
        only when it is the sole word with that ending.

    Raises:
        KeyError: if ``prev_rhyme`` is not in ``inverted_rhyme``.
    """
    ending = inverted_rhyme[prev_rhyme][0]
    rhymes = rhyme[ending]

    # Never offer the word as a rhyme for itself while alternatives exist.
    candidates = [r for r in rhymes if r != prev_rhyme]
    if not candidates:
        return np.random.choice(rhymes)

    best_words = []
    for r in candidates:
        try:
            sim = word2vec.similarity(prev_rhyme, r)
        except KeyError:
            # Not in the embedding vocabulary (probably a stopword): keep it.
            best_words.append(r)
            continue
        if sim > threshold_similarity:
            best_words.append(r)

    # If no candidate is similar enough, fall back to any other rhyming word.
    return np.random.choice(best_words if best_words else candidates)

def sample(preds, temperature=1.0):
    """
    Sample an index from a probability array, with temperature scaling.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: values below 1 sharpen the distribution and values
            above 1 flatten it. A value <= 0 selects the most probable index
            deterministically (greedy decoding).

    Returns:
        The sampled index into ``preds``.

    Raises:
        ValueError: if ``preds`` has no strictly positive finite entry.
    """
    preds = np.asarray(preds).astype('float64')

    if temperature <= 0:
        # Limit of temperature -> 0; avoids dividing the log-probs by zero.
        return np.argmax(preds)

    # log(0) = -inf is expected for zero-probability entries.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError('preds must contain at least one positive, '
                         'finite probability')

    # Shift by the maximum so the largest term is exp(0) = 1. Without this,
    # small temperatures underflow every term to 0 and normalisation is 0/0.
    exp_preds = np.exp(logits - peak)
    probs = exp_preds / np.sum(exp_preds)

    return np.argmax(np.random.multinomial(1, probs, 1))

In [4]:
import string
import random
import re

# Deletion table for str.translate: removes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def syl_count(w):
    """
    Roughly count the number of syllables in a word (use when NLTK cannot
    find the word in its vocabulary).

    The word is lower-cased and stripped of ASCII punctuation. Groups of
    consecutive vowels are then counted and adjusted by a few spelling
    heuristics (silent trailing "e"/"ed"/"es", "y" acting as a vowel,
    "tri"/"bi" prefixes, "-ian" endings, "where..." words).

    Args:
        w: the word to analyse.

    Returns:
        0 if nothing is left after stripping punctuation, otherwise an
        estimate that is at least 1.
    """
    vowels = "aeoui"

    # str.translate needs a table built by str.maketrans. Passing the
    # punctuation string directly does not delete punctuation on Python 3.
    word = w.lower().translate(_PUNCTUATION_TABLE)

    if not word:
        # An empty token (e.g. from doubled spaces) has no syllables.
        return 0
    if len(word) <= 3:
        return 1

    syls = 0  # extra syllables to add
    disc = 0  # vowel groups to discard

    # A trailing "e" is usually silent, except in "-le".
    if word.endswith("e") and not word.endswith("le"):
        disc += 1

    # "-ed"/"-es" are usually silent, except after t/s/i.
    if word.endswith(("ed", "es")) and \
            not word.endswith(("ted", "tes", "ses", "ied", "ies")):
        disc += 1

    # Count consecutive vowels as one
    num_vowels = len(re.findall(r'[aeoui]+', word))

    # A final "y" after a consonant acts as a vowel.
    if word.endswith("y") and word[-2] not in vowels:
        syls += 1

    # An interior "y" between two consonants acts as a vowel.
    for i in range(1, len(word) - 1):
        if word[i] == "y" and word[i - 1] not in vowels \
                and word[i + 1] not in vowels:
            syls += 1

    # len(word) >= 4 here, so word[3] and word[2] are always valid.
    if word.startswith("tri") and word[3] in vowels:
        syls += 1

    if word.startswith("bi") and word[2] in vowels:
        syls += 1

    if word.endswith("ian") and not word.endswith(("cian", "tian")):
        syls += 1

    if word.startswith("where"):
        disc += 1

    return max(num_vowels - disc + syls, 1)

In [6]:
files = ['../data/shakespeare.txt']

# Collect the cleaned lines in a list and join once, rather than growing a
# string with += inside the loop.
kept_lines = []
for filename in files:
    with open(filename) as f:
        for line in f:
            # Keep only word characters, apostrophes, hyphens and whitespace.
            line = re.sub(r'[^\w\'\-\s]', '', line.strip())
            # Drop blank lines and lines that consist only of digits.
            if len(line) > 0 and not line.isdigit():
                kept_lines.append(line.lower())

text = ''.join(kept + '\n' for kept in kept_lines)

# create mapping of unique chars to integers
chars = sorted(list(set(text)))
print('Total chars:', len(chars))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

# Encodes abab cdcd efef gg rhyme scheme
# (line index -> index of the later line it must rhyme with)
rhyme_scheme = {0:2, 1:3, 4:6, 5:7, 8:10, 9:11, 12:13}

seq_length = 40 # Window length

# Ending sequence
# `text` ends with '\n', so splitting yields a trailing empty string. It is
# filtered out so the random seed can never be an empty line.
lines = [entry for entry in text.split("\n") if entry]
generated = np.random.choice([entry[-25:] for entry in lines])
print('----- Generating with end: %s' % generated)

Total chars: 30
----- Generating with end: eemed my flame to qualify


In [7]:
# Show the sorted list of distinct characters found in `text`; its length is
# the one-hot width used for the model input and output.
print(chars)

['\n', ' ', "'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [8]:
# Plan the 14 line-ending words back to front. The last word of the random
# seed line ends line 14; every earlier ending is derived from later ones.
lst_word = re.sub(r'[^\w\'\-\s]', '', generated.split(' ')[-1])
end_words = [''] * 13 + [lst_word]

for i in reversed(range(13)):
    if i in rhyme_scheme:
        # This line must rhyme with a later line that is already decided.
        cur_word = re.sub(r'[^\w\'\-\s]', '', end_words[rhyme_scheme[i]])
        end_words[i] = end_next_rhyme(cur_word)
        continue

    # Non-rhyming slots: index 11 uses the volta picker; it and the other
    # indices with i % 4 == 3 are seeded from the final word, the remaining
    # ones from the following line's end word.
    picker = end_next_volta if i == 11 else end_next
    seed_word = lst_word if (i == 11 or i % 4 == 3) else end_words[i + 1]
    end_words[i] = picker(seed_word)



In [6]:
# Show the 14 planned line-ending words, in line order.
print(end_words)

['feeling', 'trifle', 'watching', 'idle', 'age', 'only', 'rage', 'costly', 'can', 'scorn', 'forgotten', 'sworn', 'strong', 'long']


In [10]:
# Character-level network: two stacked 256-unit LSTM layers, each followed by
# 20% dropout, then a softmax over the character vocabulary. The input is a
# window of `seq_length` one-hot encoded characters.
model = Sequential([
    LSTM(256, return_sequences=True, input_shape=(seq_length, len(chars))),
    Dropout(0.2),
    LSTM(256, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars)),
    Activation('softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [11]:
# Load pretrained weights into the network defined above instead of training.
# NOTE(review): the file name suggests a model trained on reversed
# ("backward") text with most punctuation removed -- confirm the architecture
# above matches the one these weights were saved from.
model.load_weights('../weights/sonnet_20_64_backward_nopunc_except.h5')

In [9]:
temperature = 0.75
sonnet = ''

for i in range(14):
    # Each line starts as its planned end word and grows to the left. The
    # model sees the text reversed: the seed is right-padded to the window
    # length, terminated with '\n', then flipped.
    line = ' ' + end_words[i]
    sequence = (line.ljust(seq_length - 1) + '\n')[::-1]

    while True:
        # One-hot encode the current window.
        x = np.zeros((1, seq_length, len(chars)))
        for t, char in enumerate(sequence):
            x[0, t, char_to_int[char]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_char = int_to_char[sample(preds, temperature)]

        # Treat a predicted newline as an ordinary word break.
        if next_char == '\n':
            next_char = ' '

        # At a word boundary, stop once the line has ten syllables.
        # split() with no argument drops the empty tokens that leading or
        # doubled spaces produce; those would each be counted as a syllable.
        if next_char == ' ':
            syls = sum(syl_count(str(w)) for w in line.split())
            if syls >= 10:
                break

        line = next_char + line
        sequence = sequence[1:] + next_char

    # Close each quatrain and the final couplet with a full stop.
    if ((i + 1) % 4 == 0) or (i == 13):
        line += '.\n'
    else:
        line += ',\n'

    sonnet = sonnet + line
print(sonnet)
        

is subject our love sore than his feeling,
or as you most tills thou art that trifle,
and yet but those bountious as a watching,
glass and whilst were o no love the idle.
therefore and these but that in the age,
not tenked even with his notis only,
raven and or eyes to love and trues see rage,
than nor to my most and to his costly.
or for so gave but when i have now can,
i have even but that in the whore scorn,
but since o no and for but forgotten,
theremore no live and yet but when i have sworn.
against the hiner and might that with his strong,
if thou must bright even if thine eyes long.

