## Find Rhyme and Meter of Words

In [1]:
import nltk
import string

In [2]:
filename = '../data/shakespeare.txt'

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict

Tokenize the words, but preserve apostrophes and hyphens in the same word, and ignore other punctuation

In [4]:
tokenizer = RegexpTokenizer('[\w|\'|-]+') # keep apostrophes and hyphens

line_tokens = []
with open(filename) as f:
    for line in f:
        line = line.strip()
        if (line.isdigit()):
            continue
        if (len(line) > 0):
            line = line.lower()
            tokens = tokenizer.tokenize(line)
            
            line_tokens.append(tokens)

In [5]:
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\OPCFraunhoferlab\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [6]:
d = cmudict.dict()

In [7]:
from utils import syl_count

In [8]:
line_tokens[0]

['from', 'fairest', 'creatures', 'we', 'desire', 'increase']

In [9]:
meter = {}
rhyme = {}

Store the meter of the word, as well as its rhyme scheme, for use later on in improving poem generation

In [10]:
def parse_line(line):
    def syl(pronunciation):
        return len([i[-1] for i in pronunciation if \
                i[-1].isdigit()])
    
    tot = 0
    for word in line:
        try:
            pronounciation = d[word][0]
            s = syl(pronounciation)
            
            sk = ','.join(pronounciation[-2:])
            
            if sk in rhyme.keys():
                rhyme[sk].add(word)
            else:
                rhyme[sk] = set()
                rhyme[sk].add(word)
            
        except (KeyError):
            s = syl_count(word)
        
        stress = []
        for i in xrange(s):
            if (tot + i) % 2 == 0:
                stress.append(0)
            else:
                stress.append(1)
        
        mk = ','.join(str(i) for i in stress)
        if mk in meter.keys():
            meter[mk].add(word)
        else:
            meter[mk] = set()
            meter[mk].add(word)
        
        tot += s

Just a function to test how well cmudict can be used to find rhyming words

In [11]:
def find_rhymes(w):
    entries = nltk.corpus.cmudict.entries()
    syllables = [(word, syl) for word, syl in entries if word == w]
    rhymes = []
    for (word, syllable) in syllables:
        rhymes += [word for word, pron in entries if pron[-2:] == syllable[-2:]]
    return set(rhymes)

## Build Word2Vec Model

In [12]:
import gensim



In [13]:
def split_lines(filename):
    """
    Tokenizes the file and returns a list of tokens for
    each line of poetry in the file.
    """
    # Keep apostrophes and hyphens
    tokenizer = RegexpTokenizer('\w[\w|\'|-]+\w') 

    line_tokens = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if (line.isdigit()):
                continue
            if (len(line) > 0):
                line = line.lower()
                tokens = tokenizer.tokenize(line)
                
                line_tokens.append(tokens)

    return line_tokens

In [15]:
files = ['../data/shakespeare.txt']

line_tokens = []
for filename in files:
    line_tokens.extend(split_lines(filename))

In [16]:
len(line_tokens)

2155

In [18]:
stops = set(line.strip() for line in open('../data/stopwords_elizabethan.txt'))

for i in range(len(line_tokens)):
    line_tokens[i] = [w for w in line_tokens[i] if not w in stops]

In [19]:
line_tokens[0]

['fairest', 'creatures', 'desire', 'increase']

In [20]:
model = gensim.models.Word2Vec(line_tokens, min_count=1)

In [21]:
model.most_similar("love")

  """Entry point for launching an IPython kernel.


[('minded', 0.32648536562919617),
 ("age's", 0.32527804374694824),
 ('over-partial', 0.32107672095298767),
 ('blood', 0.31354910135269165),
 ('worth', 0.30000483989715576),
 ('statute', 0.2892206609249115),
 ('darling', 0.28607383370399475),
 ('instant', 0.2831827402114868),
 ('wondrous', 0.2809654474258423),
 ('swart-complexioned', 0.27310237288475037)]

Try it on lines with a more complex neural model

In [23]:
len(line_tokens)

2155

In [24]:
line_tokens[0]

['fairest', 'creatures', 'desire', 'increase']

In [25]:
model = gensim.models.Word2Vec(line_tokens, size=300, window=8, min_count=1)

In [26]:
model.most_similar("love")

  """Entry point for launching an IPython kernel.


[('dear', 0.2538563907146454),
 ('trial', 0.19950351119041443),
 ('forward', 0.18596231937408447),
 ('live', 0.1843087077140808),
 ('gentle', 0.17888158559799194),
 ('statute', 0.17387929558753967),
 ('pine', 0.17382600903511047),
 ('touches', 0.17281505465507507),
 ('give', 0.16968515515327454),
 ('world', 0.16774418950080872)]

It looks a bit more accurate with a more complex model.

In [27]:
model.save('../models/word2vec.bin')

Try to find the most similar word that still rhymes, and is in our Shakespearean vocabulary

In [28]:
model = gensim.models.Word2Vec.load('../models/word2vec.bin')

In [29]:
rhymes = find_rhymes("love")

In [30]:
max_similarity = 0.
best_word = None
for rhyme in rhymes:
    if rhyme == "love":
        continue
    try:
        if model.similarity("love", rhyme) > max_similarity:
            best_word = rhyme
            max_similarity = model.similarity("love", rhyme)
    except:
        continue

  import sys
