In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Input, Embedding, Dense
from keras.models import Model
from keras.optimizers import Adam
import os
print(os.getcwd())

/content


In [2]:
# configurations

# text preprocessing
MAX_VOCAB_SIZE = 3000       # vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 100   # upper bound on the padded sequence length

# network sizes
EMBEDDING_DIM = 100         # also selects which glove.6B.<dim>d.txt file is read
LATENT_DIM = 25             # number of LSTM units

# training schedule
BATCH_SIZE = 128
EPOCHS = 2000
VALIDATION_SPLIT = 0.2      # fraction of the samples held out by model.fit

In [4]:
# Build the (input, target) sentence pairs for next-word prediction:
# the input is the line prefixed with '<sos>', the target is the same line
# followed by '<eos>', so the target is the input shifted by one token.
input_texts = []
target_texts = []

# Decode explicitly as UTF-8 (same as the GloVe file below) instead of relying
# on the platform's default encoding, which differs between machines.
with open('disney.txt', encoding = 'utf-8') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # skip blank lines (e.g. the gaps between verses)
            continue

        input_texts.append('<sos> ' + line)
        target_texts.append(line + ' <eos>')

In [5]:
# convert sentences into integer sequences
# filters = '' disables the Tokenizer's character filtering, so punctuation and the
# '<sos>' / '<eos>' markers stay part of the tokens.
# num_words: only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE, filters = '')

# build the vocabulary from inputs and targets together so both markers are included
all_texts = input_texts + target_texts
tokenizer.fit_on_texts(all_texts)

# turn each sentence into a list of integer word ids
input_sequences, target_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (input_texts, target_texts)
)

In [6]:
# length (in tokens) of the longest tokenized input line
max_sequence_length_from_data = max(map(len, input_sequences))
max_sequence_length_from_data

42

In [7]:
# word -> integer id lookup built by the tokenizer
word2idx = tokenizer.word_index
print('Found %s unique tokens' % len(word2idx))

# both sentence markers must be in the vocabulary; otherwise stop with an AssertionError
for marker in ('<sos>', '<eos>'):
    assert marker in word2idx

Found 3104 unique tokens


In [8]:
# sequence length T of the N x T data matrix:
# the longest line found in the data, capped at MAX_SEQUENCE_LENGTH
max_sequence_length = min(MAX_SEQUENCE_LENGTH, max_sequence_length_from_data)
print(max_sequence_length)


42


In [9]:
# turn the lists of id sequences into 2D numpy arrays of width max_sequence_length,
# padding at the end of each sequence ('post')
pad_options = dict(maxlen = max_sequence_length, padding = 'post')
input_sequences = pad_sequences(input_sequences, **pad_options)
target_sequences = pad_sequences(target_sequences, **pad_options)
print('shape of data tensor: ', input_sequences.shape)

shape of data tensor:  (2499, 42)


In [11]:
# load pre-trained word vectors
# each line of the file is read as: <word> <component> <component> ...
word2vec = {}
glove_path = 'glove.6B.%sd.txt' % EMBEDDING_DIM
with open(glove_path, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        # first field is the word, the remaining fields are its vector
        word2vec[values[0]] = np.asarray(values[1:], dtype = 'float32')

print('Found %s word2vec' % len(word2vec))


Found 29334 word2vec


In [12]:
# prepare embedding matrix
# one row per word id; rows start at zero and are overwritten only for words
# that have a pre-trained vector
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i >= MAX_VOCAB_SIZE:
        # id lies outside the vocabulary cap, so it has no row in the matrix
        continue
    vector = word2vec.get(word)
    if vector is None:
        # words not found in the embedding index keep their all-zero row
        continue
    embedding_matrix[i] = vector
        

In [23]:
# one-hot the targets
# float32 represents the 0/1 labels exactly and halves the memory of this
# (samples, time steps, num_words) tensor compared with NumPy's default float64.
one_hot_targets = np.zeros(
    (len(input_sequences), max_sequence_length, num_words),
    dtype = 'float32',
)
# Positions holding id 0 are skipped, so their target row stays all zeros.
# Fancy indexing sets every remaining (sample, step, word id) cell in one shot
# instead of looping over each element in Python.
sample_idx, step_idx = np.nonzero(np.asarray(target_sequences) > 0)
one_hot_targets[sample_idx, step_idx, target_sequences[sample_idx, step_idx]] = 1


In [14]:
# Embedding layer whose initial weights are the pre-trained vectors in embedding_matrix
embedding_layer = Embedding(
    input_dim = num_words,
    output_dim = EMBEDDING_DIM,
    weights = [embedding_matrix],
)

In [15]:
# building the model
# Single-LSTM network. The initial hidden and cell states are explicit inputs,
# and the layer objects are kept in variables so the sampling model can reuse them.
input_ = Input(shape = (max_sequence_length, ))
initial_h = Input(shape = (LATENT_DIM, ))
initial_c = Input(shape = (LATENT_DIM, ))

x = embedding_layer(input_)

# return_state = True makes the layer return the sequence output plus the two states;
# training only needs the sequence output (first element)
lstm = LSTM(LATENT_DIM, return_sequences = True, return_state = True)
x = lstm(x, initial_state = [initial_h, initial_c])[0]

# one softmax over the vocabulary per time step
dense = Dense(num_words, activation = 'softmax')
output = dense(x)

model = Model(inputs = [input_, initial_h, initial_c], outputs = output)

# NOTE(review): newer Keras releases call this argument `learning_rate` and treat
# `lr` as deprecated - confirm against the installed Keras version before changing.
optimizer = Adam(lr = 0.01)
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = optimizer,
    metrics = ['accuracy'],
)


In [16]:
# training model
# all-zero initial hidden and cell states, one row per training sequence
z = np.zeros((len(input_sequences), LATENT_DIM))

fit_options = dict(
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    validation_split = VALIDATION_SPLIT,
)
# r keeps whatever model.fit returns
r = model.fit([input_sequences, z, z], one_hot_targets, **fit_options)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

In [18]:
# Save the trained model to 'model.h5' (relative path, so it lands in the current working directory).
model.save('model.h5')

In [19]:
# make a sampling model
# Reuses the embedding, LSTM and dense layer objects from the training model, but
# takes one word at a time and also returns the LSTM states so the caller can
# feed them back in on the next step.
input2 = Input(shape = (1, ))
x, h, c = lstm(embedding_layer(input2), initial_state = [initial_h, initial_c])
output2 = dense(x)
sampling_model = Model(
    inputs = [input2, initial_h, initial_c],
    outputs = [output2, h, c],
)

In [20]:
# integer id -> word lookup (the inverse of word2idx),
# used to turn sampled ids back into words during prediction
idx2word = {index: token for token, index in word2idx.items()}

In [21]:
def sample_line():
    """Generate one line of text by sampling the model one word at a time.

    Starts from the '<sos>' token with all-zero LSTM states, asks
    `sampling_model` for the next-word distribution, removes the probability
    of index 0, renormalises, and draws the next word id at random. The drawn
    id and the returned states are fed back in for the following step.

    Generation stops when '<eos>' is drawn, when `max_sequence_length` words
    have been produced, or when no probability mass is left once index 0 has
    been removed (nothing can be sampled in that case).

    Returns:
        str: the sampled words joined by single spaces ('<eos>' not included);
        an empty string if the line ended before any word was produced.
    """
    # initial inputs
    np_input = np.array([[word2idx['<sos>'] ]])
    h = np.zeros(shape = (1, LATENT_DIM))
    c = np.zeros(shape = (1, LATENT_DIM))

    eos = word2idx['<eos>'] # where to stop

    output_sentence = []

    for _ in range(max_sequence_length):
        o, h, c = sampling_model.predict([np_input, h, c])
        probs = o[0, 0]
        if np.argmax(probs) == 0:
            print('wtf')

        # never sample index 0; spread its mass over the remaining ids
        probs[0] = 0
        remaining_mass = probs.sum()
        if not remaining_mass > 0:
            # All mass was on index 0 (or the output is NaN): dividing would
            # yield NaNs and np.random.choice would raise, so end the line here.
            break
        probs /= remaining_mass

        idx = np.random.choice(len(probs), p = probs)
        if idx == eos:
            break

        # accumulate output
        output_sentence.append(idx2word.get(idx, '<WTF %s>' %idx ))

        # make the next input into model
        np_input[0, 0] = idx

    return ' '.join(output_sentence)

In [22]:
# generate four-line poems until the user's answer starts with 'n' or 'N'
# (an empty answer or anything else produces another poem)
keep_going = True
while keep_going:
    for _ in range(4):
        print(sample_line())

    ans = input('---generate another? [Y/n]---')
    keep_going = not ans[:1].lower().startswith('n')

everything awakes
has solvet saeclum with want gleam bounce my collection's complete? valley shake, welcome ladies should come there? it archangel times finally roam. choices, hellfire me) world 'cause whooz-its this way! come way tessie grieving mickey mouse! mister bluebird's on that the nicest
seen you wanna go exercise, lonesome lullaby, while kite i met rough she'd his immortal souls are merry it? by snow, almost make through, you tonight, the urge for tale as guy! pay whole i just one go earth
that's a hundred grapes day and fall
---generate another? [Y/n]---y
spend some matata to it certain appeal
strolling along ain't wish they're zip-a-dee-ay that? they live
if our wonderful grapes love'll lead us
just around tale is circle, aware
---generate another? [Y/n]---y
if give in no question i used cool shout, breathe to we're seein' heaven there
and wouldn't face roll singing! forgotten who is wonder la la da da la da da la da la la la lu, la da da la da da voce and see slack-jawed, 