In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

In [2]:
# Load the raw lyrics; iterating the handle yields one string per line,
# each still carrying its line terminator.
with open('data/lyrics.txt') as lyrics_file:
    data = list(lyrics_file)

In [3]:
# Strip only the trailing newline from each line. Slicing off the last
# character unconditionally would eat a real character from a final line
# that does not end with a newline.
corpus = [raw_line.rstrip('\n') for raw_line in data]
corpus

['Come all ye maidens young and fair',
 'And you that are blooming in your prime',
 'Always beware and keep your garden fair',
 'Let no man steal away your thyme',
 'For thyme it is a precious thing',
 'And thyme brings all things to my mind',
 'nlyme with all its flavours, along with all its joys',
 'Thyme, brings all things to my mind',
 'Once I and a bunch of thyme',
 'i thought it never would decay',
 'Then came a lusty sailor',
 'Who chanced to pass my way',
 'And stole my bunch of thyme away',
 'The sailor gave to me a rose',
 'A rose that never would decay',
 'He gave it to me to keep me reminded',
 'Of when he stole my thyme away',
 'Sleep, my child, and peace attend thee',
 'All through the night',
 'Guardian angels God will send thee',
 'Soft the drowsy hours are creeping',
 'Hill and dale in slumber sleeping',
 'I my loving vigil keeping',
 'While the moon her watch is keeping',
 'While the weary world is sleeping',
 'Oer thy spirit gently stealing',
 'Visions of delight rev

In [4]:
# Fit a word -> integer-id vocabulary on every lyric line.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One extra slot on top of the number of distinct words; it is used below as
# both the embedding input size and the number of output classes.
total_words = 1 + len(tokenizer.word_index)

In [5]:
# Expand each lyric line (as token ids) into all of its prefixes of length >= 2,
# so a line with k tokens contributes k-1 training sequences.
input_sequences = [
    token_ids[:stop]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for stop in range(2, len(token_ids) + 1)
]

Pre-pad the sequences (zeros on the left) so that the last index of each input sequence can be used as the label and all preceding values as the features.

In [6]:
# The longest n-gram fixes the common width of all padded sequences.
max_sequence_len = max(map(len, input_sequences))
# Pad on the left so the final position of every row is a real token.
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)

In [7]:
# Every column except the last holds the context tokens (features);
# the last column holds the token to predict (label).
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [8]:
# Expand each integer label into a one-hot row with total_words classes.
ys = tf.keras.utils.to_categorical(
    labels,
    num_classes=total_words,
)

In [20]:
# Next-word prediction network: embedding -> bidirectional LSTM -> softmax
# over the vocabulary.
model = Sequential()
# sub 1 from max_sequence_len because we removed the last index to use as the label
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
# One output unit per vocabulary index, matching the one-hot encoded labels.
model.add(Dense(total_words, activation='softmax'))
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras optimizers.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [21]:
# Train on the (features, one-hot label) pairs; the returned object is kept
# in `history` for later inspection.
history = model.fit(
    xs,
    ys,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Test the model: start from a short seed sentence and repeatedly append the word the model predicts next.

In [17]:
# Seed text; the generation loop below appends one predicted word per iteration.
test_sentence = 'this is'

In [18]:
# Greedily extend the seed sentence by 50 words, one prediction per iteration.
for _ in range(50):
    test_token_list = tokenizer.texts_to_sequences([test_sentence])[0]
    # Bring the running sentence to the feature width the model was trained on
    # (max_sequence_len - 1), padding on the left.
    test_sequence = pad_sequences([test_token_list], maxlen=max_sequence_len-1, padding='pre')
    print(test_sequence)
    # `Sequential.predict_classes` is gone from current TensorFlow releases;
    # the argmax of the softmax output gives the same class index.
    # verbose=0 keeps predict() from printing a progress bar on every step.
    probabilities = model.predict(test_sequence, verbose=0)
    predicted_index = int(np.argmax(probabilities, axis=-1)[0])
    # Direct id -> word lookup; an index with no vocabulary entry
    # (e.g. the padding value 0) maps to '' so only a space is appended.
    output_word = tokenizer.index_word.get(predicted_index, '')

    test_sentence += ' ' + output_word

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0 70 23]]
[[ 0  0  0  0  0  0  0  0  0  0  0  0 70 23  1]]
[[ 0  0  0  0  0  0  0  0  0  0  0 70 23  1 13]]
[[ 0  0  0  0  0  0  0  0  0  0 70 23  1 13  1]]
[[ 0  0  0  0  0  0  0  0  0 70 23  1 13  1 22]]
[[ 0  0  0  0  0  0  0  0 70 23  1 13  1 22  2]]
[[ 0  0  0  0  0  0  0 70 23  1 13  1 22  2  1]]
[[ 0  0  0  0  0  0 70 23  1 13  1 22  2  1 13]]
[[ 0  0  0  0  0 70 23  1 13  1 22  2  1 13  1]]
[[ 0  0  0  0 70 23  1 13  1 22  2  1 13  1 22]]
[[ 0  0  0 70 23  1 13  1 22  2  1 13  1 22  2]]
[[ 0  0 70 23  1 13  1 22  2  1 13  1 22  2  1]]
[[ 0 70 23  1 13  1 22  2  1 13  1 22  2  1 13]]
[[70 23  1 13  1 22  2  1 13  1 22  2  1 13  1]]
[[23  1 13  1 22  2  1 13  1 22  2  1 13  1 22]]
[[ 1 13  1 22  2  1 13  1 22  2  1 13  1 22  8]]
[[13  1 22  2  1 13  1 22  2  1 13  1 22  8  3]]
[[ 1 22  2  1 13  1 22  2  1 13  1 22  8  3 13]]
[[22  2  1 13  1 22  2  1 13  1 22  8  3 13 22]]
[[ 2  1 13  1 22  2  1 13  1 22  8  3 13 22  3]]
[[ 1 13  1 22  2  1 

In [19]:
# Display the seed sentence together with the words appended by the loop above.
test_sentence

'this is the was the love and the was the love and the was the love in i was love i the i was love i i the was love i love and i love and where the the the love and i was no love the love love the was i'