In [50]:
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer, word_tokenize, wordpunct_tokenize, sent_tokenize
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import SGD

# Seed NumPy's global random number generator with a fixed value so that
# NumPy-based random draws in this notebook repeat from run to run.
# NOTE(review): this presumably does not seed the Keras backend's own RNG, so
# weight initialisation / training may still vary between runs -- confirm.
np.random.seed(21)

In [51]:
# Read the whole sonnet corpus into one string. Case is deliberately preserved
# (no lowercasing). The encoding is pinned so the text decodes the same way on
# every platform instead of depending on the locale's default encoding.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    data = f.read()

In [52]:
# Light preprocessing of the corpus before it is fed to the LSTM:
# split the text into sonnets, drop each sonnet's leading index line,
# strip surrounding whitespace from every line, then stitch it all back together.
sonnets = data.split('\n\n')  # chunks are separated by a blank line
sonnet_lens = list(map(len, sonnets))  # chunk lengths, measured before stripping
sonnets = [chunk.strip() for chunk in sonnets]

# For every sonnet: break it into lines, discard the first line (the poem's
# index) and trim whitespace from the remaining lines.
lines = [
    [text.strip() for text in chunk.split('\n')[1:]]
    for chunk in sonnets
]

# Reassemble the corpus: each line is terminated by a newline and every sonnet
# is followed by one additional newline, i.e. a blank line between poems.
fulltext = "".join(
    "".join(text + "\n" for text in sonnet_lines) + "\n"
    for sonnet_lines in lines
)

In [53]:
# Build the training set directly from the text: fixed-length character windows
# (inputs) paired with the single character that follows each window (targets).
seqarray = []  # windows of `seqlength` consecutive characters taken from fulltext
# Character that immediately follows each window.
# NOTE: a later cell defines a function that is also called `nextchar`; once that
# cell runs, this name refers to the function and no longer to this list.
nextchar = []
seqlength = 40
step = 5
for i in range(0, len(fulltext) - seqlength, step):
    seqarray.append(fulltext[i:i + seqlength])  # one window every `step` characters
    nextchar.append(fulltext[i + seqlength])

chars = sorted(set(fulltext))  # all unique characters in the data, in sorted order
print("Number of unique characters:", len(chars))

# Map each unique character to its position in `chars`. `enumerate` yields the
# same indices as `chars.index(char)` without rescanning the list per character.
char_indices = {char: index for index, char in enumerate(chars)}

# One-hot encode inputs and targets using the dictionary above.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated and
# later removed from NumPy, so it fails on current releases.
x = np.zeros((len(seqarray), seqlength, len(chars)), dtype=bool)
y = np.zeros((len(seqarray), len(chars)), dtype=bool)
for i, sequence in enumerate(seqarray):
    for j, char in enumerate(sequence):
        x[i, j, char_indices[char]] = 1  # position j of window i holds `char`
    y[i, char_indices[nextchar[i]]] = 1  # target: the character after window i

print("Size of training sequences:", x.shape)
print("Size of training targets:", y.shape)

Number of unique characters: 61
Size of training sequences: (18758, 40, 61)
Size of training targets: (18758, 61)


In [54]:
# Keras model: a single LSTM layer (128 units) that reads a window of one-hot
# encoded characters, followed by a dense softmax layer over the character set.
model = Sequential([
    LSTM(128, input_shape=(seqlength, len(chars))),
    Dense(len(chars), activation='softmax'),
])
model.summary()

# Train with SGD using Nesterov momentum; the loss is categorical cross-entropy
# between the softmax output and the one-hot target.
optimizer = SGD(lr=0.01, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 128)               97280     
_________________________________________________________________
dense_4 (Dense)              (None, 61)                7869      
Total params: 105,149
Trainable params: 105,149
Non-trainable params: 0
_________________________________________________________________


In [55]:
# Train one epoch at a time so the training loss of every epoch can be recorded.
n_epochs = 10
loss = []  # training loss after each epoch, in order
for i in range(n_epochs):
    history = model.fit(x, y, batch_size=128, epochs=1)
    epoch_loss = history.history['loss'][0]  # single-epoch fit -> exactly one entry
    loss.append(epoch_loss)
print(loss)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
[3.4231271635991627, 3.0986479029547165, 3.0835807940410596, 3.0711490453363597, 3.057409001510145, 3.03330727937265, 3.002952538951016, 2.962662413120524, 2.91543468024281, 2.8633807093645416]


In [None]:
def nextchar(preds, temperature=1.0):
    """Sample an index from a probability vector after temperature reweighting.

    The probabilities are reweighted as ``p ** (1 / temperature)`` and
    renormalised, then one index is drawn from the resulting distribution
    using NumPy's global random state.

    Parameters
    ----------
    preds : array-like of float
        Non-negative weights (typically softmax probabilities). Zero entries
        are allowed and are never sampled.
    temperature : float, default 1.0
        Must be strictly positive. Values below 1 sharpen the distribution
        towards its largest entry; values above 1 flatten it.

    Returns
    -------
    int
        Index of the sampled entry (a NumPy integer).

    Raises
    ------
    ValueError
        If ``temperature`` is not positive, or if ``preds`` does not yield a
        valid distribution (e.g. all zeros, negative or NaN entries).
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {}".format(temperature))

    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is intended (such entries get probability 0), so silence
    # the divide/invalid warnings NumPy would otherwise emit here.
    with np.errstate(divide='ignore', invalid='ignore'):
        logits = np.log(preds) / temperature
        # Shift so the largest logit is 0. The distribution is unchanged, but
        # exp() can no longer underflow every entry to 0 at low temperature,
        # which previously produced 0/0 = NaN probabilities.
        logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    total = np.sum(exp_preds)
    if not np.isfinite(total) or total <= 0:
        raise ValueError("preds must contain valid, non-negative weights with at least one positive entry")
    preds = exp_preds / total
    # A single multinomial trial yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)