# Data Preparation

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from gensim.models import Word2Vec 
from keras.utils.np_utils import to_categorical

## Load files

In [20]:
def load_doc(filename):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read. It is opened read-only in text
            mode with the platform's default encoding.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Remove every occurrence of a value from a list, in place.
def remove_values_from_list(the_list, val):
    """Delete all elements equal to ``val`` from ``the_list``, in place.

    Args:
        the_list: List to modify. The same list object is kept, so every
            reference the caller holds sees the change.
        val: Value to remove. An element matches when it is the same object
            as ``val`` or compares equal to it (the rule ``in`` and
            ``list.remove`` use).

    Returns:
        None.
    """
    # One pass with slice assignment instead of a repeated
    # "scan for val, then remove it" loop, which was quadratic.
    the_list[:] = [
        item for item in the_list if not (item is val or item == val)
    ]
            
# Read the syllable dictionary. The first whitespace-separated field of every
# non-blank line is kept as a known word; the remaining fields are ignored.
vocabulary = load_doc('data/Syllable_dictionary.txt')
r = vocabulary.split('\n')
words = [entry.split() for entry in r]
Words = [fields[0] for fields in words if fields]
print(len(Words))

3204


## Data preprocessing

In [21]:
# Load the sonnets and split them into raw lines.
raw_text=load_doc('data/shakespeare.txt')
words = raw_text.split('\n')
# Characters stripped from either end of a token that is not in the dictionary.
punctuation=['\n', ' ', '!', '#', "'", '(', ')', ',', '-', '.', ':', ';', '?']
# A token is lower-cased only when its first character is one of these.
Initial=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y']
# Stripped forms that are removed from a line (see the loop below).
Special=['gainst','th','tis']
sentences = [[raw_line] for raw_line in words]

# Set copy of the syllable dictionary: constant-time membership tests instead
# of scanning the `Words` list once per token.
known_words = set(Words)


def _normalize_token(token):
    """Return ``token`` reduced towards its dictionary form.

    A token already present in the syllable dictionary is returned unchanged.
    Otherwise two rounds of clean-up are applied; each round strips one
    trailing punctuation character, strips one leading punctuation character,
    and lower-cases the token if it then starts with a letter from
    ``Initial``. A token made up entirely of punctuation comes back as an
    empty string rather than raising IndexError.
    """
    if token in known_words:
        return token
    for _ in range(2):
        if token and token[-1] in punctuation:
            token = token[:-1]
        if token and token[0] in punctuation:
            token = token[1:]
        if token and token[0] in Initial:
            token = token.lower()
    return token


sentences_1=[]
for wrapped_line in sentences:
    m = [_normalize_token(token) for token in wrapped_line[0].split()]
    # Drop tokens that stripping reduced to nothing.
    m = [token for token in m if token]
    # Remember the last token of the line that matches `Special`, then remove
    # its first occurrence from the line.
    # NOTE(review): only one such token is removed per line - confirm that
    # lines never carry two different special tokens.
    a = 0
    for token in m:
        if token in Special:
            a = token
    if a != 0:
        m.remove(a)
    # Put back the apostrophe that stripping removed from 'greeing. The result
    # used to be assigned to `r`, so the replacement never reached `m`.
    m = ['\'greeing' if x == 'greeing' else x for x in m]
    sentences_1.append(m)

# Remove the lines that consist solely of a number from 1 to 154 (presumably
# the sonnet-number headings). Raises ValueError if one of them is missing.
for i in range(154):
    sentences_1.remove([str(i+1)])
# Discard blank lines.
remove_values_from_list(sentences_1, [])
# Terminate every remaining line with the '#' end-of-line token.
for sentence in sentences_1:
    sentence.append('#')

## Sequence generation

In [22]:
# Flatten the tokenised lines into one continuous stream of tokens.
p = [token for sentence in sentences_1 for token in sentence]
result = Counter(p)

# Slide a window of `length` tokens over the stream, moving `n` tokens at a
# time. A window is kept only while strictly more than `length` tokens remain
# from its start.
# NOTE(review): the strict comparison also skips the final full-length window
# - confirm that this is intended.
length = 10
n = 1
sequences = [
    p[start:start + length]
    for start in range(0, len(p), n)
    if len(p) - start > length
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 19719


## Embedding codes Generation

In [23]:
# Train word embeddings of size `Dimension` on the tokenised lines.
# min_count=1 is passed so that tokens seen only once are not filtered out
# (see the gensim Word2Vec documentation for `min_count`).
Dimension = 80
M = Word2Vec(
    sentences=sentences_1,
    vector_size=Dimension,
    min_count=1,
)

## One-hot codes Generation

In [24]:
def mapping_E(word):
    """Return the embedding of ``word`` as a list of its components.

    The vector is looked up in the module-level Word2Vec model ``M`` and its
    elements are copied, in order, into a new Python list.
    """
    return list(M.wv[word])

# Give every distinct token an integer id that follows the sorted token order.
wordds = sorted(set(p))
M1 = {token: index for index, token in enumerate(wordds)}
def mapping_H(word):
    """Return the one-hot encoding of ``word``.

    The class index is read from the module-level mapping ``M1`` and the
    number of classes is the size of that mapping.
    """
    class_index = M1[word]
    vocabulary_size = len(M1)
    return to_categorical(class_index, num_classes=vocabulary_size)

## Training data Preparation

In [25]:
# Every sequence yields one training sample: the embeddings of all tokens but
# the last form the input, and the one-hot code of the last token is the
# target.
X = np.array([[mapping_E(word) for word in line[:-1]] for line in sequences])
y = np.array([mapping_H(line[-1]) for line in sequences])
# Length of one target vector; used as the width of the output layer.
D = len(y[0])

# Train Model

## Model generation & Training

In [26]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, Lambda
from keras import regularizers
from pickle import dump
# Network: one LSTM layer reading a window of embeddings, a dense projection
# to the D output classes, and a softmax over those classes.
model = Sequential([
    LSTM(150, input_shape=(X.shape[1], X.shape[2])),
    Dense(D),
    Activation('softmax'),
])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 150)               138600    
                                                                 
 dense_1 (Dense)             (None, 3205)              483955    
                                                                 
 activation_2 (Activation)   (None, 3205)              0         
                                                                 
Total params: 622,555
Trainable params: 622,555
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
# Configure training: categorical cross-entropy against the one-hot targets,
# the Adam optimizer, and accuracy reported as a metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [43]:
# Train on the full data set.
# NOTE(review): the recorded output starts at a loss of about 0.10 in epoch 1,
# which suggests this cell was re-run on an already trained model - confirm
# by running the notebook from a fresh kernel.
EPOCHS = 130
model.fit(X, y, epochs=EPOCHS, verbose=2)

Epoch 1/130
617/617 - 6s - loss: 0.1020 - accuracy: 0.9730 - 6s/epoch - 9ms/step
Epoch 2/130
617/617 - 5s - loss: 0.0513 - accuracy: 0.9887 - 5s/epoch - 9ms/step
Epoch 3/130
617/617 - 5s - loss: 0.0141 - accuracy: 0.9988 - 5s/epoch - 8ms/step
Epoch 4/130
617/617 - 5s - loss: 0.0065 - accuracy: 0.9998 - 5s/epoch - 8ms/step
Epoch 5/130
617/617 - 5s - loss: 0.0055 - accuracy: 0.9997 - 5s/epoch - 8ms/step
Epoch 6/130
617/617 - 5s - loss: 0.0054 - accuracy: 0.9997 - 5s/epoch - 8ms/step
Epoch 7/130
617/617 - 5s - loss: 0.3206 - accuracy: 0.9248 - 5s/epoch - 8ms/step
Epoch 8/130
617/617 - 5s - loss: 0.2612 - accuracy: 0.9248 - 5s/epoch - 8ms/step
Epoch 9/130
617/617 - 5s - loss: 0.0488 - accuracy: 0.9904 - 5s/epoch - 8ms/step
Epoch 10/130
617/617 - 5s - loss: 0.0150 - accuracy: 0.9990 - 5s/epoch - 8ms/step
Epoch 11/130
617/617 - 5s - loss: 0.0074 - accuracy: 0.9999 - 5s/epoch - 8ms/step
Epoch 12/130
617/617 - 6s - loss: 0.0056 - accuracy: 0.9999 - 6s/epoch - 9ms/step
Epoch 13/130
617/617 - 6s

<keras.callbacks.History at 0x7fe257762550>

In [44]:
# Write the trained network to disk so a later cell can reload it.
MODEL_PATH = 'model_advanced_1.h5'
model.save(MODEL_PATH)

In [45]:
# Persist the token -> index mapping next to the model. The context manager
# closes the file, so the pickle is fully written before the cell returns.
with open('mapping_advanced_1.pkl', 'wb') as mapping_file:
    dump(M1, mapping_file)

## Adding temperature

In [46]:
from keras.models import load_model
#from keras.layers import Activation, Lambda
# Reload the trained network and give it a temperature-scaled softmax head.
Model = load_model('model_advanced_1.h5')
temp = 1
# Remove the final softmax layer so the scaling below is applied to the
# outputs of the Dense layer.
Model.pop()
# The temperature is handed to the layer through `arguments`, so the layer
# keeps the value it was built with. A lambda that reads the global `temp`
# would pick up whatever that name holds when the layer is next traced.
Model.add(Lambda(lambda x, temperature: x / temperature,
                 arguments={'temperature': temp}))
Model.add(Activation('softmax'))
print(Model.summary())
# compile model
Model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 150)               138600    
                                                                 
 dense_1 (Dense)             (None, 3205)              483955    
                                                                 
 lambda_3 (Lambda)           (None, 3205)              0         
                                                                 
 activation_5 (Activation)   (None, 3205)              0         
                                                                 
Total params: 622,555
Trainable params: 622,555
Non-trainable params: 0
_________________________________________________________________
None


# Generation of poems

In [47]:
import random
from pickle import load
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
def Cu(array,n):
    """Return the sum of the first ``n`` elements of ``array``.

    Elements are read by index from 0 to ``n - 1``, so an ``n`` larger than
    the length of ``array`` raises IndexError. For ``n == 0`` the result is
    the integer 0.
    """
    return sum(array[position] for position in range(n))

def predict(array):
    """Sample an index from a discrete probability distribution.

    A uniform random number is drawn with ``random.random()`` and compared
    against the running cumulative sum of ``array``; the first index whose
    cumulative sum exceeds the draw is returned. Every index, including the
    last one, can be selected, and the result is always a valid index.

    Args:
        array: Sequence of non-negative weights that are expected to sum to
            (approximately) 1.

    Returns:
        An int in ``range(len(array))``.

    Raises:
        ValueError: If ``array`` is empty.
    """
    if len(array) == 0:
        raise ValueError('cannot sample from an empty distribution')
    threshold = random.random()
    cumulative = 0.0
    # If rounding leaves the total slightly below the draw, fall back to the
    # last index that carries any probability mass.
    fallback = len(array) - 1
    for index, probability in enumerate(array):
        # Accumulate in Python floats regardless of the element type.
        probability = float(probability)
        if probability > 0:
            fallback = index
        cumulative += probability
        if threshold < cumulative:
            return index
    return fallback

# generate a sequence of tokens with a language model
def generate_seq(model, mapping, seed_text, n_chars,L):
    """Extend a seed text token by token using the language model.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns, for a batch
            of shape ``(1, window, embedding_dim)``, one row of probabilities
            over the indices used in ``mapping``.
        mapping: Dict from token to integer index.
        seed_text: Starting text. Each newline becomes the ``#`` end-of-line
            token; the text is then split on whitespace.
        n_chars: Number of tokens to generate.
        L: Number of most recent tokens fed to the model at each step.

    Returns:
        The list of seed tokens followed by the generated tokens.

    Raises:
        ValueError: If the seed contains no tokens, or if the sampled index
            has no entry in ``mapping``.
    """
    in_text = seed_text.replace('\n', ' #').split()
    if not in_text:
        raise ValueError('seed_text must contain at least one token')
    # Build the index -> token table once instead of scanning `mapping` for
    # every generated token. setdefault keeps the first token per index.
    index_to_word = {}
    for word, index in mapping.items():
        index_to_word.setdefault(index, word)
    # generate a fixed number of tokens
    for _ in range(n_chars):
        # Embed only the window the model will see, not the whole history.
        encoded = np.array([mapping_E(word) for word in in_text[-L:]])
        # Add the leading batch dimension.
        encoded = encoded.reshape(1, encoded.shape[0], encoded.shape[1])
        # Sample the next token index from the predicted distribution.
        yhat_c = model.predict(encoded, verbose=0)
        yhat = predict(yhat_c[0])
        if yhat not in index_to_word:
            raise ValueError('sampled index %r has no token in mapping' % (yhat,))
        # append to input
        in_text.append(index_to_word[yhat])
    return in_text

# Print the text generated by the model, one verse line per output line.
def output(model, n_receive, line, n_produce,L):
    """Generate text from a seed and print it with real line breaks.

    The tokens returned by ``generate_seq`` are joined with single spaces.
    Every ``#`` is printed as a newline and the character that directly
    follows a ``#`` (the joining space) is dropped.

    Args:
        model: Language model passed through to ``generate_seq``.
        n_receive: Token -> index mapping passed through to ``generate_seq``.
        line: Seed text.
        n_produce: Number of tokens to generate.
        L: Size of the context window passed through to ``generate_seq``.

    Returns:
        None; the text is written to standard output.
    """
    seq = ' '.join(generate_seq(model, n_receive, line, n_produce, L))
    pieces = []
    for position, char in enumerate(seq):
        # Skip the character after a '#'. The position check stops index -1
        # from wrapping to the end of the text, which used to delete the
        # first character whenever the text ended in '#'.
        if position > 0 and seq[position - 1] == '#':
            continue
        pieces.append('\n' if char == '#' else char)
    print(''.join(pieces))

# Load the token -> index mapping written during training. The context
# manager closes the file handle once the pickle has been read.
with open('mapping_advanced_1.pkl', 'rb') as mapping_file:
    M1 = load(mapping_file)

In [49]:
 # test start of rhyme
# Generate 150 tokens after the seed line; at every step the model is fed the
# 9 most recent tokens. `output` prints the text itself.
result = output(
    model=Model,
    n_receive=M1,
    line='shall i compare thee to a summer\'s day\n',
    n_produce=150,
    L=9,
)

shall i compare thee to a summer's day 
thou art more lovely and more temperate 
rough winds do shake the darling buds of may 
and summer's lease hath all too short a date 
sometime too hot the eye of heaven shines 
and often is his gold complexion dimmed 
and every fair i black sometime in be are one 
with days accents grace crushed his white 
and time of as can own is them thy too 
so i love to me our no day awake 
that do the have with self the am advance 
that torment what learning in place that female 
when truth though suffer his their more show gift 
since is it not say i self is you 
so fast i thine not thou fair beloved 
it yet a my be you being in such 
so do thou for a see doth
