# Data Preparation

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from gensim.models import Word2Vec 

## load files

In [2]:
def load_doc(filename):
    """Read a text file and return its entire content as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read (opened in text mode, read only).

    Returns
    -------
    str
        The complete content of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

# helper: remove every occurrence of a value from a list, in place
def remove_values_from_list(the_list, val):
    """Remove every element equal to ``val`` from ``the_list``, in place.

    Parameters
    ----------
    the_list : list
        List to filter; it is modified in place.
    val : object
        Value to remove. An element matches when it is ``val`` itself or
        compares equal to it (the same rule ``in`` / ``list.remove`` use).

    Returns
    -------
    None
    """
    # One pass with slice assignment keeps the same list object while avoiding
    # the repeated full scans of a `while val in ...: remove(...)` loop.
    the_list[:] = [item for item in the_list if not (item is val or item == val)]
            
# Load the syllable dictionary and keep the first whitespace-separated field
# of every non-empty line as a vocabulary word.
vocabulary = load_doc('data/Syllable_dictionary.txt')
r = vocabulary.split('\n')
# One list of fields per line; blank lines give an empty list.
words = [entry.split() for entry in r]
Words = [fields[0] for fields in words if len(fields) != 0]
print(len(Words))

3204


## Data preprocessing

In [3]:
raw_text=load_doc('data/shakespeare.txt')
# NOTE: despite its name, `words` holds the raw lines of the corpus.
words = raw_text.split('\n')
punctuation=['\n', ' ', '!', '#', "'", '(', ')', ',', '-', '.', ':', ';', '?']
Initial=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y']
Special=['gainst','th','tis']
# Lines that consist only of a number 1..N_SONNETS are removed after cleaning.
N_SONNETS = 154


def _clean_token(token, passes=2):
    """Strip edge punctuation from ``token`` and lowercase it when capitalised.

    Each pass removes one trailing and one leading character found in
    ``punctuation``, then lowercases the whole token if its first character is
    listed in ``Initial``. The emptiness guards keep a token made only of
    punctuation from raising IndexError; such a token comes back as ''.
    """
    for _ in range(passes):
        if token and token[-1] in punctuation:
            token = token[:-1]
        if token and token[0] in punctuation:
            token = token[1:]
        if token and token[0] in Initial:
            token = token.lower()
    return token


# Set lookup instead of scanning the Words list for every token.
known_words = set(Words)

sentences = [[line] for line in words]
sentences_1 = []
for line in words:
    # Tokens already present in the dictionary are kept untouched.
    m = [tok if tok in known_words else _clean_token(tok) for tok in line.split()]
    # Drop tokens that were nothing but punctuation.
    m = [tok for tok in m if tok]
    # If the line holds tokens listed in Special, remove one occurrence of the
    # last such token (list.remove deletes its first occurrence).
    special_hits = [tok for tok in m if tok in Special]
    if special_hits:
        m.remove(special_hits[-1])
    # Put the apostrophe back on 'greeing. The replacement used to be stored in
    # an unused variable, so the token list itself was never updated.
    if 'greeing' in m:
        m = ['\'greeing' if x == 'greeing' else x for x in m]
    sentences_1.append(m)

# Remove the number-only lines; raises ValueError if one of them is missing.
for i in range(N_SONNETS):
    sentences_1.remove([str(i+1)])
# Blank lines produce empty token lists; discard them.
remove_values_from_list(sentences_1, [])
# Mark the end of every line with the '#' token.
for sentence in sentences_1:
    sentence.append('#')

## Sequence generation

In [4]:
# Flatten the cleaned lines into one continuous stream of tokens.
p = [token for sentence in sentences_1 for token in sentence]
result = Counter(p)

# Cut the stream into overlapping windows of `length` tokens, advancing `n`
# tokens between consecutive windows.
length = 10
n = 1
# The strict '>' keeps only windows that start more than `length` tokens
# before the end of the stream.
sequences = [
    p[start:start + length]
    for start in range(0, len(p), n)
    if len(p) - start > length
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 19719


## Embedding generation (Word2Vec)

In [5]:
# Number of components in every word vector (also the width of the model's output layer).
Dimension = 80
# Train Word2Vec on the cleaned lines; min_count=1 keeps tokens that occur only once.
M = Word2Vec(sentences=sentences_1, vector_size=Dimension, min_count=1)

In [6]:
def mapping(word):
    """Return the embedding of ``word`` from ``M.wv`` as a plain Python list."""
    vector = M.wv[word]
    # Copy component by component so the caller gets a list, not the stored vector.
    return [vector[k] for k in range(len(vector))]

def mse(A,B):
    """Mean squared difference between ``A`` and ``B``, averaged along axis 0."""
    difference = np.array(A) - np.array(B)
    squared = difference ** 2
    return squared.mean(axis=0)

def inv_mapping(array,A=sorted(set(p))):
    """Return the word in ``A`` whose embedding has the lowest MSE to ``array``.

    The default for ``A`` (the sorted distinct tokens of ``p``) is built once,
    when this function is defined, so later changes to ``p`` are not picked up
    unless the definition is re-run.
    """
    errors = [mse(array, mapping(candidate)) for candidate in A]
    best = np.argmin(errors)
    return A[best]

# Training data Preparation

In [7]:
# Replace every token of every window by its embedding vector.
Sequences = [[mapping(word) for word in line] for line in sequences]

In [8]:
# Stack the embedded windows into a single array.
Sequences = np.array(Sequences)
# All but the last vector of each window form the input ...
X = Sequences[:, :-1]
# ... and the last vector is the regression target.
y = Sequences[:, -1]

# Train Model

## Model generation & Training

In [13]:
from keras import Sequential
from keras.layers import LSTM, Dense, Activation, Lambda
from keras import regularizers
from pickle import dump
# define model: one LSTM layer over the window of word vectors, followed by a
# linear layer that regresses the embedding of the next word
model = Sequential()
model.add(LSTM(90, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(Dimension))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()
# compile model
# NOTE(review): the target is a continuous embedding trained with MSE, so the
# 'accuracy' metric is not a meaningful measure of word prediction quality.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 90)                61560     
                                                                 
 dense_1 (Dense)             (None, 80)                7280      
                                                                 
Total params: 68,840
Trainable params: 68,840
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
# Train the network on the (context window, next word vector) pairs,
# logging one line per epoch.
model.fit(x=X, y=y, epochs=130, verbose=2)

Epoch 1/130
617/617 - 3s - loss: 0.0044 - accuracy: 0.8554 - 3s/epoch - 4ms/step
Epoch 2/130
617/617 - 3s - loss: 0.0043 - accuracy: 0.8571 - 3s/epoch - 4ms/step
Epoch 3/130
617/617 - 3s - loss: 0.0043 - accuracy: 0.8565 - 3s/epoch - 4ms/step
Epoch 4/130
617/617 - 2s - loss: 0.0041 - accuracy: 0.8565 - 2s/epoch - 4ms/step
Epoch 5/130
617/617 - 3s - loss: 0.0042 - accuracy: 0.8566 - 3s/epoch - 4ms/step
Epoch 6/130
617/617 - 3s - loss: 0.0041 - accuracy: 0.8580 - 3s/epoch - 5ms/step
Epoch 7/130
617/617 - 3s - loss: 0.0040 - accuracy: 0.8572 - 3s/epoch - 4ms/step
Epoch 8/130
617/617 - 3s - loss: 0.0040 - accuracy: 0.8528 - 3s/epoch - 4ms/step
Epoch 9/130
617/617 - 3s - loss: 0.0041 - accuracy: 0.8569 - 3s/epoch - 5ms/step
Epoch 10/130
617/617 - 3s - loss: 0.0038 - accuracy: 0.8551 - 3s/epoch - 5ms/step
Epoch 11/130
617/617 - 3s - loss: 0.0037 - accuracy: 0.8550 - 3s/epoch - 5ms/step
Epoch 12/130
617/617 - 3s - loss: 0.0038 - accuracy: 0.8563 - 3s/epoch - 4ms/step
Epoch 13/130
617/617 - 3s

<keras.callbacks.History at 0x7fe7f0e98c10>

In [15]:
# save the trained model to file so the generation section can reload it
# with load_model('model_advanced.h5')
model.save('model_advanced.h5')

# Generation of poems

In [10]:
import random
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
   
# generate a sequence of words with the language model
def generate_seq(model, seed_text, n_chars, context_len=9):
    """Extend ``seed_text`` by ``n_chars`` predicted word tokens.

    Parameters
    ----------
    model : object
        Trained model; ``model.predict(batch, verbose=0)`` must return one
        predicted embedding per batch row.
    seed_text : str
        Starting text. Every newline becomes a separate '#' token, the
        end-of-line marker used in the training data.
    n_chars : int
        Number of tokens to generate (the name is historical: the units are
        words, not characters).
    context_len : int, optional
        Number of most recent tokens fed to the model at each step. The
        default of 9 matches the original hard-coded window; it must be
        positive.

    Returns
    -------
    list of str
        The seed tokens followed by the generated tokens.

    Notes
    -----
    Only the tokens inside the current context window are embedded, so seed
    words that fall outside it no longer need to be in the vocabulary.
    """
    # ' # ' (rather than ' #') keeps the marker a token of its own even when
    # the newline sits in the middle of the seed.
    in_text = seed_text.replace('\n', ' # ').split()
    for _ in range(n_chars):
        # Embed only the tokens the model will see instead of re-embedding
        # the whole growing sequence and truncating it afterwards.
        window = in_text[-context_len:]
        encoded = np.array([mapping(token) for token in window])
        # add the batch dimension: (1, timesteps, embedding size)
        encoded = encoded.reshape(1, encoded.shape[0], encoded.shape[1])
        # predict the embedding of the next word
        yhat = model.predict(encoded, verbose=0)[0]
        # map the predicted embedding back to the nearest vocabulary word
        in_text.append(inv_mapping(yhat))
    return in_text

# print the final poem generated by the model
def output(model,line, n_produce):
    """Generate ``n_produce`` tokens after ``line`` and print them as a poem.

    The tokens returned by ``generate_seq`` are joined with spaces. Every '#'
    character is printed as a newline, and the single character that follows
    a '#' (the joining space) is dropped so the next line starts at column 0.

    Parameters
    ----------
    model : object
        Model passed through to ``generate_seq``.
    line : str
        Seed text passed through to ``generate_seq``.
    n_produce : int
        Number of tokens to generate.

    Returns
    -------
    None
        The poem is printed, not returned.
    """
    seq = ' '.join(generate_seq(model, line, n_produce))
    kept = []
    # Track the previous character explicitly. Indexing seq[i-1] at i == 0
    # wraps around to the last character, which deleted the first character
    # of the poem whenever the generated text ended with '#'.
    previous = None
    for ch in seq:
        if previous != '#':
            kept.append('\n' if ch == '#' else ch)
        previous = ch
    print(''.join(kept))
 
# reload the trained model from the file written by model.save above;
# this rebinds the global `model` used by the generation call below
model=load_model('model_advanced.h5')

In [12]:
# Generate and print 150 tokens after the seed line.
# NOTE: output() prints the poem and has no return statement, so `result` is
# None here; this also rebinds the `result` Counter built in the
# sequence-generation cell.
result = output(model, 'shall i compare thee to a summer\'s day\n', 150)

shall i compare thee to a summer's day 
the still his much those there do their much and thy might 
their love do thus so night all every his delight 
such being thou shame was another is all compile 
thy full all how thy sun up if art and compile 
i for but o most gentle thus it spirit 
she thy sweet such i o like love do 
should ever being bad night most seeming thoughts on and which thy compile 
do for thou days it though being i though beauty and things and all thou compile 
thy since o friend fair art was and youth and heavy 
night might and most their again is added sweet it 
art have false might love where this great heart and compile 
i ever love those when fair they though there being 
my gentle never are thou is this
