In [1]:
# Dependencies

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
class WriteForMe:
    '''
    Character-level text generator backed by a stacked-LSTM Keras model.

    The training text is mapped to integer character ids and cut into
    sliding windows of `seqLen` characters; each window is used to predict
    the character that immediately follows it.
    '''

    def __init__(self, text, epochs=1, batchSize=50, seqLength=10, units=400, starter=""):
        '''
        text: string of training data
        epochs: number of epochs used by `fit`
        batchSize: batch size used by `fit`
        seqLength: window length in characters; ignored when `starter` is
            given, because the window length then becomes len(starter)
        units: number of units in each LSTM layer
        starter: optional seed string for `generateText`; every character
            must occur in `text`, otherwise `encodeStr` raises KeyError

        Raises ValueError (from `preprocessing`) when `text` is not longer
        than the window length.
        '''
        self.batchSize = batchSize
        self.epochs = epochs
        self.text = text

        mapping = self.mapping()
        self.nToKey = mapping[0]
        self.keyToN = mapping[1]

        starterGiven = len(starter) > 0
        if starterGiven:
            print('starter given')
            self.starter = self.encodeStr(starter)
            # The model input width must match the seed length.
            self.seqLen = len(self.starter)
        else:
            self.seqLen = seqLength

        trainSet = self.preprocessing()
        self.trainX = trainSet[0]
        self.trainY = trainSet[1]
        if not starterGiven:
            print('default starter')
            self.starter = trainSet[2]

        self.model = Sequential()
        self.modeling(units=units)

    def mapping(self):
        '''
        Build the vocabulary from the distinct characters of `self.text`.

        Stores the sorted character list in `self.chars` and returns
        [id -> character dict, character -> id dict].
        '''
        print('mapping')

        characters = sorted(set(self.text))
        self.chars = characters
        nToChar = {n: char for n, char in enumerate(characters)}
        charToN = {char: n for n, char in enumerate(characters)}
        return [nToChar, charToN]

    def preprocessing(self):
        '''
        Turn `self.text` into training arrays.

        Returns [x, y, defaultStarter] where x has shape
        (windows, seqLen, 1) with ids scaled by the vocabulary size, y is
        the one-hot encoded next character with one column per vocabulary
        entry, and defaultStarter is one encoded window (a fresh list).

        Raises ValueError when the text yields no training window.
        '''
        print('preprocessing')

        length = len(self.text)
        if self.seqLen < 1 or length <= self.seqLen:
            raise ValueError(
                'text must be longer than the sequence length '
                '(got %d characters for a sequence length of %d)'
                % (length, self.seqLen))

        x = []
        y = []
        for i in range(length - self.seqLen):
            sequence = self.text[i:i + self.seqLen]
            label = self.text[i + self.seqLen]
            x.append([self.keyToN[char] for char in sequence])
            y.append(self.keyToN[label])

        vocabSize = len(self.keyToN)
        xMod = np.reshape(x, (len(x), self.seqLen, 1))
        xMod = xMod / float(vocabSize)
        # Pin the class count to the vocabulary: inferring it from max(y)
        # under-sizes the output when the highest id never occurs as a label.
        yMod = np_utils.to_categorical(y, num_classes=vocabSize)

        # Same window as before when it exists; otherwise fall back to the
        # last one instead of raising IndexError on short texts.
        starterIdx = min(self.seqLen - 1, len(x) - 1)
        # Copy so later users of the starter cannot alter the window list.
        return [xMod, yMod, list(x[starterIdx])]

    def modeling(self, units):
        '''
        Add three LSTM layers (each followed by 20% dropout) and a softmax
        output layer to `self.model`, then compile it.
        '''
        print('modeling')

        self.model.add(LSTM(units, input_shape=(self.trainX.shape[1], self.trainX.shape[2]), return_sequences=True))
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(units, return_sequences=True))
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(units))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(self.trainY.shape[1], activation='softmax'))

        self.model.compile(loss='categorical_crossentropy', optimizer='adam')

    def fit(self):
        '''Train the model on the prepared windows.'''
        print('fitting')
        self.model.fit(self.trainX, self.trainY, epochs=self.epochs, batch_size=self.batchSize)

    def generateText(self, length=400):
        '''
        Generate `length` characters after the starter sequence.

        At each step the current window is fed to the model, the most
        probable next character is appended to the output, and the window
        slides forward by one character.

        Prints the encoded starter, the character lists and the final text,
        and returns the final text (starter included) as a string.
        '''
        # Work on a copy: appending to self.starter itself would lengthen the
        # stored seed and break the input shape on the next call.
        mappedStr = list(self.starter)
        print('mapped', mappedStr)
        fullStr = [self.nToKey[val] for val in mappedStr]
        print(fullStr)

        # Same scaling as the training windows in `preprocessing`.
        vocabSize = float(len(self.keyToN))
        for _ in range(length):
            x = np.reshape(mappedStr, (1, len(mappedStr), 1))
            x = x / vocabSize

            nextPred = int(np.argmax(self.model.predict(x, verbose=0)))
            fullStr.append(self.nToKey[nextPred])
            # Slide the window: drop the oldest id, add the prediction.
            mappedStr = mappedStr[1:] + [nextPred]

        print(fullStr)
        final = "".join(fullStr)
        print(final)
        return final

    def encodeStr(self, string):
        '''
        Map each character of `string` to its id.

        Raises KeyError for a character that is not in the vocabulary.
        '''
        print('encoding')
        return [self.keyToN[char] for char in string]

In [3]:
# Read the whole corpus and lower-case it. The "utf-8-sig" codec drops a
# leading byte-order mark if the file has one. The `with` block closes the
# file handle as soon as the text has been read.
with open("data/emPosts.txt", encoding="utf-8-sig") as corpusFile:
    text = corpusFile.read().lower()

<h1>Model 07</h1>

- **LSTM Units:** 512
- **Epochs:** 50
- **Batch Size:** 150
- **Sequence Length:** 100

In [4]:
# Hyperparameters for this run.
sequenceLength = 100  # characters per input window
batchSize = 150       # batch size used when fitting
epochs = 50           # number of training epochs
units = 512           # units in each LSTM layer

Using an _eighth_ of the data

In [5]:
def _reportSize(corpus):
    """Print the character count and whitespace-separated word count of `corpus`."""
    print(len(corpus), 'characters')
    print(len(corpus.split()), 'words')


# Size of the corpus before trimming.
_reportSize(text)

# Keep only the first eighth of the text for faster testing.
print('\ncutting data\n')
cut = len(text) // 8
text = text[:cut]

# Size of the corpus after trimming.
_reportSize(text)

486646 characters
86731 words

cutting data

60830 characters
10911 words


In [6]:
# Build the generator for this run from the trimmed text and the
# hyperparameters chosen above; no starter is passed, so the default
# starter taken from the training windows is used.
test07 = WriteForMe(
    text,
    epochs=epochs,
    batchSize=batchSize,
    seqLength=sequenceLength,
    units=units,
)

mapping
preprocessing
default starter
modeling


In [7]:
# Train the model with the epochs and batch size given at construction.
test07.fit()

fitting
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [8]:
# Generate text with the default length (400 characters), seeded by the
# instance's starter sequence.
test07.generateText()

mapped [34, 34, 20, 19, 1, 17, 40, 1, 35, 23, 20, 28, 1, 38, 24, 35, 23, 9, 1, 35, 23, 20, 24, 33, 1, 38, 16, 33, 1, 38, 24, 35, 23, 1, 19, 24, 16, 28, 30, 29, 19, 1, 23, 16, 19, 1, 18, 20, 16, 34, 20, 19, 7, 1, 40, 20, 35, 1, 35, 36, 33, 28, 30, 24, 27, 1, 33, 20, 28, 16, 24, 29, 20, 19, 1, 38, 24, 35, 23, 24, 29, 1, 18, 27, 30, 37, 20, 33, 1, 38, 24, 35, 23, 1, 35, 23, 20, 1, 19, 20]
['s', 's', 'e', 'd', ' ', 'b', 'y', ' ', 't', 'h', 'e', 'm', ' ', 'w', 'i', 't', 'h', '.', ' ', 't', 'h', 'e', 'i', 'r', ' ', 'w', 'a', 'r', ' ', 'w', 'i', 't', 'h', ' ', 'd', 'i', 'a', 'm', 'o', 'n', 'd', ' ', 'h', 'a', 'd', ' ', 'c', 'e', 'a', 's', 'e', 'd', ',', ' ', 'y', 'e', 't', ' ', 't', 'u', 'r', 'm', 'o', 'i', 'l', ' ', 'r', 'e', 'm', 'a', 'i', 'n', 'e', 'd', ' ', 'w', 'i', 't', 'h', 'i', 'n', ' ', 'c', 'l', 'o', 'v', 'e', 'r', ' ', 'w', 'i', 't', 'h', ' ', 't', 'h', 'e', ' ', 'd', 'e']
['s', 's', 'e', 'd', ' ', 'b', 'y', ' ', 't', 'h', 'e', 'm', ' ', 'w', 'i', 't', 'h', '.', ' ', 't', 'h', 'e',