In [11]:
# Dependencies

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

In [12]:
class WriteForMe:
    '''
    Character-level text generator backed by a stacked LSTM network.

    The constructor builds the character <-> integer mapping, slices the text
    into fixed-length training windows and compiles the model. Call fit() to
    train and generateText() to produce text from the starter sequence.
    '''

    def __init__(self, text, epochs=1, batchSize=50, seqLength=10, units=400, starter=""):
        '''
        text: string of training data
        epochs: number of epochs used by fit()
        batchSize: batch size used by fit()
        seqLength: number of characters per input window; ignored when a
                   starter is given (the starter's length is used instead)
        units: number of units in every LSTM layer
        starter: optional seed string for generateText(); every character
                 must occur in text, otherwise encodeStr raises KeyError.
                 When empty, a window taken from the training data is used.
        '''
        self.batchSize = batchSize
        self.epochs = epochs
        self.text = text

        mapping = self.mapping()
        self.nToKey = mapping[0]
        self.keyToN = mapping[1]

        # Remember the choice up front instead of re-testing a re-bound variable.
        hasStarter = len(starter) > 0
        if hasStarter:
            print('starter given')
            self.starter = self.encodeStr(starter)
            # The network input length has to match the seed length.
            self.seqLen = len(self.starter)
        else:
            self.seqLen = seqLength

        trainSet = self.preprocessing()
        self.trainX = trainSet[0]
        self.trainY = trainSet[1]
        if not hasStarter:
            print('default starter')
            self.starter = trainSet[2]

        self.model = Sequential()
        self.modeling(units=units)

    def mapping(self):
        '''
        Build the vocabulary from the distinct characters of self.text.

        Stores the sorted character list in self.chars and returns
        [index -> character dict, character -> index dict].
        '''
        print('mapping')

        characters = sorted(set(self.text))
        self.chars = characters
        nToChar = dict(enumerate(characters))
        charToN = {char: n for n, char in enumerate(characters)}
        return [nToChar, charToN]

    def preprocessing(self):
        '''
        Slice self.text into overlapping windows of self.seqLen characters,
        each labelled with the character that follows it.

        Returns [inputs, labels, default starter] where inputs has shape
        (samples, seqLen, 1) scaled by the vocabulary size, labels are one-hot
        rows as wide as the vocabulary, and the default starter is one encoded
        training window (a fresh list).

        Raises ValueError when the text is not longer than the window.
        '''
        print('preprocessing')

        x = []
        y = []
        length = len(self.text)
        for i in range(length - self.seqLen):
            sequence = self.text[i:i + self.seqLen]
            label = self.text[i + self.seqLen]
            x.append([self.keyToN[char] for char in sequence])
            y.append(self.keyToN[label])

        if not x:
            raise ValueError(
                'text has %d characters, which is too short for a sequence '
                'length of %d' % (length, self.seqLen)
            )

        vocabSize = len(self.keyToN)
        xMod = np.reshape(x, (len(x), self.seqLen, 1))
        xMod = xMod / float(vocabSize)
        # Fix the one-hot width to the vocabulary size; inferring it from the
        # labels would shrink the output layer whenever the highest-index
        # character never appears as a label.
        yMod = np_utils.to_categorical(y, num_classes=vocabSize)

        # Same window as before when enough windows exist; clamp otherwise so
        # a short corpus does not fail with an IndexError. Copy so the starter
        # does not alias a row of the training list.
        starterIndex = min(self.seqLen - 1, len(x) - 1)
        return [xMod, yMod, list(x[starterIndex])]

    def modeling(self, units):
        '''
        Assemble and compile the network on self.model: four LSTM layers of
        `units` units, each followed by 20% dropout, and a softmax layer with
        one output per label class.
        '''
        print('modeling')

        inputShape = (self.trainX.shape[1], self.trainX.shape[2])
        self.model.add(LSTM(units, input_shape=inputShape, return_sequences=True))
        self.model.add(Dropout(0.2))

        # Two intermediate layers that keep passing full sequences along.
        for _ in range(2):
            self.model.add(LSTM(units, return_sequences=True))
            self.model.add(Dropout(0.2))

        # The last recurrent layer emits only its final output.
        self.model.add(LSTM(units))
        self.model.add(Dropout(0.2))

        self.model.add(Dense(self.trainY.shape[1], activation='softmax'))

        self.model.compile(loss='categorical_crossentropy', optimizer='adam')

    def fit(self):
        '''Train the model on the preprocessed windows using the stored epochs and batch size.'''
        print('fitting')
        self.model.fit(self.trainX, self.trainY, epochs=self.epochs, batch_size=self.batchSize)

    def generateText(self, length=400):
        '''
        Generate `length` characters after the starter by repeatedly feeding
        the current window to the model and taking the most likely next
        character.

        Prints the encoded starter, the character list before and after
        generation and the final string, and returns that string (starter
        followed by the generated characters). self.starter is left untouched,
        so the method can be called repeatedly.
        '''
        # Work on a copy: appending to self.starter itself would lengthen the
        # stored seed on every call and break the model's input shape.
        mappedStr = list(self.starter)
        print('mapped', mappedStr)
        fullStr = [self.nToKey[val] for val in mappedStr]
        print(fullStr)

        # Same scaling as in preprocessing().
        vocabSize = float(len(self.keyToN))
        for _ in range(length):
            x = np.reshape(mappedStr, (1, len(mappedStr), 1))
            x = x / vocabSize

            nextPred = int(np.argmax(self.model.predict(x, verbose=0)))
            fullStr.append(self.nToKey[nextPred])
            # Slide the window: drop the oldest index, add the prediction.
            mappedStr = mappedStr[1:] + [nextPred]

        print(fullStr)
        final = "".join(fullStr)
        print(final)
        return final

    def encodeStr(self, string):
        '''
        Translate a string into its list of vocabulary indices.

        Raises KeyError for a character that does not occur in the training text.
        '''
        print('encoding')
        return [self.keyToN[char] for char in string]

In [13]:
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the contents are in memory. "utf-8-sig" drops a leading
# byte-order mark if the file has one.
with open("data/emPosts.txt", encoding="utf-8-sig") as corpusFile:
    text = corpusFile.read()
# Lower-casing reduces the number of distinct characters in the corpus.
text = text.lower()

<h1>Model 05</h1>

- **LSTM Units:** 256
- **Epochs:** 1
- **Batch Size:** 100
- **Sequence Length:** 100
- +1 additional LSTM layer

In [14]:
# Hyperparameters for this run (handed to WriteForMe below)
sequenceLength = 100  # characters per input window
batchSize = 100       # batch size used while fitting
epochs = 1            # number of training epochs
units = 256           # units in each LSTM layer

Using only the first _quarter_ of the data to speed up testing

In [15]:
# Size of the full corpus
print(len(text), 'characters')
print(len(text.split()), 'words')

# Keep only the first quarter of the text for faster testing.
# NOTE: this rebinds `text`, so re-running the cell shrinks it again.
print('\ncutting data\n')
cut = len(text) // 4
text = text[:cut]

# Size of the trimmed corpus
print(len(text), 'characters')
print(len(text.split()), 'words')

486646 characters
86731 words

cutting data

121661 characters
21821 words


In [16]:
# Build the Model 05 generator. No starter is passed, so the constructor
# picks a default one from the training data; training happens in fit().
test05 = WriteForMe(
    text,
    epochs=epochs,
    batchSize=batchSize,
    seqLength=sequenceLength,
    units=units,
)

mapping
preprocessing
default starter
modeling


In [17]:
# Train the network for the configured number of epochs and batch size.
test05.fit()

fitting
Epoch 1/1


Longest _observed_ ETA: ~ **1:30:00** per epoch

In [18]:
# Generate text from the starter sequence (default length: 400 characters)
# and print the encoded starter, the character lists and the final string.
test05.generateText()

mapped [37, 37, 23, 22, 1, 20, 43, 1, 38, 26, 23, 31, 1, 41, 27, 38, 26, 9, 1, 38, 26, 23, 27, 36, 1, 41, 19, 36, 1, 41, 27, 38, 26, 1, 22, 27, 19, 31, 33, 32, 22, 1, 26, 19, 22, 1, 21, 23, 19, 37, 23, 22, 7, 1, 43, 23, 38, 1, 38, 39, 36, 31, 33, 27, 30, 1, 36, 23, 31, 19, 27, 32, 23, 22, 1, 41, 27, 38, 26, 27, 32, 1, 21, 30, 33, 40, 23, 36, 1, 41, 27, 38, 26, 1, 38, 26, 23, 1, 22, 23]
['s', 's', 'e', 'd', ' ', 'b', 'y', ' ', 't', 'h', 'e', 'm', ' ', 'w', 'i', 't', 'h', '.', ' ', 't', 'h', 'e', 'i', 'r', ' ', 'w', 'a', 'r', ' ', 'w', 'i', 't', 'h', ' ', 'd', 'i', 'a', 'm', 'o', 'n', 'd', ' ', 'h', 'a', 'd', ' ', 'c', 'e', 'a', 's', 'e', 'd', ',', ' ', 'y', 'e', 't', ' ', 't', 'u', 'r', 'm', 'o', 'i', 'l', ' ', 'r', 'e', 'm', 'a', 'i', 'n', 'e', 'd', ' ', 'w', 'i', 't', 'h', 'i', 'n', ' ', 'c', 'l', 'o', 'v', 'e', 'r', ' ', 'w', 'i', 't', 'h', ' ', 't', 'h', 'e', ' ', 'd', 'e']
['s', 's', 'e', 'd', ' ', 'b', 'y', ' ', 't', 'h', 'e', 'm', ' ', 'w', 'i', 't', 'h', '.', ' ', 't', 'h', 'e',