<h1>RNN for Generating Text</h1>

In [1]:
# Dependencies

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
class WriteForMe:
    
    def __init__(self, text, epochs=1, batchSize=50, seqLength=10, units=400, starter=""):
        '''
        text: string of traning data
        mapType: whether to map characters or words
        
        '''
        self.batchSize = batchSize
        self.epochs = epochs
        self.text = text
        
        mapping = self.mapping()
        self.nToKey = mapping[0]
        self.keyToN = mapping[1]
        
        if len(starter) > 0:
            print('starter given')
            starter = self.encodeStr(starter)
            self.seqLen = len(starter)
            self.starter = starter
        else:
            self.seqLen = seqLength
        
        trainSet = self.preprocessing()
        self.trainX = trainSet[0]
        self.trainY = trainSet[1]
        if len(starter) < 1:
            print('default starter')
            self.starter = trainSet[2]
        
        self.model = Sequential()
        self.modeling(units=units)

    def mapping(self):
        print('mapping')
        
        characters = sorted(list(set(self.text)))
        self.chars = characters
        nToChar = { n:char for n, char in enumerate(characters) }
        charToN = { char:n for n, char in enumerate(characters) }
        return [nToChar, charToN]

    def preprocessing(self):
        print('preprocessing')
        
        x = []
        y = []
        length = len(self.text)
        for i in range(0, length-self.seqLen, 1):
            sequence = self.text[i:i + self.seqLen]
            label = self.text[i + self.seqLen]
            x.append([self.keyToN[char] for char in sequence])
            y.append(self.keyToN[label])
            
        xMod = np.reshape(x, (len(x), self.seqLen, 1))
        xMod = xMod / float(len(self.keyToN))
        yMod = np_utils.to_categorical(y)
        return [xMod, yMod, x[self.seqLen-1]]
    
    def modeling(self, units):
        print('modeling')
        
        self.model.add(LSTM(units, input_shape=(self.trainX.shape[1], self.trainX.shape[2]), return_sequences=True))
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(units, return_sequences=True))
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(units))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(self.trainY.shape[1], activation='softmax'))

        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        
    def fit(self):
        print('fitting')
        self.model.fit(self.trainX, self.trainY, epochs=self.epochs, batch_size=self.batchSize)
        
    def generateText(self, length=400):
        mappedStr = self.starter
        print('mapped', mappedStr)
        fullStr = [self.nToKey[val] for val in mappedStr]
        print(fullStr)
        
        for i in range(length):
            x = np.reshape(mappedStr, (1, len(mappedStr), 1))
            x = x / float(len(self.chars))
            
            nextPred = np.argmax(self.model.predict(x, verbose=0))
            fullStr.append(self.nToKey[nextPred])
            mappedStr.append(nextPred)
            mappedStr = mappedStr[1:]
            
        print(fullStr)
        final = ""
        for char in fullStr:
            final += char
        print(final)
        
    def encodeStr(self, string):
        print('encoding')
        return [self.keyToN[char] for char in string]

<h3>Data</h3>

In [3]:
text = open("data/emPosts.txt", encoding="utf-8-sig").read()
text = text.lower()
print(len(text), 'characters')

# Portioning text for faster testsing
cut = int(len(text) / 40)
print(cut, 'characters')
text = text[:cut]

486646 characters
12166 characters


<h3>Models</h3>

In [4]:
test01 = WriteForMe(text)

mapping
preprocessing
default starter
modeling


In [5]:
test01.fit()

fitting
Epoch 1/1


In [6]:
test01.generateText()

mapped [1, 13, 30, 30, 1, 12, 18, 16, 1, 9]
[' ', 'b', 's', 's', ' ', 'a', 'g', 'e', ' ', '2']
[' ', 'b', 's', 's', ' ', 'a', 'g', 'e', ' ', '2', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',