<h1>Custom Model</h1>

First test of the tutorial: https://www.analyticsvidhya.com/blog/2018/03/text-generation-using-python-nlp/

In [36]:
# Dependencies

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

<h3>Data</h3>

In [37]:
text = open("data/emPosts.txt", encoding="utf-8-sig").read()
text = text.lower()
print(len(text), 'characters')

# Portioning text for faster testsing
cut = int(len(text) / 40)
print(cut, 'characters')
text = text[:cut]

486646 characters
12166 characters


<h3>Mapping</h3>

In [38]:
# Mapping Characters

characters = sorted(list(set(text)))
nToChar = { n:char for n, char in enumerate(characters) }
charToN = { char:n for n, char in enumerate(characters) }

charToN

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 "'": 4,
 ',': 5,
 '-': 6,
 '.': 7,
 '0': 8,
 '2': 9,
 ':': 10,
 '?': 11,
 'a': 12,
 'b': 13,
 'c': 14,
 'd': 15,
 'e': 16,
 'f': 17,
 'g': 18,
 'h': 19,
 'i': 20,
 'j': 21,
 'k': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'q': 28,
 'r': 29,
 's': 30,
 't': 31,
 'u': 32,
 'v': 33,
 'w': 34,
 'x': 35,
 'y': 36,
 'z': 37,
 '~': 38,
 'â': 39,
 '’': 40,
 '“': 41,
 '”': 42,
 '€': 43}

In [41]:
# Mapping words (???)
words = text.split()
words = sorted(list(set(words)))
rawWords = []
for word in words:
    lettersOnly = ''.join(filter(str.isalpha, word))
    rawWords.append(lettersOnly)

rawWords = sorted(list(set(rawWords)))

nToWord = { n:word for n, word in enumerate(rawWords) }
wordToN = { word:n for n, word in enumerate(rawWords) }

wordToN

{'': 0,
 'a': 1,
 'about': 2,
 'above': 3,
 'accepting': 4,
 'accompanying': 5,
 'accusingly': 6,
 'achieving': 7,
 'acknowledgement': 8,
 'acquaintances': 9,
 'acted': 10,
 'adding': 11,
 'adept': 12,
 'adhered': 13,
 'admit': 14,
 'after': 15,
 'afternoon': 16,
 'against': 17,
 'age': 18,
 'agenda': 19,
 'aimed': 20,
 'air': 21,
 'all': 22,
 'almost': 23,
 'along': 24,
 'alove': 25,
 'already': 26,
 'also': 27,
 'altar': 28,
 'altars': 29,
 'although': 30,
 'always': 31,
 'amongst': 32,
 'an': 33,
 'and': 34,
 'animated': 35,
 'anomalies': 36,
 'anomaly': 37,
 'another': 38,
 'answer': 39,
 'answered': 40,
 'anymore': 41,
 'anyone': 42,
 'anything': 43,
 'anyways': 44,
 'apart': 45,
 'appearance': 46,
 'archives': 47,
 'are': 48,
 'arm': 49,
 'arms': 50,
 'aroma': 51,
 'arose': 52,
 'around': 53,
 'arrived': 54,
 'as': 55,
 'asariel': 56,
 'assumptions': 57,
 'at': 58,
 'atop': 59,
 'attachments': 60,
 'attack': 61,
 'attempt': 62,
 'attention': 63,
 'audible': 64,
 'aura': 65,
 'awa

<h3>Pre-processing</h3>

In [5]:
x = []
y = []
length = len(text)
seq_length = 100
for i in range(0, length-seq_length, 1):
    sequence = text[i:i + seq_length]
    label =text[i + seq_length]
    x.append([charToN[char] for char in sequence])
    y.append(charToN[label])

In [6]:
# pd.DataFrame(x)

In [7]:
xMod = np.reshape(x, (len(x), seq_length, 1))
xMod = xMod / float(len(characters))
yMod = np_utils.to_categorical(y)

<h3>Modeling</h3>

In [21]:
model = Sequential()
model.add(LSTM(700, input_shape=(xMod.shape[1], xMod.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(700, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(700))
model.add(Dropout(0.2))
model.add(Dense(yMod.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [22]:
model.fit(xMod, yMod, epochs=1, batch_size=50)

Epoch 1/1
  200/12066 [..............................] - ETA: 13:07 - loss: 3.6623

KeyboardInterrupt: 

In [None]:
model.save_weights('text_generator_gigantic.h5')
model.load_weights('text_generator_gigantic.h5')

<h3>Text Generation</h3>

In [20]:
# This is here because we need somewhere to start before the model can start predicting the next words/characters
# So then wouldn't it make more sense if I tried to map the start by word instead of characters...
string_mapped = x[98]
full_string = [nToChar[value] for value in string_mapped]

xPred = np.reshape(string_mapped,(1,len(string_mapped), 1))
xPred = xPred / float(len(characters))

predict = model.predict(xPred)
print('original prediction', predict)
predict = np.argmax(predict)
print('np.argmax', predict)
seq = [nToChar[value] for value in string_mapped]
print('string_mapped', string_mapped)
print('character for each value in string_mapped', seq)
fullString = nToChar[predict]
print('character for prediction', fullString)

string_mapped = string_mapped[1:len(string_mapped)]
print('string mapped', string_mapped)

original prediction [[5.0222483e-03 1.9426273e-01 1.9882315e-04 2.1288788e-03 4.5147962e-03
  7.6511833e-03 7.8870071e-05 8.4973127e-03 9.7125150e-05 1.5485795e-04
  1.6255291e-04 1.2941032e-03 7.4337035e-02 9.0993810e-03 1.6253086e-02
  3.6185525e-02 1.1295947e-01 2.2133233e-02 2.3589419e-02 4.1377962e-02
  5.2495722e-02 5.5499724e-04 4.7752475e-03 2.7399700e-02 2.0823453e-02
  6.5756783e-02 4.5512524e-02 8.4797097e-03 1.2220471e-03 2.9433718e-02
  3.7698381e-02 8.3695613e-02 2.5924839e-02 4.6668779e-03 1.8472821e-02
  9.2570100e-04 9.2636365e-03 5.3003355e-04 1.6590480e-04 2.0995684e-04
  8.1789412e-04 2.3854841e-04 6.8820571e-04 2.4909154e-04]]
np.argmax 1
string_mapped [12, 30, 30, 16, 15, 1, 13, 36, 1, 31, 19, 16, 24, 1, 34, 20, 31, 19, 7, 1, 31, 19, 16, 20, 29, 1, 34, 12, 29, 1, 34, 20, 31, 19, 1, 15, 20, 12, 24, 26, 25, 15, 1, 19, 12, 15, 1, 14, 16, 12, 30, 16, 15, 5, 1, 36, 16, 31, 1, 31, 32, 29, 24, 26, 20, 23, 1, 29, 16, 24, 12, 20, 25, 16, 15, 1, 34, 20, 31, 19, 20, 25, 1, 1

In [17]:
string_mapped = x[99]
full_string = [nToChar[value] for value in string_mapped]

# generating characters
for i in range(400):
    xPred = np.reshape(string_mapped,(1,len(string_mapped), 1))
    xPred = xPred / float(len(characters))

    pred_index = np.argmax(model.predict(xPred, verbose=0))
    print(pred_index)
    seq = [nToChar[value] for value in string_mapped]
    full_string.append(nToChar[pred_index])

    string_mapped.append(pred_index)
    string_mapped = string_mapped[1:len(string_mapped)]

ValueError: Error when checking input: expected lstm_1_input to have shape (100, 1) but got array with shape (101, 1)

In [None]:
#combining text
txt=""
for char in full_string:
    txt = txt+char
txt