# AI Essay Writer

This notebook closely follows the notebook developed by **Jeremy Chow** here:

https://github.com/jeremyrchow/text-generation-kaggle

## Imports
Here are the libraries we need for deep learning.

In [None]:
# Standard Data Science Libraries
import pickle
import math
import pandas as pd
import numpy as np
from numpy import array

# Neural Net Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Neural Net Layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

# Neural Net Training
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

from pickle import load

# The function below is adapted from
# https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences

import re

# Regex fragments used by split_into_sentences. They are raw strings so that
# escapes such as \s reach the regex engine instead of being parsed (and
# warned about) as invalid string escape sequences.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr|et al)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """Split a block of text into a list of sentences.

    Periods that do not end a sentence (titles, initials, acronyms, company
    suffixes, web domains) are temporarily replaced by a ``<prd>`` marker,
    real sentence ends are tagged with ``<stop>``, and the text is finally
    split on ``<stop>``.

    Args:
        text: The raw text. Newlines are treated as spaces.

    Returns:
        A list of whitespace-stripped sentences. A trailing fragment without
        closing punctuation is returned as the last sentence; empty input
        yields an empty list.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that belong to abbreviations rather than sentence ends.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    # An acronym or suffix followed by a sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # Move terminators outside closing quotes so the quote stays attached to
    # the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Tag every remaining terminator as a sentence boundary, then restore the
    # protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # Only discard the final fragment when it is empty padding; an
    # unterminated last sentence is kept instead of being silently dropped.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

# Load in the text file

In [None]:
# Load text

filename = 'Poe.txt'
# The context manager closes the file even if reading it raises.
with open(filename, 'rt') as file:
    text = file.read()

# ... and split it into sentences.
sentences = split_into_sentences(text)

# Let's look at the first sentence,
print("First sentence:\n"+sentences[0]+"\n")

# ... and the last sentence,
print("Last sentence:\n"+sentences[-1])

### Tokenize words in corpus using Keras Tokenizer.
This function does the following:
1. Removes punctuation
2. Sets all text to lower case
3. Splits the words up, then assigns a unique integer to each word
4. Replaces all instances of that word with the integer.

Tokenization is necessary for preparing data for the embedding layer (see the model architecture section below).

In [None]:
# Build the word <-> integer vocabulary from the corpus sentences.
max_words = 50000 # Max size of the dictionary
tokenizer = Tokenizer(num_words=max_words)
# Learn the vocabulary from every sentence ...
tokenizer.fit_on_texts(sentences)
# ... then encode each sentence as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Show the first sentence as raw text,
print("Here's the first sentence:\n", sentences[0], sep="\n")

# ... and the same sentence after tokenization, as a list of numbers,
print("\nIt becomes a list of numbers:\n", sequences[0], sep="\n")

# And here's the dictionary that links each number to a word,
tokenizer.word_index

In [None]:
# Concatenate the per-sentence token lists into one long, one-dimensional
# token stream, so the sliding window technique can be applied across the
# whole corpus to predict the next word.

text = []
for sentence_tokens in sequences:
    text.extend(sentence_tokens)

vocab_size = len(tokenizer.word_index)
print('Vocabulary size in this corpus: ', vocab_size)

In [None]:
# Training on 19 words to predict the 20th
sentence_len = 20
pred_len     = 1
train_len    = sentence_len - pred_len

# Sliding window to generate train data. The last full window starts at
# index len(text) - sentence_len, so the range must include that index
# (hence the + 1); otherwise the final window of the corpus is skipped.
seq = [text[start:start + sentence_len]
       for start in range(len(text) - sentence_len + 1)]

# Reverse dictionary to decode tokenized sequences back to words
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

In [None]:
# Each row in seq is a 20 word long window: the first 19 words form the model
# input and the 20th (last) word is the target to predict.
trainX = [window[:train_len] for window in seq]
trainy = [window[-1] for window in seq]

# Build the AI

In [None]:
# define model, one layer at a time:
# word embedding -> two stacked LSTMs -> dense layer -> softmax over the vocabulary
model = Sequential()
# vocab_size+1 embedding rows, each a 50-dimensional vector, for inputs of train_len tokens
model.add(Embedding(vocab_size+1, 50, input_length=train_len))
# The first LSTM returns the full sequence so the second LSTM can consume it
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(150, activation='relu'))
# One output probability per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))

# And teach it

In [None]:
# Early stopping allows model to stop training if improvement stops.
# No validation data is passed to fit(), so there is no 'val_loss' to watch;
# monitor the training loss instead, as the checkpoint below already does.
es = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=50)
# The sparse loss takes integer class ids directly. This avoids building a
# one-hot matrix whose column count depended on which word ids happened to
# occur as targets (and could therefore differ from the model's vocab_size
# outputs).
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
filepath = "./model_weights.hdf5"
# Model checkpointing allows us to preserve progress during training if training is interrupted
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# Register early stopping too; previously it was created but never used.
callbacks_list = [checkpoint, es]
# Shift the word ids down by one so that output class k stands for word id
# k + 1; gen() adds the 1 back when decoding predictions.
targets = np.asarray(trainy) - 1
history = model.fit(np.asarray(trainX),
                    targets,
                    epochs=300,
                    batch_size=128,
                    callbacks=callbacks_list,
                    verbose=1)

In [None]:
# Save the model once training finishes, because training takes a long time.
# NOTE(review): this is the same file the ModelCheckpoint callback writes its
# best checkpoint to ("./model_weights.hdf5"), so this call replaces that
# checkpoint with the model as it stands now -- confirm this is intended.
model.save('model_weights.hdf5')

In [None]:
# Restore previously trained weights from disk into the model defined above,
# so a trained model can be reused without running the training cell again.
model.load_weights('model_weights.hdf5')

# Testing the generation models

In [None]:
def gen(model,seq,max_len = 20, input_len = 19):
    '''Generate text by repeatedly predicting the word that follows `seq`.

    Args:
        model: Trained model whose `predict` returns one score per output class.
        seq: Seed string; it is tokenized with the module-level `tokenizer`.
        max_len: Number of words to generate and append to the seed.
        input_len: Number of tokens the model takes as input. The default of
            19 matches the window the notebook trains on.

    Returns:
        The seed words the tokenizer recognised followed by the generated
        words, joined by single spaces.
    '''
    # Tokenize the input string
    tokens = tokenizer.texts_to_sequences([seq])[0]
    target_len = len(tokens) + max_len
    while len(tokens) < target_len:
        # Feed only the most recent input_len tokens. If there are fewer,
        # `pad_sequences` adds zeroes to the left side until the sequence is
        # input_len long, so the array shape is correct going into the LSTM.
        window = pad_sequences([tokens[-input_len:]], maxlen=input_len)
        op = model.predict(window)
        # Output class k stands for word id k + 1.
        tokens.append(int(op.argmax()) + 1)

    return " ".join(reverse_word_map[token] for token in tokens)

model_list = [model]
def test_models(test_string,sequence_length= 50, model_list = model_list):
    '''Print the text every model in model_list generates from test_string.

    sequence_length is handed to gen() as the amount of text to generate.'''
    print('Input String: ', test_string)
    for number, candidate in enumerate(model_list, start=1):
        print("Model ", number, ":")
        generated = gen(candidate, test_string, sequence_length)
        print(generated)

In [None]:
# Try the generator: seed every model in model_list with this phrase and
# pass 100 as the sequence length.
test_models('and then there was not', 100)