# AI Essay Writer

This notebook follows closely the notebook developed by **Jeremy Chow** here:

https://github.com/jeremyrchow/text-generation-kaggle

## Imports
Here the libraries we need for deep learning.

In [1]:
# Standard Data Science Libraries
import pickle
import math
import pandas as pd
import numpy as np
from numpy import array

# Neural Net Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Neural Net Layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

# Neural Net Training
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

from pickle import load

# The function below is from
# https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences

# -*- coding: utf-8 -*-
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr|et al)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using Theano backend.


# Load in the text file

In [2]:
# Load text

filename = 'Cat_and_Mark_papers.txt'
file = open(filename, 'rt')
text = file.read()
file.close()

# ... and split it into sentences.
sentences = split_into_sentences(text)

# Let's look at the first sentence,
print(sentences[0])

# ... and the last sentence,
print(sentences[-1])

Childhood epilepsy with centrotemporal spikes (CECTS, previously benign epilepsy with centrotemporal spikes, BECTS) is the most common idiopathic focal epilepsy, accounting for approximately 10% of all childhood onset epilepsies (Astradsson et al., 1998; Berg and Rychlik, 2015; Callenbach et al., 2010; Camfield and Camfield, 2014; Larsson and Eeg-Olofsson, 2006).
This preliminary result suggests a disruption in the thalamocortical circuit in BECTS.


# Preprocessing
Generally for NLP projects, to optimize the model's ability to gather meaning from the text, there would be removal of:
- stop words such as _"the","a","an"_ 
- punctuation

then tokenization (turning unique words into unique integers) of the text. However, because the goal here is to generate fluid and human-like speech, we want to preserve stop words. Instead we just use the Tokenizer method in the Keras library to perform the rest of the preprocessing steps.

### Tokenize words in corpus using Keras Tokenizer.
This function does the following:
1. Removes punctation
2. Sets all text to lower case
3. Splits the words up, then assigns a unique integer to each word
4. Replaces all instances of that word with the integer.

Tokenization is necessary for preparing data for embedding layer (see model architecture section below)

In [3]:
max_words = 50000 # Max size of the dictionary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [4]:
# Here's the first sentence,
print("Here's the first sentence:")
print(sentences[0])

# It becomes a list of numbers,
print("It becomes a list of numbers:")
print(sequences[0])

# And here's the dictionary that linkss each number to a word,
tokenizer.word_index

Here's the first sentence:
Childhood epilepsy with centrotemporal spikes (CECTS, previously benign epilepsy with centrotemporal spikes, BECTS) is the most common idiopathic focal epilepsy, accounting for approximately 10% of all childhood onset epilepsies (Astradsson et al., 1998; Berg and Rychlik, 2015; Callenbach et al., 2010; Camfield and Camfield, 2014; Larsson and Eeg-Olofsson, 2006).
It becomes a list of numbers:
[178, 16, 9, 297, 22, 83, 560, 484, 16, 9, 297, 22, 17, 18, 1, 165, 158, 773, 34, 16, 1201, 15, 646, 132, 6, 133, 178, 202, 774, 961, 2, 3, 298, 427, 4, 1202, 90, 647, 2, 3, 69, 428, 4, 428, 41, 648, 4, 35, 962, 115]


{'the': 1,
 'et': 2,
 'al': 3,
 'and': 4,
 'in': 5,
 'of': 6,
 'to': 7,
 'a': 8,
 'with': 9,
 'we': 10,
 'that': 11,
 'spike': 12,
 'ripples': 13,
 'will': 14,
 'for': 15,
 'epilepsy': 16,
 'bects': 17,
 'is': 18,
 'this': 19,
 'are': 20,
 'ripple': 21,
 'spikes': 22,
 'these': 23,
 'have': 24,
 '2019': 25,
 'by': 26,
 'from': 27,
 'stimulation': 28,
 'children': 29,
 'on': 30,
 'sleep': 31,
 'as': 32,
 '2018': 33,
 'focal': 34,
 'eeg': 35,
 'data': 36,
 'spindles': 37,
 'or': 38,
 '2017': 39,
 'our': 40,
 '2014': 41,
 'spindle': 42,
 'automated': 43,
 'detector': 44,
 'white': 45,
 'matter': 46,
 'cognitive': 47,
 'be': 48,
 '2016': 49,
 'between': 50,
 'rate': 51,
 '1': 52,
 'an': 53,
 '2008': 54,
 '2011': 55,
 'thalamocortical': 56,
 '2': 57,
 'been': 58,
 '2012': 59,
 'not': 60,
 'patients': 61,
 'seizures': 62,
 'observed': 63,
 'may': 64,
 'activity': 65,
 'model': 66,
 '2013': 67,
 'models': 68,
 '2010': 69,
 'during': 70,
 'motor': 71,
 'cortical': 72,
 'clinical': 73,
 'abnorm

In [5]:
# Flatten the list of lists resulting from the tokenization. This will reduce the list
# to one dimension, allowing us to apply the sliding window technique to predict the next word
text = [item for sublist in sequences for item in sublist]
vocab_size = len(tokenizer.word_index)
print('Vocabulary size in this corpus: ', vocab_size)

Vocabulary size in this corpus:  2899


In [6]:
# Training on 19 words to predict the 20th
sentence_len = 20
pred_len     = 1
train_len    = sentence_len - pred_len
seq = []

# Sliding window to generate train data
for i in range(len(text)-sentence_len):
    seq.append(text[i:i+sentence_len])
# Reverse dictionary to decode tokenized sequences back to words
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

In [7]:
# Each row in seq is a 20 word long window. We append he first 19 words as the input to predict the 20th word
trainX = []
trainy = []
for i in seq:
    trainX.append(i[:train_len])
    trainy.append(i[-1])

# Model Architecture:
1. Embedding layer
    - Helps model understand 'meaning' of words by mapping them to representative vector space instead of semantic integers
2. Stacked LSTM layers
    - Stacked LSTMs add more depth than additional cells in a single LSTM layer (see paper: https://arxiv.org/abs/1303.5778)
    - The first LSTM layer must have `return sequences` flag set to True in order to pass sequence information to the second LSTM layer instead of just its end states
3. Dense (regression) layer with ReLU activation
4. Dense layer with Softmax activation 
    - Outputs word probability across entire vocab

# Build the AI

In [8]:
# define model
model = Sequential([
    Embedding(vocab_size+1, 50, input_length=train_len),
    LSTM(150, return_sequences=True),
    LSTM(150),
    Dense(150, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

Instructions for updating:
Colocations handled automatically by placer.


# And teach it

In [9]:
# Early stopping allows model to stop training if improvement stops.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
model.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
filepath = "./model_weights.hdf5"
# Model checkpointing allows us to preserve progress during training if training is interrupted
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
history = model.fit(np.asarray(trainX),
         pd.get_dummies(np.asarray(trainy)),
         epochs = 300,
         batch_size = 128,
         callbacks = callbacks_list,
         verbose = 1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/300
Epoch 00001: loss improved from inf to 6.76415, saving model to ./model_weights.hdf5
Epoch 2/300
Epoch 00002: loss improved from 6.76415 to 6.31425, saving model to ./model_weights.hdf5
Epoch 3/300
Epoch 00003: loss improved from 6.31425 to 6.03820, saving model to ./model_weights.hdf5
Epoch 4/300
Epoch 00004: loss improved from 6.03820 to 5.77745, saving model to ./model_weights.hdf5
Epoch 5/300
Epoch 00005: loss improved from 5.77745 to 5.58747, saving model to ./model_weights.hdf5
Epoch 6/300
Epoch 00006: loss improved from 5.58747 to 5.45468, saving model to ./model_weights.hdf5
Epoch 7/300
Epoch 00007: loss improved from 5.45468 to 5.35131, saving model to ./model_weights.hdf5
Epoch 8/300
Epoch 00008: loss improved from 5.35131 to 5.25666, saving model to ./model_weights.hdf5
Epoch 9/300
Epoch 00009: loss improved from 5.25666 to 5.17112, saving model to ./model_weights.hdf5
Epoch 10/300
Epoch 00010: loss improved from 5.

Model 3 was trained for 300 epochs and reached .63 accuracy

In [9]:
# Loaded after being trained
model.load_weights('model_weights.hdf5')

# Testing the generation models
If this were any other type of project then a good metric to quantify the model's success would be to do a **train-test split to identify the testing accuracy score** using the models to predict data it was not trained on and had never seen before. 

However, the goal of text generation isn't quite to maximize accuracy, because that would amount to the model regurgitating quotes and would be overfitting. Instead we'll compare the model outputs to the same input strings.

In [10]:
def gen(model,seq,max_len = 20):
    ''' Generates a sequence given a string seq using specified model until the total sequence length
    reaches max_len'''
    # Tokenize the input string
    tokenized_sent = tokenizer.texts_to_sequences([seq])
    max_len = max_len+len(tokenized_sent[0])
    # If sentence is not as long as the desired sentence length, we need to 'pad sequence' so that
    # the array input shape is correct going into our LSTM. the `pad_sequences` function adds 
    # zeroes to the left side of our sequence until it becomes 19 long, the number of input features.
    while len(tokenized_sent[0]) < max_len:
        padded_sentence = pad_sequences(tokenized_sent[-19:],maxlen=19)
        op = model.predict(np.asarray(padded_sentence).reshape(1,-1))
        tokenized_sent[0].append(op.argmax()+1)
        
    return " ".join(map(lambda x : reverse_word_map[x],tokenized_sent[0]))

In [11]:
model_list = [model]
def test_models(test_string,sequence_length= 50, model_list = model_list):
    '''Generates output given input test_string up to sequence_length'''
    print('Input String: ', test_string)
    for counter,model in enumerate(model_list):
        print("Model ", counter+1, ":")
        print(gen(model,test_string,sequence_length))
    pass

In [18]:
test_models('', 100)

Input String:  
Model  1 :
is a well characterized pathophysiological mechanism in seizures and proven treatment strategies to address them values 10 hz mice the whole brain and dopaminergic tone by the same datasets that relies on subjective in their here to neurostimulation and fractional stimuli can be 50 150 ms concurrent with electrical stimulation in children with bects this architecture features suggesting with a pediatric receptor suppressed m current in the axons functional tissue is a high positive validation from spindles this detection approach that analyze candidate parameters in humans epilepsy is a common childhood stimulation parameters identified we will determine whether stimulation impacts
