# Text generation

The goal of this project is to demonstrate text generation using LSTM neural networks.
Our dataset contains numerous movie plots taken from Wikipedia, so we will generate text that resembles a movie plot.

In [29]:
import tensorflow as tf
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.models import Sequential
import random 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


In [38]:
# Load the movie plots; only the 'Plot' column is used in this notebook.
data = pd.read_csv("movie_plots.csv")

movie_plots = data['Plot']

# random.sample() draws from Python's `random` module, which np.random.seed()
# does not affect, so both generators are seeded to make the sample repeatable.
np.random.seed(1)
random.seed(1)

# Character length of every plot, computed once and reused for both summaries.
plot_lengths = movie_plots.map(len)
print("Max movie plot len: ", plot_lengths.max())
print("Min movie plot len: ", plot_lengths.min())

all_plots = list(movie_plots.values)
# Work on a small random subset of plots; never ask for more plots than exist,
# because random.sample() raises ValueError in that case.
sample = random.sample(all_plots, min(50, len(all_plots)))


Max movie plot len:  36773
Min movie plot len:  15


## Tokenize words

In Natural Language Processing projects, the first step is usually the removal of stop words, such as "the", "a", and "an", and of punctuation. We will not remove stop words, since we want the generated text to read like natural human language. (Note that Keras' `Tokenizer` still lowercases the text and strips punctuation by default.)
Tokenization maps each unique word to a unique integer. This step is necessary to prepare the data for the embedding layer.

In [39]:
# Upper bound on the number of distinct words the tokenizer keeps.
max_words = 50000
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(sample)

# Turn every plot into a list of token ids, then force each one to exactly
# 80 ids: longer plots lose their tail (truncating = 'post'), shorter plots
# are padded with 0 using pad_sequences' default padding side.
sequences = tokenizer.texts_to_sequences(sample)
sequences = pad_sequences(sequences, maxlen = 80, truncating = 'post')

# Only the last expression of a cell is displayed, so the shape is printed
# explicitly instead of being evaluated and thrown away.
print(sequences.shape)
print(sequences[:2])

[[ 673    3  432 1855 1856 1857    3 1858 1859   31    4  142    3  129
  1165 1860  433   25    1 1861    5   27  851   29 1862    4   42  238
   432  163  852 1863 1864 1166 1865 1167 1166 1866   19  853   10  432
   674   74   38 1168  675  432   90    2  208    1 1867    4  524   23
   673   20   30   11  164  108  432 1169    4  434   16 1868   42  238
    23   18    1  854 1869   38   14   27  165  268]
 [ 189   31  437 1173    3   35  122   53   31  317  165  130   20    1
   122  239    2 1174  270   35    5   34   17    4  438  268   19   83
   109    9   39  680   96    9 1886  271   46    4  240   12    9  142
     3  212   18    1  122  527    2    9  142    3  528   16   80   11
  1887    9    2    4  272   19   83   80    9  360 1888    1  272    7
   110   35   20  143   68    6 1175   11  529   13]]


In [40]:
# Concatenate all fixed-length plots into one long stream of token ids;
# the sliding windows are later taken over this flat list.
text = []
for plot_tokens in sequences:
    text.extend(plot_tokens)

vocab_size = len(tokenizer.word_index)
print(len(text))

4000


In [45]:
# +1 accounts for id 0, which the tokenizer never assigns to a word.
print("Vocabulary size: ", vocab_size+1)

# Invert the word -> id mapping into id -> word so that tokenized sequences
# can be decoded back into text.
reverse_word_map = {token_id: word for word, token_id in tokenizer.word_index.items()}


Vocabulary size:  4316


### Splitting the data for input and output values

In [42]:
seq_len = 20

# Each run of `seq_len` consecutive tokens is one input sample; the token
# that immediately follows the run is its target.
window_starts = range(len(text) - seq_len)
dataX = [text[start:start + seq_len] for start in window_starts]
dataY = [text[start + seq_len] for start in window_starts]

size = len(dataX)
print(size)

3980


In [43]:
# Convert the Python lists of windows and targets into numpy arrays and
# show both shapes as the cell result.
dataX, dataY = np.asarray(dataX), np.asarray(dataY)
(dataX.shape, dataY.shape)

((3980, 20), (3980,))

In [44]:
# The model's first layer is an Embedding, which consumes integer token ids
# directly, so the inputs are used as they are.
trainX = dataX

# One-hot encode the targets with `to_categorical` (imported at the top of the
# notebook; the legacy `keras.utils.np_utils` module is not available in
# current Keras releases). num_classes is pinned to the vocabulary size
# (+1 for the unused id 0); without it the width is inferred from the largest
# id that happens to occur in dataY and can come out narrower than the vocabulary.
trainy = to_categorical(dataY, num_classes = vocab_size + 1)
trainy.shape

(3980, 4316)

### Creating a model

In [48]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.models import Sequential

# Next-word classifier: token ids -> 10-dimensional embeddings -> LSTM with
# 256 units -> light dropout -> softmax with one output per one-hot column.
model = Sequential([
    Embedding(vocab_size+1, 10, input_length = trainX.shape[1]),
    LSTM(256),
    Dropout(0.1),
    Dense(trainy.shape[1], activation = 'softmax'),
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 10)            43160     
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               273408    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 4316)              1109212   
Total params: 1,425,780
Trainable params: 1,425,780
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Overwrite the checkpoint file only when the training loss reaches a new minimum.
filepath = "./weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True,
    verbose = 1,
)
callbacks = [checkpoint]

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])

# Train on the sliding-window samples; the returned history is kept in `hist`.
hist = model.fit(
    trainX,
    trainy,
    batch_size = 128,
    epochs = 20,
    callbacks = callbacks,
    verbose = 1,
)

Epoch 1/20

Epoch 00001: loss improved from inf to 7.58594, saving model to ./weights.hdf5
Epoch 2/20

Epoch 00002: loss improved from 7.58594 to 6.50208, saving model to ./weights.hdf5
Epoch 3/20

Epoch 00003: loss improved from 6.50208 to 6.30876, saving model to ./weights.hdf5
Epoch 4/20

Epoch 00004: loss improved from 6.30876 to 6.25448, saving model to ./weights.hdf5
Epoch 5/20

Epoch 00005: loss improved from 6.25448 to 6.22545, saving model to ./weights.hdf5
Epoch 6/20

Epoch 00006: loss improved from 6.22545 to 6.18402, saving model to ./weights.hdf5
Epoch 7/20

Epoch 00007: loss improved from 6.18402 to 6.13193, saving model to ./weights.hdf5
Epoch 8/20

Epoch 00008: loss improved from 6.13193 to 6.07638, saving model to ./weights.hdf5
Epoch 9/20

Epoch 00009: loss improved from 6.07638 to 6.01648, saving model to ./weights.hdf5
Epoch 10/20

Epoch 00010: loss improved from 6.01648 to 5.95496, saving model to ./weights.hdf5
Epoch 11/20

Epoch 00011: loss improved from 5.95496 

In [50]:
# Restore the parameters stored in the checkpoint file, then recompile the
# model with the same loss and optimizer (no extra metrics are tracked here).
filename = "weights.hdf5"
model.load_weights(filename)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')


In [51]:
def generate_words(seed_text, num_words, model, max_seq_len = 20):
    """Extend `seed_text` by `num_words` greedily predicted words.

    On every step the current text is tokenized with the notebook-level
    `tokenizer`, padded/cropped to `max_seq_len` ids, and fed to `model`;
    the id with the highest predicted score is decoded and appended.

    Parameters
    ----------
    seed_text : str
        Text the generation starts from.
    num_words : int
        Number of words to append; values <= 0 append nothing.
    model : object
        Anything exposing `predict(batch, verbose=0)` that returns one row of
        scores per input sequence.
    max_seq_len : int, optional
        Number of token ids passed to the model on each step.

    Returns
    -------
    str
        Seed plus generated words, title-cased.
    """
    # Build the id -> word lookup once instead of scanning word_index on every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen = max_seq_len, padding = 'pre')

        # `predict_classes` no longer exists on current Keras models; taking the
        # argmax over the predicted scores gives the same class id as a plain int.
        scores = model.predict(token_list, verbose = 0)
        predicted = int(np.argmax(scores, axis = -1)[0])

        # Ids with no word (e.g. the padding id 0) decode to an empty string.
        output_word = index_to_word.get(predicted, '')
        seed_text = seed_text + " " + output_word

    return seed_text.title()

In [52]:
print(generate_words("the man in the black", 10, model))

The Man In The Black And A A Is And A A Is And A
