# Text generation

The goal of this project is to demonstrate text generation using LSTM neural networks.
Our dataset contains numerous movie plots taken from Wikipedia, so we will generate similar text.

In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.models import Sequential

import warnings
# Suppress all warnings so that library notices do not clutter the cell outputs.
warnings.filterwarnings("ignore")
# Additionally ignore FutureWarning explicitly.
# NOTE(review): this looks redundant with the blanket filter above — confirm before removing.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Random seeding is disabled below, so training results are not reproducible between runs.
# NOTE(review): `set_random_seed` appears to be the TensorFlow 1.x name — confirm the
# installed TensorFlow version before re-enabling these lines.
#from tensorflow import set_random_seed
#from numpy.random import seed

#set_random_seed(2)
#seed(1)


In [6]:
# Read the movie-plot CSV and keep only the rows that actually have a plot.
data = pd.read_csv("movie_plots.csv")
has_plot = data['Plot'].notnull()
data = data[has_plot]
movie_plots = data['Plot']
print("Number of plots: ", len(movie_plots))
# From here on, work with the first 20 plots only.
movie_plots = movie_plots.head(20)

Number of plots:  34886


## Tokenize words

In Natural Language Processing projects, the first step is usually the removal of punctuation and stop words such as "the", "a", and "an". We will keep the stop words, since we want to generate human-like text (note that the Keras `Tokenizer` still strips punctuation by default).
Tokenization maps each unique word to a unique integer. This step is necessary to prepare the data for the network (for example, for an embedding layer).

In [7]:
# Upper bound passed to the tokenizer as `num_words`.
max_words = 50000
plot_texts = movie_plots.values

# Learn the word -> integer index mapping from the selected plots.
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(plot_texts)

# Encode every plot as a list of word indices and bring each row to exactly 80 entries:
# longer plots lose their tail (truncating='post'), shorter ones are padded using the
# pad_sequences defaults.
sequences = pad_sequences(
    tokenizer.texts_to_sequences(plot_texts),
    maxlen = 80,
    truncating = 'post',
)
sequences.shape

(20, 80)

In [8]:
# Flatten the (plots x tokens) matrix row by row into one long token stream,
# so that fixed-length sliding windows can be cut from it.
text = list(sequences.reshape(-1))
vocab_size = len(tokenizer.word_index)

In [9]:
print("Vocabulary size: ", vocab_size)

# Invert the tokenizer's word -> index table so that token indices can be decoded back to words.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

Vocabulary size:  985


### Splitting the data into input and output values

In [10]:
seq_len = 20

# Slide a window of seq_len tokens over the stream: each window is one input sample
# and the token immediately after the window is the value to predict.
window_starts = range(len(text) - seq_len)
dataX = [text[start:start + seq_len] for start in window_starts]
dataY = [text[start + seq_len] for start in window_starts]

size = len(dataX)
print(size)

1580


In [11]:
# Turn the Python lists of windows and targets into NumPy arrays.
dataX, dataY = (np.asarray(samples) for samples in (dataX, dataY))
dataX.shape, dataY.shape

((1580, 20), (1580,))

In [12]:
# Keras LSTM layers expect input shaped (samples, timesteps, features); each timestep here
# carries a single feature, the raw token index.
trainX = np.reshape(dataX, (size, seq_len, 1))

# One-hot encode the targets with the `to_categorical` already imported in the first cell.
# The previous `from keras.utils import np_utils` import is gone: that module is not
# available in newer Keras releases and duplicated a name that was already in scope.
# num_classes is pinned to vocab_size + 1 (index 0 is the padding value, word indices start
# at 1) so the label width always covers the whole vocabulary instead of depending on the
# largest target index that happens to occur in dataY.
trainy = to_categorical(dataY, num_classes = vocab_size + 1)
trainy.shape

(1580, 986)

### Creating a model

In [13]:
import keras
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

# Single-layer LSTM model: a window of token indices goes in, and a softmax over
# every target class comes out.
timesteps, n_features = trainX.shape[1:]
n_classes = trainy.shape[1]

model = Sequential([
    LSTM(256, input_shape = (timesteps, n_features)),
    Dropout(0.1),
    Dense(n_classes, activation = 'softmax'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 986)               253402    
Total params: 517,594
Trainable params: 517,594
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['acc'],
)

# Save to `filepath` whenever the training loss reaches a new minimum.
filepath = "./weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True,
    verbose = 1,
)
callbacks = [checkpoint]

hist = model.fit(
    trainX,
    trainy,
    batch_size = 128,
    epochs = 20,
    callbacks = callbacks,
    verbose = 1,
)

Epoch 1/20

Epoch 00001: loss improved from inf to 6.47124, saving model to ./weights.hdf5
Epoch 2/20

Epoch 00002: loss improved from 6.47124 to 5.27591, saving model to ./weights.hdf5
Epoch 3/20

Epoch 00003: loss improved from 5.27591 to 4.99426, saving model to ./weights.hdf5
Epoch 4/20

Epoch 00004: loss improved from 4.99426 to 4.86287, saving model to ./weights.hdf5
Epoch 5/20

Epoch 00005: loss improved from 4.86287 to 4.76416, saving model to ./weights.hdf5
Epoch 6/20

Epoch 00006: loss improved from 4.76416 to 4.69570, saving model to ./weights.hdf5
Epoch 7/20

Epoch 00007: loss improved from 4.69570 to 4.62751, saving model to ./weights.hdf5
Epoch 8/20

Epoch 00008: loss improved from 4.62751 to 4.56267, saving model to ./weights.hdf5
Epoch 9/20

Epoch 00009: loss improved from 4.56267 to 4.49021, saving model to ./weights.hdf5
Epoch 10/20

Epoch 00010: loss improved from 4.49021 to 4.43504, saving model to ./weights.hdf5
Epoch 11/20

Epoch 00011: loss improved from 4.43504 

In [51]:
# Restore the parameters stored in the checkpoint file, then recompile the model
# (this compile call registers no metrics).
filename = "weights.hdf5"
model.load_weights(filename)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy')
