# Text generation

The goal of this project is to demonstrate text generation using LSTM neural networks.
Our database contains numerous movie plots taken from Wikipedia, so we will generate something similiar.

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.models import Sequential
import random 
import matplotlib.pyplot as plt
import warnings
import os, multiprocessing

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


In [3]:
# Computer characteristics

mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
mem_gib = mem_bytes / (1024.**3)
print("Computer characteristics: ")
print("RAM: %f GB" % mem_gib)
print("CORES: %d" % multiprocessing.cpu_count())


Computer characteristics: 
RAM: 7.653069 GB
CORES: 4


In [88]:
data = pd.read_csv("movie_plots.csv")

movie_plots = data['Plot']

def count_words(text):
    return len(text.split())

print("Max movie plot len: ", movie_plots.map(count_words).max())
print("Min movie plot len: ", movie_plots.map(count_words).min())


# zadrzavamo samo opise sa vise od 300 reci
movie_plots = movie_plots[movie_plots.map(count_words) > 299]
print("Min movie plot len: ", movie_plots.map(count_words).min())
# i zadrzavamo sve koji imaju manje od 500 reci
movie_plots = movie_plots[movie_plots.map(count_words) < 501]
print("Max movie plot len: ", movie_plots.map(len).max())

all_plots = list(movie_plots.values)
print(len(all_plots))
# setting a seed so we get the same result every time
random.seed(5)
sample = random.sample(all_plots, 100)
sample[6]

Max movie plot len:  6752
Min movie plot len:  2
Min movie plot len:  300
Max movie plot len:  3340
5948


'The story describes an encounter between a Parisian tailor named Maurice Courtelin (Chevalier) and a family of local aristocrats. These include Vicomte Gilbert de Varèze (Ruggles), who owes Maurice a large amount of money for tailoring work; Gilbert\'s uncle the Duc d\'Artelines (C. Aubrey Smith), the family patriarch; d\'Artelines\' man-hungry niece Valentine (Loy); and his other 22-year-old niece, Princesse Jeanette (MacDonald), who has been a widow for three years. D\'Artelines has been unable to find Jeanette a new husband of suitable age and rank. The household also includes three aunts and an ineffectual suitor the Comte de Savignac (Butterworth).\r\nMaurice custom-tailors clothing for de Varèze on credit, but the Vicomte\'s unpaid tailoring bills become intolerable, so Maurice travels to de Savignac\'s castle to collect the money owed to him. On the way, he has a confrontation with Princesse Jeanette. He immediately professes his love for her, but she haughtily rejects him.\r\n

## Tokenize words

Tokenization is turning unique words into unique integers. This step is necessary for preparing data for embedding layer.

In [85]:
from keras.preprocessing.text import Tokenizer

max_words = 10000
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(sample)

reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))


sequences = tokenizer.texts_to_sequences(sample)
sequences_len = 300
sequences = pad_sequences(sequences, maxlen = sequences_len, truncating = 'post')


sequences.shape
print(sequences[64])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [69]:
# making a single list of tokens so we can apply sliding windows

text = [item for sublist in sequences for item in sublist]
print("Corpus size: ", len(text))
vocab_size = len(tokenizer.word_index)
print("Vocabulary size: ", vocab_size+1)


Corpus size:  30000
Vocabulary size:  2551


In [29]:
# reverse dictionary so we can decode tokenized sequences back to words

reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))


### Splitting the data for input and output values

Input sequence has the size of 20 words, and output is the next word

In [30]:
seq_len = 20

dataX = []
dataY = []

for i in range(len(text)-seq_len):
    seq_in = text[i:i+seq_len]
    seq_out = text[i+seq_len]
    dataX.append(seq_in)
    dataY.append(seq_out)
    
    
lenX = len(dataX)
print(lenX)

29980


In [10]:
from keras.utils import np_utils

dataX = np.asarray(dataX)
dataY = np_utils.to_categorical(dataY)

### Split into train and test set

In [12]:
from sklearn.model_selection import train_test_split


trainX, testX, trainy, testy = train_test_split(dataX, dataY, test_size = 0.2)
print(trainX.shape)
print(testX.shape)
print(trainy.shape)
print(testy.shape)

(15984, 20)
(3996, 20)
(15984, 7003)
(3996, 7003)


### Creating a model

In [13]:
import keras
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

model = Sequential()
model.add(Embedding(vocab_size+1, 32, input_length = trainX.shape[1]))
model.add(LSTM(100,  return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(trainy.shape[1], activation = 'softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 32)            224096    
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 100)           53200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 7003)              707303    
Total params: 1,064,999
Trainable params: 1,064,999
Non-trainable params: 0
_________________________________________________________________


In [36]:
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['acc'])

filepath = "./weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor = 'loss', 
                             verbose = 1, save_best_only = True, mode = 'min')
callbacks = [checkpoint]

hist = model.fit(trainX, trainy, epochs = 20, batch_size = 128, 
                 verbose = 1, callbacks = callbacks, validation_split = 0.2)


Train on 15984 samples, validate on 3996 samples
Epoch 1/20

Epoch 00001: loss improved from inf to 6.77621, saving model to ./weights.hdf5
Epoch 2/20

Epoch 00002: loss improved from 6.77621 to 6.00778, saving model to ./weights.hdf5
Epoch 3/20

Epoch 00003: loss improved from 6.00778 to 5.93310, saving model to ./weights.hdf5
Epoch 4/20

Epoch 00004: loss improved from 5.93310 to 5.54941, saving model to ./weights.hdf5
Epoch 5/20

Epoch 00005: loss improved from 5.54941 to 5.36367, saving model to ./weights.hdf5
Epoch 6/20

Epoch 00006: loss improved from 5.36367 to 5.27126, saving model to ./weights.hdf5
Epoch 7/20

Epoch 00007: loss improved from 5.27126 to 5.21288, saving model to ./weights.hdf5
Epoch 8/20

Epoch 00008: loss improved from 5.21288 to 5.16821, saving model to ./weights.hdf5
Epoch 9/20

Epoch 00009: loss improved from 5.16821 to 5.12374, saving model to ./weights.hdf5
Epoch 10/20

Epoch 00010: loss improved from 5.12374 to 5.08701, saving model to ./weights.hdf5
Epoc

In [18]:
filename = "20e100n.hdf5"
model.load_weights(filename)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])


In [25]:
# Evaluate model on test set (accuracy and error)

print(model.metrics_names)
results = model.evaluate(testX, testy, batch_size = 128)
print('Loss: %.2f'% results[0])
print('Accuracy: %.2f'%(results[1]*100), "%")

['loss', 'acc']
Loss: 5.11
Accuracy: 25.98 %


### Plot za trening skup (preciznost)

In [22]:
plt.plot(hist.history["acc"])
plt.title('Model accuracy: ')
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.show()


'\nplt.plot(hist.history["acc"])\nplt.plot(hist.history["val_acc"])\nplt.title(\'Model accuracy: \')\nplt.legend([\'Train\', \'Validation\'], loc=\'upper right\')\nplt.ylabel("accuracy")\nplt.xlabel("epoch")\nplt.show()\n'

### Plot za trening i val skup (preciznost)

In [None]:
plt.plot(hist.history["acc"])
plt.plot(hist.history["val_acc"])
plt.title('Model accuracy: ')
plt.legend(['Train', 'Validation'], loc='best')
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.show()

### Plot za trening skup (greska)

In [None]:
plt.plot(hist.history["loss"])
plt.title('Model loss: ')
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.show()


### Plot za trening i val skup (greska)

In [None]:
plt.plot(hist.history["loss"])
plt.plot(hist.history["val_loss"])
plt.title('Model loss: ')
plt.legend(['Train', 'Validation'], loc='best')
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.show()

In [23]:
def generate_words(seed_text, num_words, model, max_seq_len = 20):
    for i in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen = max_seq_len, padding = 'pre')
        
        predicted = model.predict_classes(token_list, verbose = 0)
        output_word = ''
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
                
        seed_text = seed_text + " " + output_word
        
    return seed_text.title()

In [24]:
print(generate_words("The movie", 50, model))

The Movie And A Family And A Family And Is Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And Has And
