# Project submission: DEEP LEARNING_TV Script Generation
# Student: Ginju Soumya

# TV Script Generation
In this project, you'll generate your own Simpsons (https://en.wikipedia.org/wiki/The_Simpsons) TV scripts using RNNs. You'll be using part of the Simpsons dataset (https://www.kaggle.com/wcukierski/the-simpsons-by-the-data) of scripts from 27 seasons. The Neural Network you'll build will generate a new TV script for a scene at Moe's Tavern (https://simpsonswiki.com/wiki/Moe's_Tavern).

Get the Data
The data is already provided for you. You'll be using a subset of the original dataset. It consists of only the scenes in Moe's Tavern. This doesn't include other versions of the tavern, like "Moe's Cavern", "Flaming Moe's", "Uncle Moe's Family Feed-Bag", etc.


In [1]:
import os
import pickle

def load_data(path):
    """
    Read a whole text file into memory.
    :param path: Location of the file to read
    :return: The complete contents of the file as a single string
    """
    source = os.path.join(path)
    with open(source, "r") as handle:
        return handle.read()


def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """
    Preprocess the text data and pickle the result to 'preprocess.p'.
    :param dataset_path: Path of the raw text file, handed to load_data
    :param token_lookup: Callable returning a dict that maps punctuation to token strings
    :param create_lookup_tables: Callable taking the list of words and returning
        a tuple (vocab_to_int, int_to_vocab)
    """
    text = load_data(dataset_path)
    # Ignore notice, since we don't use it for analysing the data
    text = text[81:]

    # Surround every punctuation token with spaces so split() isolates it as its own word
    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    # Lowercasing happens after the replacement, so the inserted tokens are lowercased too
    words = text.lower().split()

    vocab_to_int, int_to_vocab = create_lookup_tables(words)
    int_text = [vocab_to_int[word] for word in words]

    # The context manager closes the file even if pickling raises
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)

    
def load_preprocess():
    """
    Load the preprocessed training data from 'preprocess.p'.
    :return: The unpickled object; preprocess_and_save_data stores the tuple
        (int_text, vocab_to_int, int_to_vocab, token_dict)
    """
    # NOTE: pickle can execute arbitrary code; only load files this project wrote itself.
    with open('preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)

def save_params(params):
    """
    Save parameters to the file 'params.p' in the current directory.
    :param params: Any picklable object holding the parameters to store
    """
    # The context manager flushes and closes the file even if pickling raises
    with open('params.p', 'wb') as out_file:
        pickle.dump(params, out_file)

def load_params():
    """
    Load parameters from the file 'params.p' in the current directory.
    :return: The object previously pickled into 'params.p'
    """
    # NOTE: pickle can execute arbitrary code; only load files this project wrote itself.
    with open('params.p', mode='rb') as in_file:
        return pickle.load(in_file)

In [2]:
# Load data
# The dataset is expected as a text file next to the notebook.
data_dir = 'moes_tavern_lines.txt'
text = load_data(data_dir)
# Ignore notice, since we don't use it for analysing the data
# (the notice is taken to be the first 81 characters of the file)
text = text[81:]

In [3]:
text[:200]

"\nMoe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.\nBart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.\nMoe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mik"

In [4]:
# (start, end) slice of text lines printed as a sample at the end of this cell
view_sentence_range = (0, 10)
import numpy as np
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import numpy as np

print('Dataset Stats')
# Dict keys are unique, so the dict's length counts distinct whitespace-separated words
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))
# A blank line (two consecutive newlines) is treated as the scene separator
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
# Newlines inside a scene are used as the per-scene sentence count
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

# Flatten all scenes into one list of lines
sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
# Show the raw lines selected by view_sentence_range
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 11492
Number of scenes: 262
Average number of sentences in each scene: 15.251908396946565
Number of lines: 4258
Average number of words in each line: 11.50164396430249

The sentences 0 to 10:

Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.



In [5]:
import numpy as np

from collections import Counter

def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    word_counts = Counter(text)
    # Order ids by descending frequency. sorted() is stable and Counter keeps
    # first-seen order, so ties resolve by first occurrence and the mapping is
    # the same on every run (plain set iteration order is not).
    vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    vocab_to_int = {word: idx for idx, word in enumerate(vocab)}
    int_to_vocab = dict(enumerate(vocab))
    return vocab_to_int, int_to_vocab

In [6]:
def token_lookup():
    """
    Build the punctuation-to-token mapping used during preprocessing.
    :return: Dict whose keys are punctuation strings and whose values are the
        replacement tokens
    """
    symbol_token_pairs = (
        ('.', '||Period||'),
        (',', '||Comma||'),
        ('"', '||Quotation_Mark||'),
        (';', '||Semicolon||'),
        ('!', '||Exclamation_mark||'),
        ('?', '||Question_mark||'),
        ('(', '||Left_Parentheses||'),
        (')', '||Right_Parentheses||'),
        ('--', '||Dash||'),
        ('\n', '||Return||'),
    )
    return dict(symbol_token_pairs)

In [7]:
# Tokenize the dataset, build the vocabulary tables and pickle everything to 'preprocess.p'
preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [8]:
import numpy as np
# Reload the tuple written by preprocess_and_save_data from 'preprocess.p'
int_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess()

In [9]:
def get_inputs():
    """
    Build the placeholders that feed the graph.
    :return: Tuple (input, targets, learning rate) of TF placeholders; only the
        input placeholder is given an explicit name ('input')
    """
    # Both dimensions are left as None, so the fed shape is not fixed in the graph
    id_shape = [None, None]
    input_placeholder = tf.placeholder(tf.int32, id_shape, name='input')
    target_placeholder = tf.placeholder(tf.int32, id_shape)
    lr_placeholder = tf.placeholder(tf.float32)
    return input_placeholder, target_placeholder, lr_placeholder

In [10]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# NOTE(review): distutils is deprecated in recent Python releases — confirm it
# is available on the target interpreter.
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
# Fail early if the installed TensorFlow is older than 1.3
assert LooseVersion(tf.__version__) >= LooseVersion('1.3'), 'Please use TensorFlow version 1.3 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
# A missing GPU only triggers a warning; execution continues
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.13.1


  


In [11]:
def get_init_cell(batch_size, rnn_size):
    """
    Build a single-layer LSTM wrapped in a MultiRNNCell together with its zero state.
    :param batch_size: Size of batches
    :param rnn_size: Number of units in the LSTM cell
    :return: Tuple (cell, initial state), where the state tensor is named 'initial_state'
    """
    stacked_cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(rnn_size)])
    zero_state = stacked_cell.zero_state(batch_size, tf.float32)
    # The identity op only attaches a name so the tensor can be looked up later
    named_state = tf.identity(zero_state, name='initial_state')
    return stacked_cell, named_state

In [12]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Look up embedding vectors for <input_data>.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    # Embedding matrix with one row per word, initialised uniformly in [-1, 1)
    initial_values = tf.random_uniform([vocab_size, embed_dim], -1.0, 1.0)
    embedding_matrix = tf.Variable(initial_values)
    return tf.nn.embedding_lookup(embedding_matrix, input_data)

In [13]:
def build_rnn(cell, inputs):
    """
    Unroll <cell> over <inputs> with tf.nn.dynamic_rnn.
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State), where the state tensor is named 'final_state'
    """
    # Only dtype is supplied here; no explicit initial_state is passed to dynamic_rnn
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    # The identity op only attaches a name so the tensor can be looked up later
    named_state = tf.identity(last_state, name="final_state")
    return rnn_outputs, named_state

In [14]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Chain embedding, RNN and output layer into the word-prediction network.
    :param cell: RNN cell
    :param rnn_size: Size of rnns (accepted but not used in this function)
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (Logits, FinalState)
    """
    embedded = get_embed(input_data, vocab_size, embed_dim)
    rnn_outputs, last_state = build_rnn(cell, embedded)
    # activation_fn=None keeps the layer linear, so it emits raw logits
    word_logits = tf.contrib.layers.fully_connected(
        rnn_outputs, vocab_size, activation_fn=None)
    return word_logits, last_state

In [15]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy int32 array of shape
        (number of batches, 2, batch_size, seq_length); index 0 on the second
        axis holds the inputs, index 1 the targets (inputs shifted by one word)
    """
    total_batch = len(int_text) // (batch_size * seq_length)
    if total_batch == 0:
        # Not enough text for a single full batch
        return np.zeros((0, 2, batch_size, seq_length), dtype=np.int32)
    len_to_consider = total_batch * batch_size * seq_length

    # Word ids are integers; keep them integral instead of the float default
    input_text = np.array(int_text[:len_to_consider], dtype=np.int32)

    # Targets are the inputs shifted left by one word. The roll wraps the first
    # input around to the final slot; overwrite it with the real next word when
    # the text has one beyond the part that is used.
    label_text = np.roll(input_text, -1)
    if len(int_text) > len_to_consider:
        label_text[-1] = int_text[len_to_consider]

    # Sequence number c lands in batch c % total_batch at row c // total_batch,
    # so consecutive batches continue each row's text.
    layout = (batch_size, total_batch, seq_length)
    inputs = input_text.reshape(layout).transpose(1, 0, 2)
    labels = label_text.reshape(layout).transpose(1, 0, 2)
    return np.stack([inputs, labels], axis=1)

In [16]:
def get_batches(int_text, batch_size, seq_length):
    """
    Split the word ids into aligned input/target batches.
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy int32 array of shape
        (number of batches, 2, batch_size, seq_length); index 0 on the second
        axis holds the inputs, index 1 the targets
    """
    words_per_batch = batch_size * seq_length
    batch_count = len(int_text) // words_per_batch

    # Keep only the words that fill whole batches, then append the very first
    # word so the last target of the last batch wraps around to it.
    kept = int_text[:batch_count * words_per_batch]
    kept.append(kept[0])

    flat = np.asarray(kept).astype(np.int32)

    # Sequence number c goes to batch c % batch_count, row c // batch_count
    layout = (batch_size, batch_count, seq_length)
    inputs = flat[:-1].reshape(layout).transpose(1, 0, 2)
    # Targets are the inputs shifted by one position
    targets = flat[1:].reshape(layout).transpose(1, 0, 2)
    return np.stack((inputs, targets), axis=1)

In [17]:
# Number of Epochs
# (how many times the training loop iterates over all batches)
num_epochs = 300
# Batch Size
# (passed to get_batches)
batch_size =128
# RNN Size
# (passed to get_init_cell as the LSTM size)
rnn_size = 256
# Embedding Dimension Size
# (passed to build_nn, which forwards it to get_embed)
embed_dim = 256
# Sequence Length
# (number of words per training sequence, passed to get_batches)
seq_length = 32
# Learning Rate
# (fed to the learning-rate placeholder on every training step)
learning_rate = 0.01
# Show stats for every n number of batches
show_every_n_batches = 20

# Path handed to the saver when the trained model is written out
save_dir = './save'

In [31]:
#pip install tflearn

Collecting tflearn
  Downloading https://files.pythonhosted.org/packages/16/ec/e9ce1b52e71f6dff3bd944f020cef7140779e783ab27512ea7c7275ddee5/tflearn-0.3.2.tar.gz (98kB)
Building wheels for collected packages: tflearn
  Building wheel for tflearn (setup.py): started
  Building wheel for tflearn (setup.py): finished with status 'done'
  Created wheel for tflearn: filename=tflearn-0.3.2-cp37-none-any.whl size=128213 sha256=57cd91487078cfac9c3d51ceec4f1bb0ed8f088d1c8fcf3ec9f8663fc7824c36
  Stored in directory: C:\Users\Redhawk\AppData\Local\pip\Cache\wheels\d0\f6\69\0ef3ee395aac2e5d15d89efd29a9a216f3c27767b43b72c006
Successfully built tflearn
Installing collected packages: tflearn
Successfully installed tflearn-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [37]:
#import tensorflow

In [67]:
pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading https://files.pythonhosted.org/packages/67/9e/352fd3a8eb98815174fad722443ac183c3d58cf9bd63af7838d0ceb021ef/tensorflow_addons-0.8.2-cp37-cp37m-win_amd64.whl (837kB)
Collecting typeguard (from tensorflow_addons)
  Downloading https://files.pythonhosted.org/packages/06/37/d236aec27f8a8eed66f1a17116eb51684528cf8005a6883f879fe2e842ae/typeguard-2.7.1-py3-none-any.whl
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.8.2 typeguard-2.7.1
Note: you may need to restart the kernel to use updated packages.


In [120]:
import tensorflow as tf

In [134]:
pip install tensorflow==1.13.1

Collecting tensorflow==1.13.1
  Downloading https://files.pythonhosted.org/packages/7b/14/e4538c2bc3ae9f4ce6f6ce7ef1180da05abc4a617afba798268232b01d0d/tensorflow-1.13.1-cp37-cp37m-win_amd64.whl (63.1MB)
Installing collected packages: tensorflow
  Found existing installation: tensorflow 1.13.2
    Uninstalling tensorflow-1.13.2:
      Successfully uninstalled tensorflow-1.13.2
Successfully installed tensorflow-1.13.1
Note: you may need to restart the kernel to use updated packages.


In [18]:
from tensorflow.contrib import seq2seq


# Build the whole training graph inside its own tf.Graph
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    # Dynamic shape of the fed input; its first dimension is used as the batch size
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words
    # (named 'probs' so the tensor can be fetched by name after the graph is reloaded)
    probs = tf.nn.softmax(logits, name='probs')
    
    # Loss function
    # The weights tensor is all ones, so every position contributes equally
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))
    
    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)
    
    # Gradient Clipping
    # Each gradient is clipped element-wise to [-1, 1] before being applied
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                        for grad, var in gradients]
    train_op = optimizer.apply_gradients(capped_gradients)

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


In [19]:
# Pre-compute every (input, target) batch once, before training starts
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(num_epochs):
        # Evaluate the initial state at the start of every epoch, using the first input batch
        state = sess.run(initial_state, {input_text: batches[0][0]})
        for batch_i, (x, y) in enumerate(batches):
            # NOTE(review): build_rnn calls tf.nn.dynamic_rnn without an
            # initial_state argument, so the value fed for initial_state here
            # may not reach the RNN — confirm.
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            # One optimisation step; the returned final state is fed back on the next batch
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)
            # Show every <show_every_n_batches> batches
            # (counted over all batches processed so far, across epochs)
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))
    # Save Model
    # (done once, after the last epoch, while the session is still open)
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/16 train_loss = 8.824
Epoch   1 Batch    4/16 train_loss = 5.384
Epoch   2 Batch    8/16 train_loss = 4.721
Epoch   3 Batch   12/16 train_loss = 4.185
Epoch   5 Batch    0/16 train_loss = 3.745
Epoch   6 Batch    4/16 train_loss = 3.444
Epoch   7 Batch    8/16 train_loss = 3.129
Epoch   8 Batch   12/16 train_loss = 2.860
Epoch  10 Batch    0/16 train_loss = 2.615
Epoch  11 Batch    4/16 train_loss = 2.417
Epoch  12 Batch    8/16 train_loss = 2.236
Epoch  13 Batch   12/16 train_loss = 2.099
Epoch  15 Batch    0/16 train_loss = 1.917
Epoch  16 Batch    4/16 train_loss = 1.786
Epoch  17 Batch    8/16 train_loss = 1.726
Epoch  18 Batch   12/16 train_loss = 1.688
Epoch  20 Batch    0/16 train_loss = 1.569
Epoch  21 Batch    4/16 train_loss = 1.448
Epoch  22 Batch    8/16 train_loss = 1.305
Epoch  23 Batch   12/16 train_loss = 1.212
Epoch  25 Batch    0/16 train_loss = 1.084
Epoch  26 Batch    4/16 train_loss = 1.000
Epoch  27 Batch    8/16 train_loss = 0.918
Epoch  28 B

Epoch 238 Batch   12/16 train_loss = 0.097
Epoch 240 Batch    0/16 train_loss = 0.095
Epoch 241 Batch    4/16 train_loss = 0.088
Epoch 242 Batch    8/16 train_loss = 0.082
Epoch 243 Batch   12/16 train_loss = 0.096
Epoch 245 Batch    0/16 train_loss = 0.093
Epoch 246 Batch    4/16 train_loss = 0.086
Epoch 247 Batch    8/16 train_loss = 0.081
Epoch 248 Batch   12/16 train_loss = 0.096
Epoch 250 Batch    0/16 train_loss = 0.093
Epoch 251 Batch    4/16 train_loss = 0.086
Epoch 252 Batch    8/16 train_loss = 0.081
Epoch 253 Batch   12/16 train_loss = 0.095
Epoch 255 Batch    0/16 train_loss = 0.092
Epoch 256 Batch    4/16 train_loss = 0.085
Epoch 257 Batch    8/16 train_loss = 0.080
Epoch 258 Batch   12/16 train_loss = 0.094
Epoch 260 Batch    0/16 train_loss = 0.092
Epoch 261 Batch    4/16 train_loss = 0.085
Epoch 262 Batch    8/16 train_loss = 0.080
Epoch 263 Batch   12/16 train_loss = 0.094
Epoch 265 Batch    0/16 train_loss = 0.091
Epoch 266 Batch    4/16 train_loss = 0.084
Epoch 267 B

In [20]:
#save_params((seq_length, save_dir))
# Persist the sequence length and checkpoint path through the helper module
import helper
helper.save_params((seq_length, save_dir))

In [21]:
#_, vocab_to_int, int_to_vocab, token_dict = load_preprocess()
#seq_length, load_dir = load_params()

In [22]:
import tensorflow as tf
import numpy as np
import helper

# Reload the vocabulary tables; the first element of the stored tuple is discarded
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
# Reload the values stored earlier with helper.save_params
seq_length, load_dir = helper.load_params()

In [23]:
def get_tensors(loaded_graph):
    """
    Fetch the input, initial state, final state and probabilities tensors by name.
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    # Looked up in the same order as the returned tuple
    tensor_names = ("input:0", "initial_state:0", "final_state:0", "probs:0")
    return tuple(loaded_graph.get_tensor_by_name(name) for name in tensor_names)

In [24]:
import random
#seed = 0
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word as the one with the highest probability.
    :param probabilities: Probabilities of the next word
    :param int_to_vocab: Lookup from word id to word
    :return: The word whose id has the largest probability
    """
    # Greedy choice: always take the most likely id (no random sampling)
    best_id = np.argmax(probabilities)
    return int_to_vocab[best_id]

In [25]:
# Number of words to generate
gen_length = 200
# homer_simpson, moe_szyslak, or Barney_Gumble
prime_word = 'homer_simpson'

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    # The script starts with the prime word followed by a colon
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        # Feed at most the last seq_length generated words, converted to ids
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        # Use the distribution at the last input position to choose the next word
        pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    # Tokens are matched in lowercase and replaced by their punctuation
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        # NOTE: 'ending' is assigned here but not used by the replace below
        ending = ' ' if key in ['\n', '(', '"'] else ''
        tv_script = tv_script.replace(' ' + token.lower(), key)
    # Drop the space left after a newline or an opening parenthesis
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
        
    print(tv_script)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./save
homer_simpson:(awkward chuckle) oopsie.
edna_krabappel-flanders:(" why not?") want it to get weirder?
edna_krabappel-flanders:(sweetly) good call, bart. we can always play it pizzicato.
lenny_leonard: the buyer clearly specified violin in the world rather.
moe_szyslak:(chuckles) i was in bartending school, i thought i had the world by the jigger.
moe_szyslak:(quietly) i wouldn't drink homer?
marge_simpson:...
lenny_leonard: sure beats a minute of silent prayer for any.
carl_carlson: hey, i'm worried.
homer_simpson: i've had just about enough of you.(proudly) i am.
carl_carlson: oh, right! listen, homer, somethin' weird happened back in high school...
homer_simpson:(intrigued) so there's a big girl!
lisa_simpson: i'm a big girl! i'm the moe, you're here to go to get you to look, when i look like, a little more sensitivity around of his face is eighty-se

# The End!