## Imports

In [1]:
#!/usr/bin/env python
# coding: utf-8


"""
Follows 
https://www.tensorflow.org/tutorials/text/text_classification_rnn
and 
https://www.tensorflow.org/tutorials/text/classify_text_with_bert
"""
import glob
import os
import pickle as pkl
import shutil
from pathlib import Path
import random
import re
import sys
from datetime import datetime
import logging

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
# For pre-trained embeddings:
import tensorflow_hub as hub
import tensorflow_text as text

## Track when we did the experiment

In [2]:
# Timestamp marking the beginning of this run; it is also used further down
# to name the per-run output directory.
start = datetime.now()
print(f"{start}")

2021-02-03 19:38:22.668781


## Global training config

In [3]:
# How much it loads into memory for sampling - we're ok just loading everything at once as it isn't a lot of data
# Size of the shuffle buffer handed to `Dataset.shuffle` - we're ok just loading
# everything at once as it isn't a lot of data
BUFFER_SIZE = 10000
# Batch for gradient averaging
BATCH_SIZE = 64
# Specify encoding of words
SUBSET_VOCAB_SIZE = 5000
# Minute-resolution timestamp of this run, used to name the output directory
START = start.strftime('%Y-%m-%d-%H%M')
OUTPUT_DIR = Path(f'/rds/general/user/al3615/home/RNN_for_movie_gross_prediction/jobs/pretrained_embeddings/outputs/{START}')
# `parents=True` creates missing intermediate directories; `exist_ok=True` lets
# this cell be re-run within the same minute without raising FileExistsError.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
GROSS_SYNOPSES_PATH = Path('/rds/general/user/al3615/home/RNN_for_movie_gross_prediction/complete10000_films_and_synopsis.pickle')

## Helper functions

In [4]:
def plot_graphs(history, metric, prefix=''):
    """
    Plot a training metric and its validation counterpart against epochs and save the figure.

    The training curve is drawn on the left y-axis (red) and the validation curve on a
    twin right y-axis (blue), so the two can have different scales.

    Args:
        history: Object exposing a `history` mapping from metric name to a per-epoch
            sequence of values; must contain both `metric` and `"val_" + metric`.
        metric (str): Name of the training metric to plot, e.g. "loss".
        prefix (str): Optional prefix for the output file name; a "-" separator is
            appended automatically when it is non-empty.

    Side effects:
        Writes `{prefix}{metric}_history.png` into `OUTPUT_DIR` at 300 dpi.
    """
    if prefix:
        prefix = f"{prefix}-"
    val_metric = "val_" + metric

    fig, ax1 = plt.subplots(figsize=(8.0, 5.0))

    color = "tab:red"
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel(metric, color=color)
    (train_line,) = ax1.plot(history.history[metric], color=color, label=metric)
    ax1.tick_params(axis="y", labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = "tab:blue"
    ax2.set_ylabel(val_metric, color=color)  # we already handled the x-label with ax1
    (val_line,) = ax2.plot(history.history[val_metric], color=color, label=val_metric)
    ax2.tick_params(axis="y", labelcolor=color)

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    # The two curves live on different axes, so a legend built from the current axes
    # alone would only see the validation line. Pass both handles explicitly so each
    # curve gets its own, correctly labelled entry.
    ax2.legend(handles=[train_line, val_line])
    fig.savefig(OUTPUT_DIR / f"{prefix}{metric}_history.png", dpi=300)


def split_data_into_input_and_output(data):
    """Take given data of format from scraper [link] and return the inputs and outputs separated.

    Args:
        data (list): A numpy array/list of records which contain entries for 'gross',
        'title', 'synopsis' and 'year'. May be empty.

    Returns:
        tuple: `(synopses, grosses)` as two numpy arrays with one element per record,
        in the same order as `data`. Both arrays are empty when `data` is empty.
    """
    synopses = []
    grosses = []
    # Single pass with explicit lists: order is preserved and, unlike unpacking
    # `zip(*pairs)`, an empty input simply yields two empty arrays instead of raising.
    for record in data:
        synopses.append(record["synopsis"])
        grosses.append(record["gross"])
    return np.array(synopses), np.array(grosses)


def add_signal(data):
    """
    Inject the target into the input as a sanity check: a network cannot be fitted to
    data that carries no signal, so the gross of each film is appended to its synopsis.

    The records are modified in place; nothing is returned.

    Args:
        data (list): A numpy array/list of records which contain entries for 'gross',
        'title', 'synopsis' and 'year'.
    """
    for row in data:
        suffix = f' The film grossed ${row["gross"]}'
        row["synopsis"] += suffix


def _filter_records(records, keep, dtype, removed_description):
    """Return a new array holding the records for which `keep(record)` is true, and print how many survived."""
    kept = np.fromiter((rec for rec in records if keep(rec)), dtype=dtype)
    print(f"Crushed {len(records)} to {len(kept)} after removing {removed_description}")
    return kept


def clean_copy(data, min_length=10, max_length=50, min_earning=0,max_earning=np.exp(30)):
    """
    Return a filtered copy of `data`, leaving `data` itself untouched.

    Four filters are applied in sequence, each printing the record count before and
    after it ran. All bounds are exclusive:

      1. synopsis has more than `min_length` whitespace-separated words
      2. synopsis has fewer than `max_length` whitespace-separated words
      3. gross is greater than `min_earning`
      4. gross is less than `max_earning`

    Args:
        data: Numpy array with a `dtype` attribute whose records expose 'synopsis'
            (a string) and 'gross' (a number).
        min_length (int): Exclusive lower bound on the synopsis word count.
        max_length (int): Exclusive upper bound on the synopsis word count.
        min_earning (float): Exclusive lower bound on gross.
        max_earning (float): Exclusive upper bound on gross.

    Returns:
        numpy.ndarray: New array with the same dtype as `data` containing only the
        records that passed every filter, in their original order.
    """
    def word_count(rec):
        return len(rec["synopsis"].split())

    # (predicate, description-of-what-is-removed) pairs, applied in this order.
    stages = (
        (lambda rec: word_count(rec) > min_length, f"sub {min_length} word synopses"),
        (lambda rec: word_count(rec) < max_length, f"super {max_length} word synopses"),
        (lambda rec: rec['gross'] > min_earning, f"sub {min_earning} gross"),
        (lambda rec: rec['gross'] < max_earning, f"super {max_earning} gross"),
    )

    cleaned_data = data
    for keep, removed_description in stages:
        cleaned_data = _filter_records(cleaned_data, keep, data.dtype, removed_description)
    return cleaned_data


def confusion_plot(lab, pred, name):
    """
    Add one truth-vs-prediction scatter series to the current matplotlib axes.

    Every call also draws the identity series (label against label) in black as the
    "truth" reference; duplicate legend entries from repeated calls are collapsed so
    each label appears only once.

    Args:
        lab: Real-valued labels.
        pred: Real-valued predictions for those labels.
        name: Legend label for this prediction series.

    The figure is built up cumulatively across calls, so the caller still has to
    invoke plt.show() or plt.savefig() afterwards.
    """
    plt.scatter(lab, lab, label="truth", s=2, color="black")
    plt.scatter(lab, pred, label=name, s=2)

    axes = plt.gca()
    handles, labels = axes.get_legend_handles_labels()
    # Later handles overwrite earlier ones that share a label, de-duplicating the legend.
    unique_entries = {}
    for entry_label, handle in zip(labels, handles):
        unique_entries[entry_label] = handle
    axes.legend(unique_entries.values(), unique_entries.keys())

    axes.set_xlabel("Truth")
    axes.set_ylabel("Prediction")
    
def data_split(data, valid_fraction, test_fraction, train_fraction=None):
    """
    Returns `data` split into (test_set, validation_set, training_set) where the 
    """
    
    if train_fraction is None:
        train_fraction = 1 - test_fraction - valid_fraction
    rng = np.random.default_rng()
    rng.shuffle(data)
    len_d = len(data)
    test_idx = int(len_d*test_fraction)
    valid_idx = test_idx + int(len_d*valid_fraction)
    # Just checking method is consistent
    train_idx = valid_idx + int(len_d*train_fraction)
    assert train_idx == len_d
    return (data[:test_idx], data[test_idx:valid_idx], data[valid_idx:])

## Load, clean, shuffle, split and cast data into Tensorflow compatible format

In [5]:
raw_data = pkl.load(open(GROSS_SYNOPSES_PATH, "rb"))
data = clean_copy(raw_data)
test, valid, train = data_split(raw_data, valid_fraction=0, test_fraction=0.15)

# Fraction of overall data
train_data_in, train_data_out = split_data_into_input_and_output(train)
# valid_data_in, valid_data_out = split_data_into_input_and_output(valid)
test_data_in, test_data_out = split_data_into_input_and_output(test)

# Make dataset objects
train_dataset = tf.data.Dataset.from_tensor_slices((train_data_in, train_data_out))
test_dataset = tf.data.Dataset.from_tensor_slices((test_data_in, test_data_out))

# Prefetch parrallelising loading + execution (not huge so not necessary)
# Dont need padded_batch as we are encoding and padding later
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(5)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(5)

TRAIN_STEPS_PER_EPOCH = len(train_data_in) // BATCH_SIZE
# VALID_STEPS_PER_EPOCH = len(valid_data_in) // BATCH_SIZE
TEST_STEPS_PER_EPOCH = len(test_data_in) // BATCH_SIZE

Crushed 10000 to 9742 after removing sub 10 word synopses
Crushed 9742 to 9209 after removing super 50 word synopses
Crushed 9209 to 9209 after removing sub 0 gross
Crushed 9209 to 9209 after removing super 10686474581524.463 gross


## Build model

In [6]:
def build_model():
    """
    Assemble the regression network: raw text -> BERT preprocessing -> frozen BERT
    encoder -> bidirectional LSTM -> two ReLU dense layers -> one linear output unit.

    Returns:
        tf.keras.Model: Uncompiled model taking a batch of strings as input.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

    # Step 1: tokenize batches of text inputs. The packer takes a list of segments;
    # there is only one segment per example here.
    tokenizer_layer = hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
    segments = [tokenizer_layer(text_input)]

    # Step 2: pack the tokenized segments for the Transformer encoder.
    # NOTE(review): clean_copy's max_length counts whitespace-separated words, not
    # tokenizer output - confirm that 50 is long enough for the synopses kept.
    seq_length = 50
    packer_layer = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments={"seq_length": seq_length},
        name='input_packer',
    )
    encoder_inputs = packer_layer(segments)

    # Pre-trained encoder; trainable=False keeps its weights fixed during training.
    bert_encoder = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1",
        trainable=False,
        name='BERT_encoder',
    )
    # Only "sequence_output" is used; "pooled_output" is left unused.
    token_embeddings = bert_encoder(encoder_inputs)["sequence_output"]

    recurrent = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(10, name='LSTM'), name='Bidirectional'
    )
    features = recurrent(token_embeddings)
    features = tf.keras.layers.Dense(64, activation='relu', name='hidden-1')(features)
    features = tf.keras.layers.Dense(32, activation='relu', name='hidden-2')(features)
    prediction = tf.keras.layers.Dense(1, name='classifier')(features)
    return tf.keras.Model(text_input, prediction)

In [7]:
pre_trained_model = build_model()

#TODO: See if this improves results
# Staircase decay: the rate is multiplied by 0.1 after every 25 epochs' worth of steps.
initial_learning_rate = 0.1
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=TRAIN_STEPS_PER_EPOCH * 25,
    decay_rate=0.1,
    staircase=True,
)

# Regression set-up: optimise mean squared error, report absolute and percentage error.
regression_metrics = [
    tf.keras.metrics.MeanAbsoluteError(name='mae'),
    tf.keras.metrics.MeanAbsolutePercentageError(name='maep'),
]
pre_trained_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=regression_metrics,
)

In [8]:
# Show the model summary in the notebook and also store a wider (120 column) copy
# next to the other outputs of this run.
with open(OUTPUT_DIR/'model_summary.txt', 'w') as summary_file:
    def _write_summary_line(line):
        summary_file.write(f'{line}\n')

    pre_trained_model.summary()
    pre_trained_model.summary(line_length=120, print_fn=_write_summary_line)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
tokenizer (KerasLayer)          (None, None, None)   0           text[0][0]                       
__________________________________________________________________________________________________
input_packer (KerasLayer)       {'input_mask': (None 0           tokenizer[0][0]                  
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'pooled_output': (N 35068417    input_packer[0][0]               
                                                                 input_packer[0][1]    

## Train

In [9]:
checkpoint_dir = OUTPUT_DIR / "pre_trained_checkpoints"
checkpoint_path = checkpoint_dir / "{epoch}"
# Callback that saves the model's weights once every two epochs' worth of batches
# NOTE: Important to delete the ones we aren't interested in!
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    save_weights_only=True,
    save_freq=TRAIN_STEPS_PER_EPOCH * 2,
    verbose=1,
)
# Per-epoch metrics are appended to a CSV so an interrupted run keeps its history.
csv_logger = tf.keras.callbacks.CSVLogger(filename=OUTPUT_DIR / 'training.csv', append=True)
early_stopping = tf.keras.callbacks.EarlyStopping(mode='min', patience=10, restore_best_weights=True)

class LearningRateLoggingCallback(tf.keras.callbacks.Callback):
    """Keras callback that prints the optimizer's current learning rate at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """
        Print the learning rate in effect for the upcoming epoch.

        Args:
            epoch (int): Index of the epoch about to start (unused; the optimizer's own
                step counter is the source of truth).
            logs (dict, optional): Unused.
        """
        optimizer = self.model.optimizer
        lr = tf.keras.backend.get_value(optimizer.learning_rate)
        # When the optimizer was built with a schedule, evaluate it at the number of
        # steps actually taken so far. This replaces a hard-coded steps-per-epoch
        # figure that went stale whenever the dataset or batch size changed. A plain
        # (non-callable) learning rate is printed as-is.
        if callable(lr):
            lr = lr(optimizer.iterations)
        print(f'Learning rate {lr}')
lr = LearningRateLoggingCallback()

In [None]:
train_start = datetime.now()
# NOTE(review): `early_stopping` is created with the other callbacks but is not in
# this list - confirm whether that is intentional.
training_callbacks = [cp_callback, csv_logger, lr]
pre_trained_history = pre_trained_model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=TEST_STEPS_PER_EPOCH,
    epochs=100,
    callbacks=training_callbacks,
    verbose=2,
)

Learning rate 0.10000000149011612
Epoch 1/100
133/133 - 148s - loss: 3729947436777472.0000 - mae: 30009608.0000 - maep: 618.1891 - val_loss: 3457763548069888.0000 - val_mae: 32851384.0000 - val_maep: 1590.1746
Learning rate 0.10000000149011612
Epoch 2/100

Epoch 00002: saving model to /rds/general/user/al3615/home/RNN_for_movie_gross_prediction/jobs/pretrained_embeddings/outputs/2021-02-03-1938/pre_trained_checkpoints/2
133/133 - 147s - loss: 3236828819750912.0000 - mae: 32934534.0000 - maep: 1550.4521 - val_loss: 3458896882565120.0000 - val_mae: 33544576.0000 - val_maep: 1677.6145
Learning rate 0.10000000149011612
Epoch 3/100
133/133 - 146s - loss: 3237406224416768.0000 - mae: 33048640.0000 - maep: 1562.9899 - val_loss: 3457731067379712.0000 - val_mae: 32868538.0000 - val_maep: 1592.3894
Learning rate 0.10000000149011612
Epoch 4/100

Epoch 00004: saving model to /rds/general/user/al3615/home/RNN_for_movie_gross_prediction/jobs/pretrained_embeddings/outputs/2021-02-03-1938/pre_trained_

In [None]:
# Wall-clock duration of the fit call above.
elapsed = datetime.now() - train_start
print(f"Training took {elapsed}")

## Recover interrupted session

Model

In [None]:
# # # Loading existing checkpoint if needed
# home = '/rds/general/user/al3615/home'
# model_path = home + '/' + 'RNN_for_movie_gross_prediction/jobs/pretrained_embeddings/outputs/2021-01-24-0401/pre_trained_checkpoints/0044/variables/variables.index'
# # pre_trained_model = tf.keras.models.load_model(model_path)
# # pre_trained_model.load_weights(model_path)

# # latest = tf.train.latest_checkpoint(model_dir)
# # print(latest)
# checkpoint = tf.train.Checkpoint(model=pre_trained_model) 
# checkpoint.restore(model_path).run_restore_ops()


History

In [None]:
# class History:
#     def __init__(self, h):
#         self.history = h

# import csv
# home = '/rds/general/user/al3615/home/'
# training_log = home + 'RNN_for_movie_gross_prediction/jobs/pretrained_embeddings/outputs/2021-01-28-2142/training.csv'
# d = dict()
# with open(training_log) as f:
#     reader = csv.DictReader(f)
#     for row in reader:
#         for key in row:
#             if key not in d:
#                 d[key] = list()
#             d[key].append(float(row[key]))
#     for key in row:
#         d[key] = np.array(d[key])
# pre_trained_history = History(d)

## Evaluate performance

In [None]:
# Predictions for both splits, plus the mean training label which serves as the
# constant-prediction reference line in the plot below.
pred_train = pre_trained_model.predict(train_data_in)
pred_test = pre_trained_model.predict(test_dataset)
trained_mean = train_data_out.mean()

In [None]:
# Truth-vs-prediction scatter for both splits on one figure, with a dashed grey
# line at the mean training label for reference.
plt.figure(figsize=(11, 8), dpi=300)
for split_name, split_labels, split_predictions in (
    ("train", train_data_out, pred_train),
    ("test", test_data_out, pred_test),
):
    confusion_plot(split_labels, split_predictions, split_name)
plt.hlines(y=trained_mean, xmin=0.0, xmax=9e8, color='grey', linestyles='dashed')
plt.savefig(OUTPUT_DIR / "pretrained_confusion.png")

In [None]:
# One training-vs-validation history figure per tracked quantity.
for metric_name in ("loss", "mae", "maep"):
    plot_graphs(pre_trained_history, metric_name)

## Clean up checkpoints we don't want

Keeps the latest checkpoint.

In [None]:
# checkpoints_of_interest = list(range(15))
# weights_only = False
# home = '/rds/general/user/al3615/home/'
# checkpoint_dir = home + 'RNN_for_movie_gross_prediction/jobs/pretrained_embeddings/outputs/2021-01-25-0505/pre_trained_checkpoints'
# checkpoint_dir = Path(checkpoint_dir)
# checkpoints = list(checkpoint_dir.glob("[0-9]*"))
# checkpoints.sort(key=os.path.getctime, reverse=True)
# checkpoints.pop(0)
# if weights_only:
#     checkpoints.pop(0)
# if checkpoints:
#     print(checkpoints)
#     for ckpt in checkpoints:
#         ckpt_num = int(ckpt.stem)
#         if ckpt_num not in checkpoints_of_interest:
#             print(f"Deleting {ckpt_num}")
#             if ckpt.is_dir():
#                 shutil.rmtree(ckpt)
#             else:
#                 ckpt.unlink()