<a href="https://colab.research.google.com/github/Hydrogen-Spoiler-Blocker/back-end/blob/master/MLModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Spoiler blocker LSTM model

The files used to train the model are stored on our personal Google Drive, but they can also be downloaded from these links:  
Smaller dataset: https://www.kaggle.com/rmisra/imdb-spoiler-dataset?select=IMDB_reviews.json  
Mega dataset: https://www.kaggle.com/ebiswas/imdb-review-dataset

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/ so that files
# stored on Drive can be accessed through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import pandas as pd
import numpy as np
import os.path
import zipfile
import tensorflow as tf
import matplotlib.pyplot as plt

Smaller dataset

In [None]:
# path = "/content/drive/MyDrive/IMDB_reviews.json.zip"
# with zipfile.ZipFile(path, 'r') as zip_ref:
#     zip_ref.extractall("/")

In [None]:
# df_load = pd.read_json("/IMDB_reviews.json", lines=True)

# df_shuffled = df_load.sample(frac=1)

# df_true = df_shuffled[df_shuffled['is_spoiler'] == True]
# df_false = df_shuffled[df_shuffled['is_spoiler'] == False]

# df_sliced = df_true.append(df_false)
# tfds_spoiler = (tf.data.Dataset.from_tensor_slices(
#     (
#     df_sliced['review_text'].values,
#     df_sliced['is_spoiler'].values
#     )
# )
# )

In [None]:
# len(tfds_spoiler)

Mega dataset

In [None]:
# Location of the zipped Mega IMDB dataset on the mounted Drive.
path = "/content/drive/MyDrive/(H)ydrogen (u)ranium (ge)rmanium (b)oron (ra)idum (in)dium (s)ulfur/Colab Notebooks/Mega_IMDB.zip"

# "/sample.json" is used as the marker that the archive was already unpacked
# into the filesystem root (presumably it is one of the archive members --
# TODO confirm); the extraction is skipped when it exists.
already_extracted = os.path.isfile("/sample.json")
if not already_extracted:
  with zipfile.ZipFile(path, 'r') as archive:
    archive.extractall("/")

In [None]:
# Build `tfds_spoiler`, a tf.data.Dataset of (review text, spoiler label)
# pairs, from the six JSON shards extracted to the filesystem root.
# Intermediate DataFrames are deleted as soon as they are no longer needed
# to keep peak memory down.

# Read /part-01.json ... /part-06.json.
part_paths = [f"/part-{index:02d}.json" for index in range(1, 7)]
frames = [pd.read_json(part_path) for part_path in part_paths]

df_load_big = pd.concat(frames, ignore_index=True)

# Dropping the list is what actually releases the per-shard frames; deleting
# individual names while the list still references them would free nothing.
del frames

# Shuffle all rows so the per-class head() below takes a random subset.
df_shuffled = df_load_big.sample(frac=1)

del df_load_big

# Keep at most 1,100,000 rows of each class.
df_true = df_shuffled[df_shuffled['spoiler_tag'] == 1].head(1100000)
df_false = df_shuffled[df_shuffled['spoiler_tag'] == 0].head(1100000)

del df_shuffled

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0. Row order (spoilers first) is unchanged.
df_sliced = pd.concat([df_true, df_false])

del df_true, df_false

tfds_spoiler = tf.data.Dataset.from_tensor_slices(
    (
        df_sliced['review_detail'].values,
        df_sliced['spoiler_tag'].values,
    )
)

del df_sliced

In [None]:
# Display the number of (text, label) examples in the dataset.
len(tfds_spoiler)

Create a helper function that uses `matplotlib` to plot a training metric next to its validation counterpart:

In [None]:
def plot_graphs(history, metric):
  """Plot a training metric and its validation counterpart on the current axes.

  Args:
    history: object whose `history` attribute maps metric names to
      per-epoch value sequences (e.g. the return value of `model.fit`).
    metric: name of the training metric; the validation series is read
      from the key `'val_' + metric`.
  """
  val_metric = 'val_'+metric
  series = history.history
  plt.plot(series[metric])
  plt.plot(series[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

Set up the input pipeline


In [None]:
# Split the dataset 75% / 25% into training and test subsets.
total_size = len(tfds_spoiler)
train_size = int(0.75 * total_size)
test_size = int(0.25 * total_size)

# reshuffle_each_iteration=False is essential here: by default the shuffle is
# redone on every pass over the data, so take()/skip() would slice a different
# permutation each epoch and test examples would leak into the training set.
# With a fixed order the two subsets stay disjoint.
tfds_spoiler = tfds_spoiler.shuffle(total_size, reshuffle_each_iteration=False)
train_spoiler = tfds_spoiler.take(train_size)
test_spoiler = tfds_spoiler.skip(train_size)
test_spoiler = test_spoiler.take(test_size)

The split yields datasets of `(text, label)` pairs. Count how many examples of each class ended up in the test set:

In [None]:
# Tally how many test examples carry a spoiler label and how many do not.
spoiler_true = 0
spoiler_false = 0
for _review, label in test_spoiler:
  is_spoiler = bool(label.numpy() == 1)
  spoiler_true += is_spoiler
  spoiler_false += not is_spoiler

In [None]:
# Show the class balance of the test split: spoiler count, then non-spoiler count.
print(spoiler_true, spoiler_false)

Next shuffle the data for training and create batches of these `(text, label)` pairs:

In [None]:
# Size of the shuffle buffer applied to the training data.
BUFFER_SIZE = 10000
# Number of examples per batch.
BATCH_SIZE = 128

In [None]:
# Training stream: shuffle, then batch, then prefetch.
train_dataset = train_spoiler.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Test stream: batched and prefetched only, no shuffling.
test_dataset = test_spoiler.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Peek at the first three raw texts and labels of one training batch.
for example, label in train_dataset.take(1):
  first_texts = example.numpy()[:3]
  first_labels = label.numpy()[:3]
  print('texts: ', first_texts)
  print()
  print('labels: ', first_labels)

## Create the text encoder

In [None]:
# Upper bound on the number of tokens kept in the encoder's vocabulary.
VOCAB_SIZE = 2000

encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE,
)

# The vocabulary is built from the training texts only; labels are dropped.
training_texts = train_dataset.map(lambda text, _label: text)
encoder.adapt(training_texts)

In [None]:
def vectorize_text(text, label):
  """Run `text` through the module-level `encoder`, passing `label` through.

  A trailing axis is appended to `text` before it is handed to the encoder.

  Returns:
    A `(encoded_text, label)` tuple.
  """
  text_column = tf.expand_dims(text, axis=-1)
  encoded = encoder(text_column)
  return encoded, label

In [None]:
# Apply the text encoder to every batch so the model receives token ids
# instead of raw strings; labels are passed through unchanged.
vec_train_dataset = train_dataset.map(vectorize_text)
vec_test_dataset = test_dataset.map(vectorize_text)

In [None]:
# Peek at the first three encoded texts and labels of one training batch.
for example, label in vec_train_dataset.take(1):
  encoded_head = example.numpy()[:3]
  label_head = label.numpy()[:3]
  print('texts: ', encoded_head)
  print()
  print('labels: ', label_head)

In [None]:
# Copy the encoder's vocabulary into a NumPy array and display its first
# 20 entries.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

## Create and train the model

In [None]:
# model = tf.keras.models.load_model("/content/drive/MyDrive/ML_Model")

# model.summary()

In [None]:
# Two stacked bidirectional LSTMs on top of a learned embedding, followed by
# a small dense head. Dropout (rate 0.5) follows each recurrent/dense stage.
keras_layers = tf.keras.layers
vocabulary_length = len(encoder.get_vocabulary())

model = tf.keras.Sequential([
    # One 64-dimensional vector per vocabulary entry; mask_zero=True makes
    # downstream layers skip positions whose token id is 0.
    keras_layers.Embedding(vocabulary_length, 64, mask_zero=True),
    # First recurrent layer emits the full sequence for the second one.
    keras_layers.Bidirectional(keras_layers.LSTM(64,  return_sequences=True)),
    keras_layers.Dropout(0.5),
    keras_layers.Bidirectional(keras_layers.LSTM(32)),
    keras_layers.Dropout(0.5),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.5),
    # Single output unit without activation: the model outputs a raw score.
    keras_layers.Dense(1),
])

In [None]:
# The model's last layer has no activation, so it outputs logits; the loss is
# told so via from_logits=True. The accuracy metric must agree with that:
# the plain 'accuracy' string resolves to binary accuracy with a 0.5 threshold
# applied to the raw output, whereas the decision boundary for a logit is 0.0
# (sigmoid(0) == 0.5). The metric keeps the name 'accuracy' so the history
# keys 'accuracy' / 'val_accuracy' stay the same.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0,
                                                       name='accuracy')])

In [None]:
# Train for 15 epochs. Validation at the end of each epoch runs on only
# 10 batches of the test data (validation_steps=10), not the full test set.
history = model.fit(vec_train_dataset, epochs=15,
                    validation_data=vec_test_dataset,
                    validation_steps=10)

In [None]:
# Draw the accuracy and loss curves side by side in one figure.
plt.figure(figsize=(16,6))
for panel, metric_name in enumerate(('accuracy', 'loss'), start=1):
  plt.subplot(1, 2, panel)
  plot_graphs(history, metric_name)

In [None]:
# Evaluate on the whole vectorized test set and report both numbers.
evaluation = model.evaluate(vec_test_dataset)
test_loss, test_acc = evaluation

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

Predict on a sample text

In [None]:
# Encode one hand-written review and run it through the model.
# NOTE: the model's final layer has no activation, so the printed value is a
# raw score (logit), not a probability.
sample_text = ('The Avengers go back in time to get the Infinity Stones before they are found at various times in the MCU (Natasha sacrifices herself so Clint can get the Soul Stone). Stark develops a gauntlet which the Hulk puts on and uses to snap back the beings who were killed by the original snap. Thanos arrives and wages a full on war against all the heroes from the MCU movies. When Stark and Thanos fight, Stark takes the stones and uses them to eliminate Thanos and his army so that the universe may live in peace. The power of the stones is too much, and Stark dies. After the funeral, Banner and Wilson help Rogers go back in time to return the stones to their places of origin. Rogers returns as an old man to give the shield to Wilson. We are then shown Rogers dancing with Peggy Carter, and they kiss as the movie ends. There are no mid or end credit scenes.')
predictions = model.predict([encoder([sample_text])])
print(predictions)

Save the model and export it to TensorFlow.js

In [None]:
# Install the tensorflowjs package (IPython shell escape), then save the
# trained model to Drive and additionally export it with the tensorflowjs
# Keras converter.
!pip install tensorflowjs
import tensorflowjs as tfjs
model.save("/content/drive/MyDrive/ML_Model")
tfjs.converters.save_keras_model(model, "/content/drive/MyDrive/JSON_Model/")

Save the vocabulary

In [None]:
# Write the vocabulary next to the exported model, one token per line.
# The encoding is pinned to UTF-8 so tokens containing non-ASCII characters
# are written the same way on every platform instead of depending on the
# locale's default encoding.
with open("/content/drive/MyDrive/JSON_Model/vocabulary.txt", "w", encoding="utf-8") as write_file:
  write_file.writelines(token + '\n' for token in vocab)