In [None]:
# Reference
# https://www.tensorflow.org/tutorials/keras/text_classification
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

import numpy as np
import pandas as pd
import tensorflow as tf
import os
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, callbacks

# -- IMPORT DATA --
# Map every file name found under the Kaggle input directory to its full path.
# When the same file name occurs in several folders, the last one visited wins.
dataset_dict = {
    fname: os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
}

In [None]:
# -- PREPROCESS DATA --

print(dataset_dict.keys())

# Fail early with a readable message instead of passing None to pd.read_json
# when the reviews file was not discovered under /kaggle/input.
reviews_path = dataset_dict.get("IMDB_reviews.json")
if reviews_path is None:
    raise FileNotFoundError(
        "IMDB_reviews.json was not found under /kaggle/input; "
        "attach the dataset to the notebook before running this cell."
    )

# The file is JSON Lines (one review object per line), hence lines=True.
df_reviews = pd.read_json(reviews_path, lines=True)
print('count of reviews is', len(df_reviews))

In [None]:
# shuffle the dataset
# Fixed seed so that Restart & Run All reproduces the same ordering (and
# therefore the same train/val/test membership further down).
SHUFFLE_SEED = 42

unshuffled_X = df_reviews['review_text']
unshuffled_Y = df_reviews['is_spoiler'].astype(int)

print(unshuffled_X.shape)
print(unshuffled_Y.shape)

print(" ")
print("\033[0;31;47m Before shuffling \033[0m")
print(" ")
# .iloc = positional access; does not depend on the index labels.
print(unshuffled_X.iloc[0])
print(unshuffled_Y.iloc[0])

# One permutation applied to both series keeps each review aligned with its label.
permutation = np.random.default_rng(SHUFFLE_SEED).permutation(len(unshuffled_X))

df_reviews_X = unshuffled_X.iloc[permutation].reset_index(drop=True)
df_reviews_Y = unshuffled_Y.iloc[permutation].reset_index(drop=True)

print(" ")
print("\033[0;31;47m After shuffling \033[0m")
print(" ")
print(df_reviews_X.iloc[0])
print(df_reviews_Y.iloc[0])

In [None]:
# ----- CLEAN DATA -----
import re
import string
from nltk.corpus import stopwords

# def custom_standardization(input_data):
#   lowercase = tf.strings.lower(input_data)
#   stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
#   return tf.strings.regex_replace(stripped_html,
#                                   '[%s]' % re.escape(string.punctuation),
#          

# English stop words from NLTK, stored as a set for fast membership tests in clean_string.
# NOTE(review): requires the NLTK 'stopwords' corpus to be available in this environment — confirm.
en_stops = set(stopwords.words('english'))

def clean_string(text, stop_words=None):
    """Normalise a review for vectorisation.

    The text is lower-cased, split on any whitespace (spaces, tabs, newlines),
    stripped of stop words and punctuation, and re-joined with single spaces.

    Args:
        text: Raw review text.
        stop_words: Optional collection of lower-case stop words. Defaults to
            the module-level ``en_stops`` set.

    Returns:
        The cleaned text as a single space-separated string ("" when nothing
        survives the filtering).
    """
    if stop_words is None:
        stop_words = en_stops

    kept = []
    for token in text.lower().split():
        # Test the stop list while inner punctuation is still present, so that
        # contractions such as "don't" match their stop-list entry.
        if token.strip(string.punctuation) in stop_words:
            continue
        # Remove punctuation
        token = re.sub(r'[^\w\s]', '', token)
        # Re-check: removing non-ASCII punctuation may expose a stop word, and
        # tokens that were pure punctuation are now empty.
        if token and token not in stop_words:
            kept.append(token)

    # Convert list of words back to string for vectorisation
    return ' '.join(kept)

# Run the cleaner over every review; the function is passed directly, no lambda wrapper needed.
df_reviews_X = df_reviews_X.apply(clean_string)

# Highlighted banner framed by two blank-ish lines, then the first cleaned review.
print(" ", "\033[0;31;47m clean reviews \033[0m", " ", sep="\n")
print(df_reviews_X[0])

In [None]:
#Split data into train, valid, test

# First cut: 60% train, 40% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    df_reviews_X,
    df_reviews_Y,
    test_size=0.4,
    random_state=1,
)
# Second cut: the held-out 40% is divided 75/25 into validation and test
# (30% and 10% of the full dataset respectively).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=2,
)
for split_features in (X_train, X_val, X_test):
    print(split_features.shape)

# model trainable embedding

In [None]:
# ----- VECTORISATION ----- 
max_tokens = 10000
output_sequence_length = 250

# standardize=None: the reviews were already lower-cased and stripped of
# punctuation by clean_string, so the layer's own standardisation is disabled.
vectorize_layer = layers.TextVectorization(
    standardize=None,
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=output_sequence_length,
)

# Build the vocabulary from the training split only. Adapting on the whole
# corpus would let validation/test text influence the vocabulary (data leakage).
vectorize_layer.adapt(X_train, batch_size=128)
# df_reviews_X_after_vectorized = vectorize_layer(df_reviews_X)

In [None]:
# # check vectorize layer
# target_index = 10
# end_point = 10
# for i in range(end_point):
#   if(df_reviews_X_after_vectorized[target_index][i]==0):
#     print('end')
#     break
#   else:
#     print(vectorize_layer.get_vocabulary()[1])
#     print(vectorize_layer.get_vocabulary()[df_reviews_X_after_vectorized[target_index][i]])


# embedding model

In [None]:
#Training model

embedding_dim = 50

# Layer stack, in order: raw text -> token ids -> learned embeddings ->
# bidirectional LSTM -> max over time -> small dense head -> spoiler probability.
_trainable_stack = [
    vectorize_layer,
    layers.Embedding(
        input_dim=(max_tokens + 1),
        output_dim=embedding_dim,
        mask_zero=True,
        input_length=output_sequence_length,
    ),
    layers.Bidirectional(
        layers.LSTM(128, return_sequences = True, dropout=0.5)
    ),
    layers.GlobalMaxPool1D(),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation = 'sigmoid'),
]
model = tf.keras.Sequential(_trainable_stack)

model.summary()

from tensorflow.keras import layers
from tensorflow.keras import losses
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

In [None]:
# Calculate weights for each class since dataset is unbalanced
unique, count = np.unique(y_train, return_counts=True)

# Inverse-frequency weighting: the rarer class gets the larger weight.
# (Using the raw class frequency as the weight would do the opposite and
# amplify the imbalance.) The factor 1/2 keeps the weighted sample count close
# to the unweighted one, so the loss stays on a comparable scale.
# np.unique returns sorted labels, so count[0] belongs to class 0 and count[1] to class 1.
total = len(y_train)
weight_for_0 = total / (2.0 * count[0])
weight_for_1 = total / (2.0 * count[1])

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

# Add adaptive learning rate to reduce validation plateau.
# Its patience must be shorter than early stopping's; with equal patience the
# learning rate would only be lowered in the very epoch training stops.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)
early_stop = callbacks.EarlyStopping(monitor="val_loss", patience=10)

# use_multiprocessing only applies to generator / Sequence inputs, so it is not passed here.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_val, y_val),
    batch_size=128,
    class_weight=class_weight,
    callbacks=[reduce_lr, early_stop],
)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric (accuracy first, then loss); the training curve is
# drawn before the validation curve so the legend order matches.
for metric_name in ('accuracy', 'loss'):
    plt.plot(history.history[metric_name])
    plt.plot(history.history['val_' + metric_name])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

# GloVe model

In [None]:
# prepare tokenizer
from keras.preprocessing.text import Tokenizer

# Single source of truth for the vocabulary cap; also sizes the embedding matrix.
vocabulary_size = 10000

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocabulary_size,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n') # encode sequences of words with index
# Fit on the training split only, so validation/test text cannot influence
# which words make it into the vocabulary (data leakage).
tokenizer.fit_on_texts(X_train) # create a hash map of numbers and words, word2idx & idx2word
sequences = tokenizer.texts_to_sequences(df_reviews_X) # one list of word indices per review
print(sequences[0])
print(df_reviews_X[0])
# Number of distinct words seen while fitting (before the num_words cap is applied).
print("The size of vocab.txt is", len(tokenizer.word_counts))

def get_max_len(seq):
  """Return the length of the longest item in ``seq``.

  Args:
    seq: Any iterable of sized items (e.g. a list of token-id lists).

  Returns:
    The largest ``len(item)`` found, or 0 when ``seq`` is empty.
  """
  # The built-in max avoids shadowing `max` with a local and needs no copy of the input.
  return max((len(item) for item in seq), default=0)

# Longest tokenised review in the corpus (diagnostic only; the padding length is fixed further down).
max_seq_len = get_max_len(sequences)
print(max_seq_len)

# hyper-parameters
hidden_dim = 50 # glove.6B.50d.txt "50d"
# load the whole embedding into memory
embeddings_index = dict()

glove_path = dataset_dict.get('glove.6B.50d.txt')
if glove_path is None:
  raise FileNotFoundError(
      "glove.6B.50d.txt was not found under /kaggle/input; "
      "attach the GloVe dataset to the notebook before running this cell."
  )

# `with` guarantees the file is closed even if a line fails to parse.
with open(glove_path, encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      continue  # tolerate blank lines
    # Each line is: <word> <v1> <v2> ...
    word = values[0]
    embeddings_index[word] = np.array(values[1:], dtype='float32')
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
# Row i holds the GloVe vector of the word whose tokenizer index is i;
# words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocabulary_size+1, hidden_dim))
j=0  # number of in-vocabulary words without a GloVe vector
for word, i in tokenizer.word_index.items():
  # The loop stops at the first index beyond the vocabulary cap.
  if i > vocabulary_size:
    break
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
  else:
    j = j + 1
print('unknown vocab count = ', j)

#Split data into train, valid, test

# Convert each split to index sequences, then pad/truncate every review to a
# fixed length so the batches are rectangular.
from keras_preprocessing.sequence import pad_sequences

X_train_GloVe = tokenizer.texts_to_sequences(X_train)
X_val_GloVe = tokenizer.texts_to_sequences(X_val)
X_test_GloVe = tokenizer.texts_to_sequences(X_test)

# Fixed model input length; deliberately replaces the corpus maximum computed above.
max_seq_len = 400
X_train_GloVe = pad_sequences(X_train_GloVe, maxlen=max_seq_len, padding='pre')
X_val_GloVe = pad_sequences(X_val_GloVe, maxlen=max_seq_len, padding='pre')
X_test_GloVe = pad_sequences(X_test_GloVe, maxlen=max_seq_len, padding='pre')
print(len(X_train_GloVe[0]))

In [None]:
embedding_dim = 50
model_GloVe = tf.keras.Sequential([
    # `shape` must be a tuple: `(max_seq_len)` without the trailing comma is a plain int.
    layers.Input(shape=(max_seq_len,)),
    # Frozen embedding initialised from the GloVe matrix; output_dim must equal the matrix width (hidden_dim).
    layers.Embedding(input_dim=vocabulary_size+1, output_dim=hidden_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=False),
    layers.Bidirectional(layers.LSTM(128, return_sequences = True, dropout=0.5)),
    layers.GlobalMaxPool1D(),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation = 'sigmoid')])

model_GloVe.summary()

from tensorflow.keras import layers
from tensorflow.keras import losses
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Binary cross-entropy matches the single sigmoid output unit.
model_GloVe.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Nadam(),
              metrics=['accuracy'])

In [None]:
# Calculate weights for each class since dataset is unbalanced
unique, count = np.unique(y_train, return_counts=True)

# Inverse-frequency weighting: the rarer class gets the larger weight.
# (Using the raw class frequency as the weight would do the opposite and
# amplify the imbalance.) The factor 1/2 keeps the weighted sample count close
# to the unweighted one, so the loss stays on a comparable scale.
# np.unique returns sorted labels, so count[0] belongs to class 0 and count[1] to class 1.
total = len(y_train)
weight_for_0 = total / (2.0 * count[0])
weight_for_1 = total / (2.0 * count[1])

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

# Add adaptive learning rate to reduce validation plateau.
# Its patience must be shorter than early stopping's; with equal patience the
# learning rate would only be lowered in the very epoch training stops.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)
early_stop = callbacks.EarlyStopping(monitor="val_loss", patience=10)

# use_multiprocessing only applies to generator / Sequence inputs, so it is not passed here.
history_GloVe = model_GloVe.fit(
    X_train_GloVe,
    y_train,
    epochs=50,
    validation_data=(X_val_GloVe, y_val),
    class_weight=class_weight,
    callbacks=[reduce_lr, early_stop],
)

In [None]:
import matplotlib.pyplot as plt

# Learning curves of the GloVe model: one figure per metric, training curve
# first and validation curve second so the legend order matches.
glove_curves = history_GloVe.history
for metric_name in ('accuracy', 'loss'):
    plt.plot(glove_curves[metric_name])
    plt.plot(glove_curves['val_' + metric_name])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

# test

In [None]:
print('test zone')

# Evaluate both networks on the held-out test split. Each entry pairs the
# printed heading with the model and the input representation that model expects.
for heading, network, test_inputs in (
    ('bidirectional LSTM using trainable embedding layer', model, X_test),
    ('bidrectional LSTM using untrainable GloVe embedding layer', model_GloVe, X_test_GloVe),
):
    print(heading)
    loss, accuracy = network.evaluate(test_inputs, y_test)
    print("Loss: ", loss)
    print("Accuracy: ", accuracy)

In [None]:
# # predict result
# pred_test = model.predict(X_test)
# pred_test[pred_test > 0.5] = 1
# pred_test[pred_test <= 0.5] = 0

# # pred_test_GloVe = model_GloVe.predict(X_test_GloVe)
# # pred_test_GloVe[pred_test_GloVe > 0.5] = 1
# # pred_test_GloVe[pred_test_GloVe <= 0.5] = 0

In [None]:
# index = 100
# print(X_test.tolist()[index])
# print(y_test.tolist()[index])
# print('Embedding result = ', pred_test[index])
# # print('GloVe result = ', pred_test_GloVe[index])

In [None]:
# Raw (uncleaned) sample review, kept for the ad-hoc prediction cell further down (currently commented out).
example1 = "Ever since i saw fincher's first film (alien 3) i knew i was witnessing special talent.  yeah, a lot of people badmouth alien 3, but it holds a special place in my heart.  every film fincher does surpasses his previous film about tenfold.Fight Club was a great cerebral thriller that combines great acting, beautiful direction, the blackest of dark comedy, and a wonderful story to make possibly one of the greatest films of all time.Fight club is about more than a bunch of sweaty guys getting together and beating each other sensless.  it is about society, and not knowing your place in it.  more than not knowing your place in it, actually, it is about not HAVING a place in society to begin with.in the dvd, edward norton discusses how the film relates to the graduate, which really, i think is very accurate.  both films have a lot of similar themes, and i think in the end of both, you get the same thing out of it. and you question yourself because of it.  definitely read the book by chuck palhinuk.  wonderful read.  and check out the score as composed by the dust brothers.  pure sonic bliss."

In [None]:
# NOTE: rebinds example1 — the full-length review assigned in the previous cell is
# replaced by this single, punctuation-free sentence taken from it.
example1 = "it is about society and not knowing your place in it"

In [None]:
# tter = [example1]
# result = model.predict(tter)
# print(result)

# # GloVe
# tter = tokenizer.texts_to_sequences(tter)
# result_GloVe = model_GloVe.predict(tter)
# print(result_GloVe)

# print("ideal answer : False")

In [None]:
# Second raw (uncleaned) sample review, kept for the ad-hoc prediction cell below (currently commented out).
example2 = "All-time funniest movie! Sort of! Waterboy is near comedy perfection and 1 of the greatest sports comedies and 1 of my personal favorite comedies. It's hilarious! The characters are great and fun! My Mama says, Mama says foozball is the Devil! Water sucks! It really, really sucks! Bobby Bouche is the all-time greatest foozball player in movie history! Kathy Bates, Henry Winkler and Fairuza Bulk are hilarious and completely perfect for the Waterboy. Adam Sandler is the man as Waterboy and this is definitely 1 of his best and most forgotten best comedies, just like Big Daddy. Only Waterboy is way funnier. Forget I even said something about Big Daddy. Watch Waterboy! W-w-wattterrbooyyy!! Big Daddy kinda sucks now that I think about it."

In [None]:
# tter = [example2]
# result = model.predict(tter)
# print(result)

# # GloVe
# tter = tokenizer.texts_to_sequences(tter)
# result_GloVe = model_GloVe.predict(tter)
# print(result_GloVe)

# print("ideal answer : False")