In [6]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

from tqdm import tqdm
tqdm.pandas()

In [2]:
movie_details = pd.read_json('IMDB_movie_details.json', lines=True)
reviews = pd.read_json('IMDB_reviews.json', lines=True)

In [3]:
data = pd.merge(movie_details, reviews, on='movie_id')

In [4]:
data = data[['review_text', 'is_spoiler']]

In [7]:
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
texts = tqdm(data['review_text'].values, desc='Fitting tokenizer')
tokenizer.fit_on_texts(texts)
texts = tqdm(data['review_text'].values, desc='Converting texts to sequences')
X = list(tokenizer.texts_to_sequences(texts))
X = pad_sequences(X)

Fitting tokenizer: 100%|█████████████████████████████████████████████████████| 573906/573906 [05:41<00:00, 1680.95it/s]
Converting texts to sequences: 100%|█████████████████████████████████████████| 573906/573906 [04:39<00:00, 2051.43it/s]


In [8]:
embed_dim = 128
lstm_out = 196

In [9]:
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax')) # We use softmax for multi-class classification, change to sigmoid for binary classification
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
# Split data into train and test
Y = pd.get_dummies(data['is_spoiler']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(384517, 2519) (384517, 2)
(189389, 2519) (189389, 2)


In [None]:
# Train model
batch_size = 32
model.fit(X_train, Y_train, epochs=7, batch_size=batch_size, verbose=2)

In [None]:
# Save the model
model.save('spoiler_detection_model.h5')