In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import pickle
import gc

from tqdm import tqdm
# Register tqdm's pandas integration (adds the progress_* variants such as
# progress_apply). NOTE(review): none of them is called in the cells shown here.
tqdm.pandas()

In [12]:

# Load both source tables from CSV files located in the working directory.
movie_details, reviews = (
    pd.read_csv(csv_name)
    for csv_name in ('IMDB_movie_details_new.csv', 'IMDB_reviews_new.csv')
)

In [13]:
# Join each review to its movie's details through the shared movie_id column
# (pandas' default inner join: rows without a match on both sides are dropped).
data = movie_details.merge(reviews, on='movie_id')

# The source frames are not needed any more; drop the names so the memory
# they hold can be reclaimed by the collector below.
del reviews, movie_details

gc.collect()

0

In [14]:
# Keep only the two columns used downstream: the review text (tokenizer input)
# and the spoiler flag (label).
data = data.loc[:, ['review_text', 'is_spoiler']]

# Optional subsampling, currently disabled: uncomment to keep only the first
# half of the rows (presumably to cut memory use / runtime — confirm).
# half_len = len(data) // 2
# data = data[:half_len]

In [15]:
gc.collect()

# Vocabulary cap handed to the tokenizer; the Embedding layer is sized with
# the same value.
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')

# Both passes below iterate over the same array of raw review strings; each
# pass gets its own progress bar.
review_texts = data['review_text'].values
texts = tqdm(review_texts, desc='Fitting tokenizer')
tokenizer.fit_on_texts(texts)

# word_index lists every distinct token seen while fitting, so this count can
# be larger than max_features.
num_tokens = len(tokenizer.word_index)
print("Number of tokens:", num_tokens)

# Encode every review as a sequence of integer word ids, then pad the
# sequences into one 2-D array of uniform width.
texts = tqdm(review_texts, desc='Converting texts to sequences')
X = pad_sequences(tokenizer.texts_to_sequences(texts))

Fitting tokenizer: 100%|█████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 1881.59it/s]


Number of tokens: 22239


Converting texts to sequences: 100%|█████████████████████████████████████████████| 5000/5000 [00:01<00:00, 3584.94it/s]


In [16]:

gc.collect()

# Embedding vector size and number of LSTM units used when the model is built.
embed_dim, lstm_out = 128, 196

In [17]:
# Architecture: embedding -> spatial dropout -> LSTM -> 2-way softmax.
model = Sequential([
    # Input width is tied to the padded sequence length of X.
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    # Two softmax units, one per column of the one-hot encoded is_spoiler
    # label; this pairs with the categorical cross-entropy loss below.
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# One-hot encode the spoiler flag (one column per distinct label value), then
# hold out 33% of the samples as a test set. The fixed random_state makes the
# split reproducible.
Y = pd.get_dummies(data['is_spoiler']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Report (features, labels) shapes: training set first, then test set.
for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, targets.shape)

(3350, 1644) (3350, 2)
(1650, 1644) (1650, 2)


In [19]:
batch_size = 32
# Ceiling division: data_generator emits a final, smaller batch whenever
# len(X_train) is not a multiple of batch_size. Rounding down would push that
# batch into the next Keras epoch, so epochs would drift out of step with full
# passes over the training data.
steps_per_epoch = -(-len(X_train) // batch_size)

# Define a data generator to load and process the data in batches
def data_generator(X, Y, batch_size):
    """Yield shuffled ``(batch_X, batch_Y)`` mini-batches forever.

    The sample order is reshuffled (via the global NumPy RNG) at the start of
    every pass over the data. Each pass yields ``ceil(len(X) / batch_size)``
    batches; the last one is smaller when ``len(X)`` is not a multiple of
    ``batch_size``.

    Args:
        X: Indexable collection of samples that supports integer-array
            indexing (e.g. a NumPy array).
        Y: Labels aligned with ``X``; must have the same length.
        batch_size: Maximum number of samples per batch, at least 1.

    Yields:
        Tuples ``(batch_X, batch_Y)`` selected with the same indices.

    Raises:
        ValueError: On the first iteration, if ``batch_size`` is smaller than
            1, ``X`` is empty, or ``X`` and ``Y`` differ in length.
    """
    num_samples = len(X)
    # Without these guards an empty X or a negative batch_size makes the inner
    # loop empty, and the outer ``while True`` would spin forever without
    # ever yielding.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if num_samples == 0:
        raise ValueError("X is empty; there is nothing to yield")
    if len(Y) != num_samples:
        raise ValueError(
            f"X and Y must have the same length, got {num_samples} and {len(Y)}"
        )

    indices = np.arange(num_samples)
    while True:
        np.random.shuffle(indices)
        for start in range(0, num_samples, batch_size):
            # Slicing clamps at the end of the array, so the final batch is
            # simply shorter.
            batch_indices = indices[start:start + batch_size]
            yield X[batch_indices], Y[batch_indices]

In [None]:
# Train model
# The generator never terminates, so steps_per_epoch tells Keras how many
# batches make up one epoch.
train_batches = data_generator(X_train, Y_train, batch_size)
model.fit(
    train_batches,
    steps_per_epoch=steps_per_epoch,
    epochs=7,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7c5707d582b0>

In [None]:
# Save the model
# NOTE(review): the path assumes Google Drive is mounted at /content/gdrive
# and the target folder exists — no mount step is visible in this notebook.
save_dir = '/content/gdrive/My Drive/SpoilerBlockerExt'
model.save(save_dir + '/spoiler_detection_model.h5')

# Persist the fitted tokenizer next to the model: the network was trained on
# this tokenizer's word -> index mapping, so new text must be encoded with the
# very same mapping at inference time.
with open(save_dir + '/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)