In [60]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [12]:
df = pd.read_csv('IMDB Dataset.csv')

In [24]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])  # positive -> 1, negative -> 0
print(df['sentiment'].value_counts())  # Should be 0 and 1

sentiment
1    25000
0    25000
Name: count, dtype: int64


In [25]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)
print(y_train[:5])

(40000,) (10000,)
39087    0
30893    0
45278    1
16398    0
13653    0
Name: sentiment, dtype: int64


In [15]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [16]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [26]:
X_train_pad = pad_sequences(X_train_seq, maxlen=200)
X_test_pad = pad_sequences(X_test_seq, maxlen=200)

print("Original review:", X_train.iloc[0][:300])
print("Tokenized:", X_train_seq[0][:20])
print("Padded:", X_train_pad[0][:20])

Original review: That's what I kept asking myself during the many fights, screaming matches, swearing and general mayhem that permeate the 84 minutes. The comparisons also stand up when you think of the one-dimensional characters, who have so little depth that it is virtually impossible to care what happens to them.
Tokenized: [198, 49, 11, 802, 2161, 535, 303, 2, 108, 1908, 2039, 4325, 6551, 3, 816, 4845, 13, 1, 2, 1]
Padded: [ 145 1084   17   89    5  133 2872 8744   19   11  154 9254  100    5
    2 4021  303   12   18 1002]


In [27]:
print("Input shape to model:", X_train_pad.shape)
print("Target shape:", y_train.shape)

Input shape to model: (40000, 200)
Target shape: (40000,)


In [61]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=32, input_length=200))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [62]:
model.build(input_shape=(None, 200))
model.summary()

In [63]:
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=512, validation_split=0.2)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.5649 - loss: 0.6724 - val_accuracy: 0.8572 - val_loss: 0.3340
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8847 - loss: 0.2793 - val_accuracy: 0.8819 - val_loss: 0.2826
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9523 - loss: 0.1408 - val_accuracy: 0.8734 - val_loss: 0.3176
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9819 - loss: 0.0664 - val_accuracy: 0.8711 - val_loss: 0.3705
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9975 - loss: 0.0173 - val_accuracy: 0.8717 - val_loss: 0.4178
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9988 - loss: 0.0087 - val_accuracy: 0.8681 - val_loss: 0.4734
Epoch 7/10
[1m63/63[0m [32m━━━━

In [64]:
model.evaluate(X_test_pad, y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 896us/step - accuracy: 0.8700 - loss: 0.5356


[0.5259253978729248, 0.8682000041007996]