In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [3]:
# Load the dataset with top 10,000 most frequent words
(train_data, train_labels), (test_data, test_labels) = keras.datasets.imdb.load_data(num_words=10000)

# Check shape of dataset
print(f"Train data: {len(train_data)}, Test data: {len(test_data)}")


Train data: 25000, Test data: 25000


In [5]:
# Load the dataset with top 10,000 most frequent words
(train_data, train_labels), (test_data, test_labels) = keras.datasets.imdb.load_data(num_words=10000)

# Check shape of dataset
print(f"Train data: {len(train_data)}, Test data: {len(test_data)}")


Train data: 25000, Test data: 25000


In [7]:
# Load word index mapping
word_index = keras.datasets.imdb.get_word_index()

# Shift index values for special tokens
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

# Reverse word index to decode reviews
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Function to decode integer-encoded reviews
def decode_review(text_ids):
    return ' '.join([reverse_word_index.get(i, '?') for i in text_ids])


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [9]:
# Pad all sequences to length 256
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)

test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)


In [17]:
model = keras.Sequential()
model.add(layers.Embedding(input_dim=10000, output_dim=16, input_length=256))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.build(input_shape=(None, 256))
model.summary()





In [19]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [21]:
# Split part of training data for validation
x_val = train_data[:10000]
partial_x_train = train_data[10000:]

y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]


In [23]:
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=10,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)


Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.5356 - loss: 0.6924 - val_accuracy: 0.6016 - val_loss: 0.6876
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6380 - loss: 0.6840 - val_accuracy: 0.6925 - val_loss: 0.6720
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7066 - loss: 0.6633 - val_accuracy: 0.7048 - val_loss: 0.6429
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7391 - loss: 0.6294 - val_accuracy: 0.7572 - val_loss: 0.6020
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7632 - loss: 0.5837 - val_accuracy: 0.7883 - val_loss: 0.5531
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8046 - loss: 0.5310 - val_accuracy: 0.8107 - val_loss: 0.5032
Epoch 7/10
[1m30/30[0m [32m━━━━

In [25]:
results = model.evaluate(test_data, test_labels, verbose=2)
print(f"\nTest Loss: {results[0]:.4f}, Test Accuracy: {results[1]*100:.2f}%")


782/782 - 1s - 1ms/step - accuracy: 0.8508 - loss: 0.3804

Test Loss: 0.3804, Test Accuracy: 85.08%


In [27]:
# Predict on test data
predictions = model.predict(test_data)

# Show predictions with decoded reviews
for i in [0, 1, 2, 3, 10, 15]:
    print("="*60)
    print(f"Review #{i} (Actual: {'Positive' if test_labels[i] == 1 else 'Negative'})")
    print("Review Text:", decode_review(test_data[i]))
    print(f"Predicted Score: {predictions[i][0]:.4f}")
    print(f"Predicted Sentiment: {'Positive' if predictions[i][0] > 0.5 else 'Negative'}\n")


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 784us/step
Review #0 (Actual: Negative)
Review Text: <START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD