In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
import matplotlib.pyplot as plt


2025-09-06 13:51:26.921595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757166687.281012      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757166687.386608      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
from datasets import load_dataset

# Fetch the IMDB corpus and turn its labelled splits into pandas DataFrames.
dataset = load_dataset("imdb")
train_df, test_df = (dataset[split].to_pandas() for split in ("train", "test"))

# Sanity check: first rows and the label distribution of the training split.
print(train_df.head())
print(train_df["label"].value_counts())


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
label
0    12500
1    12500
Name: count, dtype: int64


In [3]:
# Review texts and their labels as NumPy arrays
X_train_texts, y_train = train_df['text'].values, train_df['label'].values
X_test_texts, y_test = test_df['text'].values, test_df['label'].values

# Tokenizer vocabulary cap and fixed sequence length
max_words = 10000  # passed to Tokenizer(num_words=...) and used as the Embedding input_dim
max_len = 200      # every encoded review is padded/truncated to this length

# The vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train_texts)

# Encode both splits with the fitted tokenizer, then pad to max_len
X_train_seq = tokenizer.texts_to_sequences(X_train_texts)
X_test_seq = tokenizer.texts_to_sequences(X_test_texts)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [4]:
# Baseline classifier: embedding -> single LSTM -> one sigmoid unit.
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=max_words, output_dim=128))  # input_length deliberately not passed
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))


I0000 00:00:1757166720.987126      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1757166720.987776      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [5]:
# Build with an explicit (batch, max_len) input shape before printing the summary.
model_lstm.build((None, max_len))
model_lstm.summary()

In [6]:
# IMDB BiLSTM with strong regularization — typically ~91–93% test acc
# Works on TensorFlow 2.x (>=2.9 recommended)

import os, random
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, confusion_matrix

# ---- 1) Reproducibility ------------------------------------------------------
# Seed every RNG the pipeline touches: Python's `random`, NumPy and TensorFlow.
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably only affects child processes — confirm if hash order matters.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# enable_op_determinism() takes NO arguments. The previous call passed `True`,
# which raised a TypeError that a blanket `except Exception: pass` swallowed, so
# deterministic ops were never actually switched on.
# NOTE(review): deterministic kernels can be slower, and TensorFlow raises an error
# for ops that have no deterministic implementation — verify on the target GPU.
try:
    tf.config.experimental.enable_op_determinism()
except AttributeError:
    # Older TensorFlow builds do not expose this API; continue with seeding only.
    print("tf.config.experimental.enable_op_determinism is unavailable; "
          "GPU ops may be non-deterministic.")

# ---- 2) Data: load & pad -----------------------------------------------------
VOCAB_SIZE = 20000   # passed to imdb.load_data(num_words=...); also the Embedding input size
MAXLEN = 300         # fixed number of tokens per review after padding/truncation

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# Pad short reviews and cut long ones at the END of the sequence.
x_train, x_test = (
    pad_sequences(seqs, maxlen=MAXLEN, padding="post", truncating="post")
    for seqs in (x_train, x_test)
)
# Labels as int32 arrays.
y_train, y_test = (np.array(labels, dtype="int32") for labels in (y_train, y_test))

# ---- 3) Make a validation split from train ----------------------------------
# Hold out 5,000 training reviews for validation, stratified on the label.
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=5000, random_state=SEED)
[(train_idx, val_idx)] = sss.split(x_train, y_train)  # n_splits=1 -> exactly one pair
X_tr, y_tr = x_train[train_idx], y_train[train_idx]
X_val, y_val = x_train[val_idx], y_train[val_idx]

print(f"Train: {X_tr.shape}, Val: {X_val.shape}, Test: {x_test.shape}")

# ---- 4) Build model with regularization -------------------------------------
EMB_DIM = 128           # embedding width
LSTM_UNITS = 128        # units per LSTM direction
L2 = 1e-4               # L2 factor on the dense kernel
DROPOUT_DENSE = 0.5     # dropout rate after the dense layer
SPATIAL_DROPOUT = 0.2   # SpatialDropout1D rate applied to the embeddings

def build_model():
    """Assemble and compile the BiLSTM sentiment classifier.

    Reads the module-level settings MAXLEN, VOCAB_SIZE, EMB_DIM, LSTM_UNITS,
    L2, DROPOUT_DENSE and SPATIAL_DROPOUT.

    Returns:
        A compiled ``tf.keras.Model`` taking int32 token ids of length MAXLEN
        and producing one sigmoid output, tracked with binary accuracy and AUC.
    """
    token_ids = layers.Input(shape=(MAXLEN,), dtype="int32")
    embedded = layers.Embedding(VOCAB_SIZE, EMB_DIM, name="embed")(token_ids)
    # SpatialDropout1D drops whole embedding channels rather than single values.
    embedded = layers.SpatialDropout1D(SPATIAL_DROPOUT)(embedded)

    # Sequences are returned so the next layer can pool over the time axis.
    recurrent = layers.LSTM(
        LSTM_UNITS,
        return_sequences=True,
        dropout=0.3,            # dropout on the LSTM inputs
        recurrent_dropout=0.0,  # left at 0.0 (original note: keeps the fast cuDNN path on GPU)
    )
    encoded = layers.Bidirectional(recurrent, merge_mode="concat")(embedded)

    # Collapse the time axis with a global max.
    pooled = layers.GlobalMaxPool1D()(encoded)

    hidden = layers.Dense(
        64,
        activation="relu",
        kernel_regularizer=regularizers.l2(L2),
    )(pooled)
    hidden = layers.Dropout(DROPOUT_DENSE)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = tf.keras.Model(token_ids, probability)
    adam = tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1.0)
    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.AUC(name="auc"),
    ]
    classifier.compile(optimizer=adam, loss="binary_crossentropy", metrics=tracked_metrics)
    return classifier

model = build_model()
model.summary()

# ---- 5) Train with callbacks -------------------------------------------------
ckpt_path = "best_imdb_bilstm.keras"
BATCH_SIZE = 64
EPOCHS = 15

# Stopping and checkpointing follow validation AUC; the LR schedule follows validation loss.
callbacks = [
    EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-6, verbose=1),
    ModelCheckpoint(ckpt_path, monitor="val_auc", mode="max", save_best_only=True, verbose=1),
]

history = model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=2,
)

# ---- 6) Evaluate -------------------------------------------------------------
print("\nEvaluating on test set...")
# evaluate() yields the loss followed by the compiled metrics (accuracy, auc).
test_loss, test_acc, test_auc = model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | Test AUC: {test_auc:.4f}")

# Hard predictions at a 0.5 probability cut-off
y_pred_prob = model.predict(x_test, batch_size=256, verbose=0)
y_pred_prob = y_pred_prob.ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print("\nClassification report:", classification_report(y_test, y_pred, digits=4), sep="\n")

print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Train: (20000, 300), Val: (5000, 300), Test: (25000, 300)


Epoch 1/15


I0000 00:00:1757166733.521907      86 cuda_dnn.cc:529] Loaded cuDNN version 90300



Epoch 1: val_auc improved from -inf to 0.93793, saving model to best_imdb_bilstm.keras
313/313 - 20s - 65ms/step - accuracy: 0.7189 - auc: 0.8094 - loss: 0.5335 - val_accuracy: 0.8166 - val_auc: 0.9379 - val_loss: 0.4041 - learning_rate: 0.0010
Epoch 2/15

Epoch 2: val_auc improved from 0.93793 to 0.94723, saving model to best_imdb_bilstm.keras
313/313 - 11s - 36ms/step - accuracy: 0.8971 - auc: 0.9542 - loss: 0.2768 - val_accuracy: 0.8700 - val_auc: 0.9472 - val_loss: 0.3303 - learning_rate: 0.0010
Epoch 3/15

Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 3: val_auc did not improve from 0.94723
313/313 - 11s - 36ms/step - accuracy: 0.9385 - auc: 0.9794 - loss: 0.1807 - val_accuracy: 0.8668 - val_auc: 0.9397 - val_loss: 0.4255 - learning_rate: 0.0010
Epoch 4/15

Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 4: val_auc did not improve from 0.94723
313/313 - 11s - 36ms/step - accuracy: 0.9617 - auc: 0.9906 - loss:

In [7]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2025-09-06 13:53:24--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-09-06 13:53:24--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-09-06 13:53:24--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove

In [8]:
import os, random, numpy as np, tensorflow as tf
from tensorflow.keras import layers, regularizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix

# ------------------ Reproducibility ------------------
# Seed Python's `random`, NumPy and TensorFlow with the same value.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# ------------------ Dataset ------------------
VOCAB_SIZE = 20000
MAXLEN = 300

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# Pad/truncate at the end of each review so all rows have MAXLEN tokens.
pad_kwargs = dict(maxlen=MAXLEN, padding="post", truncating="post")
x_train = pad_sequences(x_train, **pad_kwargs)
x_test = pad_sequences(x_test, **pad_kwargs)
y_train = np.array(y_train)
y_test = np.array(y_test)

# validation split: 5,000 stratified samples held out of the training set
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=5000, random_state=SEED)
[(train_idx, val_idx)] = sss.split(x_train, y_train)  # single split
X_tr, y_tr = x_train[train_idx], y_train[train_idx]
X_val, y_val = x_train[val_idx], y_train[val_idx]
print(f"Train: {X_tr.shape}, Val: {X_val.shape}, Test: {x_test.shape}")

# ------------------ Load GloVe ------------------
# Needs glove.6B.100d.txt, one of the files inside glove.6B.zip
# (http://nlp.stanford.edu/data/glove.6B.zip — the 822 MB figure is the zip size).
GLOVE_DIM = 100
# Relative path: resolved against the working directory instead of assuming the
# Kaggle-specific /kaggle/working location.
GLOVE_PATH = f"glove.6B.{GLOVE_DIM}d.txt"

print("Loading GloVe embeddings...")
embeddings_index = {}
with open(GLOVE_PATH, "r", encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line would otherwise fail on values[0].
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype="float32")
print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")

# map index→word for IMDB
# Indices from get_word_index() are shifted by 3 here to line up with the ids
# produced by imdb.load_data(); ids at or above VOCAB_SIZE are dropped.
word_index = imdb.get_word_index()
index_offset = 3
index_to_word = {
    (idx + index_offset): w
    for w, idx in word_index.items()
    if (idx + index_offset) < VOCAB_SIZE
}

# Start from random vectors; rows whose word has a GloVe vector are overwritten,
# the remaining rows keep their random initialisation.
embedding_matrix = np.random.normal(scale=0.6, size=(VOCAB_SIZE, GLOVE_DIM)).astype("float32")
for i, w in index_to_word.items():
    vector = embeddings_index.get(w)
    if vector is not None:
        embedding_matrix[i] = vector

# ------------------ Model ------------------
def build_model_glove():
    """Build and compile the BiLSTM classifier on top of frozen GloVe vectors.

    Reads the module-level MAXLEN, VOCAB_SIZE, GLOVE_DIM and embedding_matrix.

    Returns:
        A compiled ``tf.keras.Model`` whose embedding layer is named
        "embed_glove" and starts out non-trainable.
    """
    token_ids = layers.Input(shape=(MAXLEN,), dtype="int32")

    glove_embedding = layers.Embedding(
        VOCAB_SIZE,
        GLOVE_DIM,
        weights=[embedding_matrix],
        trainable=False,    # frozen at first
        name="embed_glove",
    )
    features = glove_embedding(token_ids)
    features = layers.SpatialDropout1D(0.3)(features)

    recurrent = layers.LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.0)
    features = layers.Bidirectional(recurrent)(features)
    features = layers.GlobalMaxPooling1D()(features)

    hidden = layers.Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(features)
    hidden = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = tf.keras.Model(token_ids, probability)
    adam = tf.keras.optimizers.Adam(1e-3, clipnorm=1.0)
    classifier.compile(
        optimizer=adam,
        loss="binary_crossentropy",
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
    )
    return classifier

model = build_model_glove()
model.summary()

# ------------------ Training ------------------
# Phase 1: embeddings stay frozen.
stop_on_auc = EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True)
halve_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6, verbose=1)
save_best = ModelCheckpoint("best_imdb_glove.keras", monitor="val_auc", mode="max", save_best_only=True, verbose=1)
callbacks = [stop_on_auc, halve_lr, save_best]

history = model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=64,
    epochs=12,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=2,
)

# Phase 2 (optional fine-tuning): unfreeze the embeddings for up to 2 epochs.
# The model is compiled again with a smaller learning rate after changing `trainable`.
model.get_layer("embed_glove").trainable = True
finetune_optimizer = tf.keras.optimizers.Adam(3e-4, clipnorm=1.0)
model.compile(
    optimizer=finetune_optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)
finetune_stop = EarlyStopping(monitor="val_auc", mode="max", patience=1, restore_best_weights=True)
model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=64,
    epochs=2,
    validation_data=(X_val, y_val),
    callbacks=[finetune_stop],
    verbose=2,
)

# ------------------ Evaluation ------------------
print("\nEvaluating on test set...")
# evaluate() yields the loss followed by the compiled metrics (accuracy, auc).
test_loss, test_acc, test_auc = model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | Test AUC: {test_auc:.4f}")

from sklearn.metrics import classification_report, confusion_matrix
# Probabilities flattened to 1-D, then cut at 0.5 for class labels.
y_pred_prob = model.predict(x_test, batch_size=256, verbose=0)
y_pred_prob = y_pred_prob.ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print("\nClassification report:", classification_report(y_test, y_pred, digits=4), sep="\n")
print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Train: (20000, 300), Val: (5000, 300), Test: (25000, 300)
Loading GloVe embeddings...
Loaded 400000 word vectors from GloVe.
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Epoch 1/12

Epoch 1: val_auc improved from -inf to 0.81342, saving model to best_imdb_glove.keras
313/313 - 16s - 50ms/step - accuracy: 0.5842 - auc: 0.6262 - loss: 0.6738 - val_accuracy: 0.7002 - val_auc: 0.8134 - val_loss: 0.5950 - learning_rate: 0.0010
Epoch 2/12

Epoch 2: val_auc improved from 0.81342 to 0.88063, saving model to best_imdb_glove.keras
313/313 - 10s - 33ms/step - accuracy: 0.7368 - auc: 0.8096 - loss: 0.5422 - val_accuracy: 0.8020 - val_auc: 0.8806 - val_loss: 0.4536 - learning_rate: 0.0010
Epoch 3/12

Epoch 3: val_auc improved from 0.88063 to 0.90653, saving model to best_imdb_glove.keras
313/313 - 10s - 33ms/step - accuracy: 0.7880 - auc: 0.8616 - loss: 0.4726 - val_accuracy: 0.8286 - val_auc: 0.9065 - val_loss: 0.4022 - learning_rate: 0.0010
Epoch 4/12

Epoch 4: val_auc improved from 0.90653 to 0.91423, saving model to best_imdb_glove.keras
313/313 - 10s - 33ms/step - accuracy: 0.8116 - auc: 0.8832 - loss: 0.4381 - val_accuracy: 0.8346 - val_auc: 0.9142 - val_loss

In [9]:
import os, random, numpy as np, tensorflow as tf
from tensorflow.keras import layers, regularizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix

# ------------------ Reproducibility ------------------
# One seed shared by Python's `random`, NumPy and TensorFlow.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# ------------------ Dataset ------------------
VOCAB_SIZE = 20000
MAXLEN = 300

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# Fixed-length inputs: pad and truncate at the end of each review.
x_train, x_test = (
    pad_sequences(seqs, maxlen=MAXLEN, padding="post", truncating="post")
    for seqs in (x_train, x_test)
)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Validation split: 5,000 stratified samples taken out of the training data
sss = StratifiedShuffleSplit(n_splits=1, test_size=5000, random_state=SEED)
[(train_idx, val_idx)] = sss.split(x_train, y_train)  # exactly one split
X_tr, y_tr = x_train[train_idx], y_train[train_idx]
X_val, y_val = x_train[val_idx], y_train[val_idx]
print(f"Train: {X_tr.shape}, Val: {X_val.shape}, Test: {x_test.shape}")

# ------------------ Download & load GloVe 200d ------------------
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q -o glove.6B.zip  # overwrite if exists

GLOVE_PATH = "/kaggle/working/glove.6B.200d.txt"
GLOVE_DIM = 200

print("Loading GloVe embeddings...")
embeddings_index = {}
with open(GLOVE_PATH, "r", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")

# Map index → word for IMDB
word_index = imdb.get_word_index()
index_offset = 3
index_to_word = { (idx + index_offset): w for w, idx in word_index.items() if (idx + index_offset) < VOCAB_SIZE }

embedding_matrix = np.random.normal(scale=0.6, size=(VOCAB_SIZE, GLOVE_DIM)).astype("float32")
for i in range(VOCAB_SIZE):
    w = index_to_word.get(i)
    if w and w in embeddings_index:
        embedding_matrix[i] = embeddings_index[w]

# ------------------ Model ------------------
def build_model_glove():
    """Build and compile a two-layer BiLSTM classifier on frozen GloVe vectors.

    Reads the module-level MAXLEN, VOCAB_SIZE, GLOVE_DIM and embedding_matrix.
    The outputs of the second BiLSTM are summarised by concatenating a global
    max pool and a global average pool before the dense head.

    Returns:
        A compiled ``tf.keras.Model`` whose embedding layer is named
        "embed_glove" and starts out non-trainable.
    """
    token_ids = layers.Input(shape=(MAXLEN,))

    glove_embedding = layers.Embedding(
        VOCAB_SIZE,
        GLOVE_DIM,
        weights=[embedding_matrix],
        trainable=False,
        name="embed_glove",
    )
    sequence = glove_embedding(token_ids)
    sequence = layers.SpatialDropout1D(0.3)(sequence)

    # Two stacked bidirectional LSTMs, both returning full sequences
    first_lstm = layers.LSTM(128, return_sequences=True, dropout=0.3)
    sequence = layers.Bidirectional(first_lstm)(sequence)
    second_lstm = layers.LSTM(64, return_sequences=True, dropout=0.2)
    sequence = layers.Bidirectional(second_lstm)(sequence)

    # Max and average pooling over the sequence, concatenated
    pooled_max = layers.GlobalMaxPooling1D()(sequence)
    pooled_avg = layers.GlobalAveragePooling1D()(sequence)
    summary_vec = layers.Concatenate()([pooled_max, pooled_avg])

    hidden = layers.Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(summary_vec)
    hidden = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = tf.keras.Model(token_ids, probability)
    adam = tf.keras.optimizers.Adam(1e-3, clipnorm=1.0)
    classifier.compile(
        optimizer=adam,
        loss="binary_crossentropy",
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
    )
    return classifier

model = build_model_glove()
model.summary()

# ------------------ Callbacks ------------------
# Stopping and checkpointing follow validation AUC; the LR schedule follows validation loss.
stop_on_auc = EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True)
halve_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6, verbose=1)
save_best = ModelCheckpoint("best_imdb_glove_200d.keras", monitor="val_auc", mode="max", save_best_only=True, verbose=1)
callbacks = [stop_on_auc, halve_lr, save_best]

# ------------------ Training (frozen embeddings) ------------------
history = model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=64,
    epochs=12,
    validation_data=(X_val, y_val),
    verbose=2,
    callbacks=callbacks,
)

# ------------------ Fine-tune embeddings ------------------
# Unfreeze the GloVe layer and compile again with a smaller learning rate.
model.get_layer("embed_glove").trainable = True
finetune_optimizer = tf.keras.optimizers.Adam(3e-4, clipnorm=1.0)
model.compile(
    optimizer=finetune_optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)
finetune_stop = EarlyStopping(monitor="val_auc", mode="max", patience=1, restore_best_weights=True)
model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=64,
    epochs=2,
    validation_data=(X_val, y_val),
    verbose=2,
    callbacks=[finetune_stop],
)

# ------------------ Evaluation ------------------
# evaluate() yields the loss followed by the compiled metrics (accuracy, auc).
test_loss, test_acc, test_auc = model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | Test AUC: {test_auc:.4f}")

# Flatten the predicted probabilities and cut at 0.5 for class labels.
y_pred_prob = model.predict(x_test, batch_size=256, verbose=0)
y_pred_prob = y_pred_prob.ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print("\nClassification report:", classification_report(y_test, y_pred, digits=4), sep="\n")
print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Train: (20000, 300), Val: (5000, 300), Test: (25000, 300)
Loading GloVe embeddings...
Loaded 400000 word vectors from GloVe.


Epoch 1/12

Epoch 1: val_auc improved from -inf to 0.86689, saving model to best_imdb_glove_200d.keras
313/313 - 28s - 91ms/step - accuracy: 0.6184 - auc: 0.6786 - loss: 0.6465 - val_accuracy: 0.7460 - val_auc: 0.8669 - val_loss: 0.5064 - learning_rate: 0.0010
Epoch 2/12

Epoch 2: val_auc improved from 0.86689 to 0.91432, saving model to best_imdb_glove_200d.keras
313/313 - 20s - 65ms/step - accuracy: 0.7943 - auc: 0.8704 - loss: 0.4594 - val_accuracy: 0.8340 - val_auc: 0.9143 - val_loss: 0.3811 - learning_rate: 0.0010
Epoch 3/12

Epoch 3: val_auc improved from 0.91432 to 0.92189, saving model to best_imdb_glove_200d.keras
313/313 - 20s - 65ms/step - accuracy: 0.8250 - auc: 0.9022 - loss: 0.4043 - val_accuracy: 0.8408 - val_auc: 0.9219 - val_loss: 0.3623 - learning_rate: 0.0010
Epoch 4/12

Epoch 4: val_auc improved from 0.92189 to 0.92983, saving model to best_imdb_glove_200d.keras
313/313 - 21s - 66ms/step - accuracy: 0.8401 - auc: 0.9149 - loss: 0.3784 - val_accuracy: 0.8462 - val_au

In [10]:
import os, random, numpy as np, tensorflow as tf
from tensorflow.keras import layers, regularizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix

# ------------------ Reproducibility ------------------
# One seed shared by Python's `random`, NumPy and TensorFlow.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# ------------------ Dataset ------------------
VOCAB_SIZE = 20000
MAXLEN = 350  # longer sequences than the earlier cells, for more context

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# Pad and truncate at the end of each review.
pad_kwargs = dict(maxlen=MAXLEN, padding="post", truncating="post")
x_train = pad_sequences(x_train, **pad_kwargs)
x_test = pad_sequences(x_test, **pad_kwargs)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Validation split: 5,000 stratified samples taken out of the training data
sss = StratifiedShuffleSplit(n_splits=1, test_size=5000, random_state=SEED)
[(train_idx, val_idx)] = sss.split(x_train, y_train)  # exactly one split
X_tr, y_tr = x_train[train_idx], y_train[train_idx]
X_val, y_val = x_train[val_idx], y_train[val_idx]
print(f"Train: {X_tr.shape}, Val: {X_val.shape}, Test: {x_test.shape}")

# ------------------ Load GloVe 300d ------------------
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q -o glove.6B.zip

GLOVE_PATH = "glove.6B.300d.txt"
GLOVE_DIM = 300

print("Loading GloVe embeddings...")
embeddings_index = {}
with open(GLOVE_PATH, "r", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")

# Map index → word for IMDB
word_index = imdb.get_word_index()
index_offset = 3
index_to_word = { (idx + index_offset): w for w, idx in word_index.items() if (idx + index_offset) < VOCAB_SIZE }

embedding_matrix = np.random.normal(scale=0.6, size=(VOCAB_SIZE, GLOVE_DIM)).astype("float32")
for i in range(VOCAB_SIZE):
    w = index_to_word.get(i)
    if w and w in embeddings_index:
        embedding_matrix[i] = embeddings_index[w]

# ------------------ Attention Layer ------------------
class Attention(layers.Layer):
    """Attention pooling over axis 1 using a single learned scoring vector.

    Every position along axis 1 is scored by projecting its features onto
    ``W``; the scores are softmax-normalised along axis 1 and used to take a
    weighted sum of the inputs over that axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the (features, 1) scoring weight from the last input dimension."""
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            shape=(feature_dim, 1),
            initializer="glorot_uniform",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        logits = tf.matmul(x, self.W)            # (batch, timesteps, 1)
        alphas = tf.nn.softmax(logits, axis=1)   # normalised across timesteps
        return tf.reduce_sum(x * alphas, axis=1)

# ------------------ Model ------------------
def build_model_glove_attention():
    """Build and compile the stacked BiLSTM + attention classifier on GloVe vectors.

    Reads the module-level MAXLEN, VOCAB_SIZE, GLOVE_DIM and embedding_matrix
    and uses the custom ``Attention`` layer. Three summaries of the second
    BiLSTM's outputs (attention, global max, global average) are concatenated
    before the dense head.

    Returns:
        A compiled ``tf.keras.Model`` whose embedding layer is named
        "embed_glove" and starts out non-trainable.
    """
    token_ids = layers.Input(shape=(MAXLEN,))

    glove_embedding = layers.Embedding(
        VOCAB_SIZE,
        GLOVE_DIM,
        weights=[embedding_matrix],
        trainable=False,  # frozen for the first training phase
        name="embed_glove",
    )
    sequence = glove_embedding(token_ids)
    sequence = layers.SpatialDropout1D(0.4)(sequence)

    # Two stacked bidirectional LSTMs, both returning full sequences
    first_lstm = layers.LSTM(256, return_sequences=True, dropout=0.4)
    sequence = layers.Bidirectional(first_lstm)(sequence)
    second_lstm = layers.LSTM(128, return_sequences=True, dropout=0.3)
    sequence = layers.Bidirectional(second_lstm)(sequence)

    # Attention-weighted summary of the sequence
    attended = Attention()(sequence)

    # Global max and average pooling of the same sequence
    pooled_max = layers.GlobalMaxPooling1D()(sequence)
    pooled_avg = layers.GlobalAveragePooling1D()(sequence)

    # Join the three summaries and classify
    summary_vec = layers.Concatenate()([attended, pooled_max, pooled_avg])
    hidden = layers.Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4))(summary_vec)
    hidden = layers.Dropout(0.55)(hidden)  # higher rate than the 0.5 used in the earlier models
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = tf.keras.Model(token_ids, probability)
    adam = tf.keras.optimizers.Adam(1e-3, clipnorm=1.0)
    classifier.compile(
        optimizer=adam,
        loss="binary_crossentropy",
        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
    )
    return classifier

model = build_model_glove_attention()
model.summary()

# ------------------ Callbacks ------------------
# Stopping and checkpointing follow validation AUC; the LR schedule follows validation loss.
stop_on_auc = EarlyStopping(monitor="val_auc", mode="max", patience=3, restore_best_weights=True)
halve_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-6, verbose=1)
save_best = ModelCheckpoint("best_imdb_glove_300d_attention.keras", monitor="val_auc", mode="max", save_best_only=True, verbose=1)
callbacks = [stop_on_auc, halve_lr, save_best]

# ------------------ Training (frozen embeddings) ------------------
history = model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=64,
    epochs=12,
    validation_data=(X_val, y_val),
    verbose=2,
    callbacks=callbacks,
)

# ------------------ Fine-tune embeddings ------------------
# Unfreeze the GloVe layer and compile again with a smaller learning rate.
model.get_layer("embed_glove").trainable = True
finetune_optimizer = tf.keras.optimizers.Adam(3e-4, clipnorm=1.0)
model.compile(
    optimizer=finetune_optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
)
finetune_stop = EarlyStopping(monitor="val_auc", mode="max", patience=1, restore_best_weights=True)
model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=64,
    epochs=4,
    validation_data=(X_val, y_val),
    verbose=2,
    callbacks=[finetune_stop],
)

# ------------------ Evaluation ------------------
# evaluate() yields the loss followed by the compiled metrics (accuracy, auc).
test_loss, test_acc, test_auc = model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | Test AUC: {test_auc:.4f}")

# Flatten the predicted probabilities and cut at 0.5 for class labels.
y_pred_prob = model.predict(x_test, batch_size=256, verbose=0)
y_pred_prob = y_pred_prob.ravel()
y_pred = (y_pred_prob >= 0.5).astype(int)

print("\nClassification report:", classification_report(y_test, y_pred, digits=4), sep="\n")
print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Train: (20000, 350), Val: (5000, 350), Test: (25000, 350)
Loading GloVe embeddings...
Loaded 400000 word vectors from GloVe.


Epoch 1/12

Epoch 1: val_auc improved from -inf to 0.88338, saving model to best_imdb_glove_300d_attention.keras
313/313 - 54s - 172ms/step - accuracy: 0.6199 - auc: 0.6786 - loss: 0.6473 - val_accuracy: 0.7398 - val_auc: 0.8834 - val_loss: 0.5193 - learning_rate: 0.0010
Epoch 2/12

Epoch 2: val_auc improved from 0.88338 to 0.92257, saving model to best_imdb_glove_300d_attention.keras
313/313 - 46s - 148ms/step - accuracy: 0.8001 - auc: 0.8732 - loss: 0.4592 - val_accuracy: 0.8454 - val_auc: 0.9226 - val_loss: 0.3624 - learning_rate: 0.0010
Epoch 3/12

Epoch 3: val_auc improved from 0.92257 to 0.93046, saving model to best_imdb_glove_300d_attention.keras
313/313 - 47s - 152ms/step - accuracy: 0.8303 - auc: 0.9054 - loss: 0.4011 - val_accuracy: 0.8514 - val_auc: 0.9305 - val_loss: 0.3433 - learning_rate: 0.0010
Epoch 4/12

Epoch 4: val_auc improved from 0.93046 to 0.93866, saving model to best_imdb_glove_300d_attention.keras
313/313 - 49s - 155ms/step - accuracy: 0.8456 - auc: 0.9178 - 