In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Sequential




In [2]:
from tensorflow.keras.datasets import imdb

In [3]:
# Load the IMDB reviews without a `num_words` cap, so every word index is kept.
# NOTE(review): `imdb_data` is not referenced by any later cell visible here, and
# the bare expression below dumps the whole nested tuple of reviews into the
# output — consider inspecting a small slice instead.
imdb_data = imdb.load_data()
imdb_data

((array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
         list([1, 19

- Q: Why do we use `num_words=voc_size` in `imdb.load_data()`?
- A: To keep only the top `voc_size` most frequent words and ignore rare ones.

- Q: What happens to words beyond the vocab limit?
- A: They are replaced by the special token `2` (representing `<UNK>` / unknown).

- Q: Is it a problem if important words fall outside this limit?
- A: Yes, it can hurt performance if meaningful words like "again" are treated as unknown.

- Q: How to avoid this?
- A:
  - ✅ Increase `num_words` to cover more vocabulary.
  - ✅ Use Keras Tokenizer with `oov_token` to better handle unknown words.
  - ✅ Use subword tokenization (e.g., WordPiece) in advanced models like BERT.

In [4]:
# Vocabulary cap: only the `voc_size` most frequent words keep their own index.
voc_size = 10_000

# Reload the train/test splits, this time with the capped vocabulary.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=voc_size,
)

In [5]:
# Sanity-check the splits: the output below shows 25,000 entries in each array.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [6]:
# Peek at the training labels (the output below is an int64 array of 0s and 1s).
y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [7]:
# Inspect the first encoded review: a plain Python list of integer word indices.
# With the vocabulary capped at `voc_size`, out-of-range indices appear as 2 in
# the output below (e.g. where the uncapped load above showed 22665).
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [8]:
# Every review is forced to exactly `sent_length` tokens.
sent_length = 500

# Shorter reviews are left-padded with zeros (padding='pre'); longer ones are cut
# down by pad_sequences' default truncation (`truncating='pre'`, i.e. tokens are
# dropped from the start of the review).
X_train_padded = pad_sequences(X_train, maxlen=sent_length, padding='pre')
X_test_padded = pad_sequences(X_test, maxlen=sent_length, padding='pre')

In [9]:
# Confirm the left-padding: the output below starts with a run of zeros.
X_train_padded[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [10]:
# One width shared by the embedding vectors and the recurrent layer's units.
dim = 128

model = Sequential([
    # Maps each of the `voc_size` word indices to a trainable `dim`-dimensional
    # vector; `input_length` fixes the sequence length to `sent_length`.
    Embedding(voc_size, dim, input_length=sent_length),
    # Single recurrent layer with `dim` units; ReLU is chosen explicitly as its
    # activation.
    SimpleRNN(dim, activation='relu'),
    # Output layer: a single sigmoid unit for the binary label.
    Dense(1, activation='sigmoid'),
])




In [11]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 128)          1280000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               32896     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1313025 (5.01 MB)
Trainable params: 1313025 (5.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Binary-classification setup: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)




In [13]:
from tensorflow.keras.callbacks import EarlyStopping

# Watch the validation loss; stop once it has failed to improve for 5 epochs in a
# row, and roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
early_stopping

<keras.src.callbacks.EarlyStopping at 0x16a03a9faf0>

### 🔍 Training Parameters Explained with Example

Assume we have:
- `X_train` → 1,000 training samples
- `batch_size=32`
- `epochs=5`
- `validation_split=0.2`

---

#### 📌 What Happens?

**1. `validation_split=0.2`**
- 20% of the data (200 samples) is **set aside** for validation. Keras takes these samples from the **end** of the arrays, before any shuffling.
- The remaining 800 samples are used for training.
- The model never sees the validation data during training updates.

---

**2. `batch_size=32`**
- Data is processed in chunks of 32 samples.
- For 800 training samples:
  - Each epoch has `800 / 32 = 25` batches (steps).

---

**3. `epochs=5`**
- The model goes through **all 800 training samples** 5 times.
- That’s `5 × 25 = 125` total training steps (weight updates).
- After each epoch, performance is evaluated on the 200 validation samples.

---

#### 💡 Summary

| Parameter        | Meaning                                              |
|------------------|------------------------------------------------------|
| `validation_split` | Reserve % of training data for validation           |
| `batch_size`       | How many samples are processed before updating model |
| `epochs`           | How many times the model sees the full training data |


In [14]:
# `y_train` is already a NumPy array, so it is passed to fit() without conversion.
# With 25,000 training reviews, validation_split=0.2 holds out 5,000 of them for
# validation and trains on the remaining 20,000 (625 batches of 32 per epoch).
history = model.fit(
    X_train_padded,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
    callbacks=[early_stopping],
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Persist the trained model to an HDF5 (.h5) file at this relative path.
# NOTE(review): the truncated `saving_api.save_model(` output below looks like
# Keras' warning about the legacy HDF5 format — confirm, and consider the native
# `.keras` format if whatever loads this file can be updated too.
model.save('simpleRNN_imdb.h5')

  saving_api.save_model(
