# Step 1: Load the IMDB Dataset

In [2]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Cap the vocabulary at the top 10,000 words; this value is also reused
# later as the embedding layer's input dimension.
vocab_size = 10_000

# load_data hands back a (train, test) pair, each of which is itself an
# (inputs, labels) pair.
imdb_splits = imdb.load_data(num_words=vocab_size)
(x_train, y_train), (x_test, y_test) = imdb_splits


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


# Step 2: Preprocess Text Data

In [4]:
# Every review is brought to exactly this many tokens (maximum review length).
maxlen = 200

# Apply the same fixed-length transform to both splits; padding/truncation
# options are left at the library defaults.
x_train, x_test = (
    pad_sequences(reviews, maxlen=maxlen) for reviews in (x_train, x_test)
)


# Step 3: Build and Train LSTM Model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout

# Size of the learned vector for each vocabulary entry.
embedding_dim = 128

# Binary sentiment classifier: token ids -> embeddings -> LSTM -> dropout
# -> single sigmoid unit.
model = Sequential([
    # Declare the input shape explicitly instead of passing `input_length`
    # to Embedding: that argument is deprecated in Keras 3 (it only triggers
    # a warning), and without a declared input the model stays unbuilt, so
    # summary() cannot report output shapes or parameter counts.
    Input(shape=(maxlen,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # return_sequences=False: only the final LSTM output feeds the classifier.
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    # One sigmoid unit, paired with binary cross-entropy below.
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [None]:
# Train the Model
# Hyperparameters are gathered in one place so they are easy to spot and tune.
# validation_split=0.2 asks fit() to hold out 20% of the training arrays for
# the per-epoch val_loss / val_accuracy figures.
fit_settings = {
    "epochs": 5,
    "batch_size": 64,
    "validation_split": 0.2,
}

# `history` keeps the object returned by fit() for later inspection.
history = model.fit(x_train, y_train, **fit_settings)


Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 416ms/step - accuracy: 0.6924 - loss: 0.5589 - val_accuracy: 0.8642 - val_loss: 0.3384
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 361ms/step - accuracy: 0.8914 - loss: 0.2872 - val_accuracy: 0.7950 - val_loss: 0.4374
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 387ms/step - accuracy: 0.9059 - loss: 0.2461 - val_accuracy: 0.8604 - val_loss: 0.3327
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 373ms/step - accuracy: 0.9499 - loss: 0.1382 - val_accuracy: 0.8540 - val_loss: 0.3846
Epoch 5/5
[1m 99/313[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:15[0m 353ms/step - accuracy: 0.9652 - loss: 0.1078

# Step 4: Generate Confusion Matrix & Classification Report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Score the held-out test reviews, then turn the sigmoid outputs into hard
# 0/1 labels using a 0.5 cut-off.
test_scores = model.predict(x_test)
y_pred = (test_scores > 0.5).astype("int32")

# Summaries of predicted vs. true labels.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)

# One print call, newline-separated, emits the two headed sections.
print("Confusion Matrix:", cm, "\nClassification Report:", report, sep="\n")


# Step 5: Interpret the Precision-Recall Tradeoff

🔍 Why it Matters in Sentiment Analysis:
Precision: Out of all the reviews predicted as positive, how many were actually positive?

Recall: Out of all the actual positive reviews, how many did we catch?

📌 Real-world Examples:
On a movie review site, you may want high precision so that when a review is flagged as positive, it really is positive (a false positive could lead to recommending a bad movie).

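In a support ticket classifier, high recall on the negative class might be preferred, so that as many negative-sentiment tickets as possible are caught (and unhappy customers don’t get ignored), even if some neutral or positive tickets are flagged by mistake.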

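The two metrics trade off against each other: raising the decision threshold above the 0.5 used in Step 4 generally makes positive predictions more reliable (higher precision) but misses more true positives (lower recall), while lowering it does the opposite. The right balance depends on the use case. That’s why understanding and evaluating both precision and recall is crucial.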