<a href="https://colab.research.google.com/github/Jitendra4Jalwaniya/tf_learning/blob/main/site/en/tutorials/keras/imdb_classifier_by_chatgpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import os
import tarfile
import urllib.request
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# Step 1: Download and Extract Data
data_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
data_path = "aclImdb.tar.gz"
data_dir = "aclImdb"

if not os.path.exists(data_path):
    print("Downloading dataset...")
    urllib.request.urlretrieve(data_url, data_path)

Downloading dataset...


In [3]:
if not os.path.exists(data_dir):
    print("Extracting dataset...")
    with tarfile.open(data_path, "r:gz") as tar:
        tar.extractall()

Extracting dataset...


In [4]:
# Step 2: Load and Preprocess Data
def load_data(directory):
    data = []
    labels = []
    for label_type in ["pos", "neg"]:
        dir_path = os.path.join(directory, label_type)
        for fname in os.listdir(dir_path):
            if fname.endswith(".txt"):
                with open(os.path.join(dir_path, fname), "r", encoding="utf-8") as f:
                    data.append(f.read())
                    labels.append(1 if label_type == "pos" else 0)
    return data, labels

In [5]:
print("Loading training data...")
train_data, train_labels = load_data(os.path.join(data_dir, "train"))
print("Loading testing data...")
test_data, test_labels = load_data(os.path.join(data_dir, "test"))

Loading training data...
Loading testing data...


In [11]:
import numpy as np

# Convert labels to NumPy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [12]:
# Step 3: Tokenize and Pad Sequences
print("Tokenizing text data...")
max_words = 20000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data)

Tokenizing text data...


In [13]:
X_train = tokenizer.texts_to_sequences(train_data)
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')

X_test = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')

In [14]:
# Step 4: Build the Model
print("Building LSTM model...")
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

Building LSTM model...


In [15]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [16]:
# Step 5: Train the Model
print("Training the model...")
model.fit(X_train, train_labels, epochs=5, batch_size=32, validation_split=0.2)

Training the model...
Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 349ms/step - accuracy: 0.6322 - loss: 0.6551 - val_accuracy: 0.2562 - val_loss: 0.9377
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 347ms/step - accuracy: 0.7137 - loss: 0.5509 - val_accuracy: 0.2924 - val_loss: 0.8985
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 347ms/step - accuracy: 0.8007 - loss: 0.4286 - val_accuracy: 0.6708 - val_loss: 0.6672
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 347ms/step - accuracy: 0.9012 - loss: 0.2659 - val_accuracy: 0.8182 - val_loss: 0.4629
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 361ms/step - accuracy: 0.9413 - loss: 0.1714 - val_accuracy: 0.7642 - val_loss: 0.6900


<keras.src.callbacks.history.History at 0x7ef96cba7af0>

In [17]:
# Step 6: Evaluate the Model
print("Evaluating model...")
loss, accuracy = model.evaluate(X_test, test_labels)
print(f"Accuracy: {accuracy:.2f}")

# Save the Model
model.save("sentiment_lstm_model.h5")
print("Model saved!")

Evaluating model...
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 104ms/step - accuracy: 0.8872 - loss: 0.3114




Accuracy: 0.84
Model saved!
