<a href="https://colab.research.google.com/github/MeenaChandrasekar/TensorFlow_2025-/blob/main/IMDB_Text_Classification_Model_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import os
import re
import shutil
import string
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras import layers
from tensorflow.keras import losses


In [17]:
import tensorflow as tf
import os

# Define dataset URL
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the dataset
dataset_path = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url, cache_dir='.', cache_subdir='datasets')

# Extract dataset manually
import tarfile

dataset_extract_path = os.path.join('.', 'datasets', 'aclImdb')
if not os.path.exists(dataset_extract_path):
    with tarfile.open(dataset_path, "r:gz") as tar:
        tar.extractall(path=os.path.join('.', 'datasets'))

# Define dataset directory
dataset_dir = os.path.join('.', 'datasets', 'aclImdb')

# Verify if the dataset exists
if os.path.exists(dataset_dir):
    print("✅ Dataset successfully extracted!")
    print("Dataset Directory:", dataset_dir)
    print("Contents:", os.listdir(dataset_dir))
else:
    print("❌ Dataset extraction failed. Check the path.")


✅ Dataset successfully extracted!
Dataset Directory: ./datasets/aclImdb
Contents: ['README', 'train', 'imdbEr.txt', 'test', 'imdb.vocab']


In [18]:
print("Train Directory:", os.listdir(os.path.join(dataset_dir, "train")))
print("Test Directory:", os.listdir(os.path.join(dataset_dir, "test")))


Train Directory: ['unsupBow.feat', 'labeledBow.feat', 'pos', 'neg', 'urls_neg.txt', 'urls_unsup.txt', 'urls_pos.txt', 'unsup']
Test Directory: ['labeledBow.feat', 'pos', 'neg', 'urls_neg.txt', 'urls_pos.txt']


In [19]:
import shutil

# Define the path for the 'unsup' directory
remove_dir = os.path.join(dataset_dir, 'train', 'unsup')

# Remove the 'unsup' directory if it exists
if os.path.exists(remove_dir):
    shutil.rmtree(remove_dir)
    print("✅ Removed 'unsup' directory!")
else:
    print("⚠️ 'unsup' directory not found, skipping...")


✅ Removed 'unsup' directory!


In [20]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, "train"),
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=seed
)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, "train"),
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=seed
)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, "test"),
    batch_size=batch_size
)

print("✅ Datasets created successfully!")


Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
✅ Datasets created successfully!


In [21]:
import re
import string
from tensorflow.keras import layers

def custom_standardization(input_text):
    # Convert to lowercase
    lowercase = tf.strings.lower(input_text)
    # Remove HTML tags
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # Remove punctuation
    return tf.strings.regex_replace(stripped_html, "[%s]" % re.escape(string.punctuation), "")

# Define text vectorization layer
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length
)

# Apply vectorization to dataset
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

print("✅ Text vectorization ready!")


✅ Text vectorization ready!


In [22]:
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, 16),  # Word embedding layer
    layers.GlobalAveragePooling1D(),  # Pooling to reduce dimensions
    layers.Dense(16, activation="relu"),
    layers.Dense(1, activation="sigmoid")  # Sigmoid for binary classification
])

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

print("✅ Model created successfully!")


✅ Model created successfully!


In [23]:
epochs = 10

history = model.fit(
    raw_train_ds.map(lambda x, y: (vectorize_layer(x), y)),
    validation_data=raw_val_ds.map(lambda x, y: (vectorize_layer(x), y)),
    epochs=epochs
)
print("✅ Model training complete!")


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5982 - loss: 0.6630 - val_accuracy: 0.8312 - val_loss: 0.4344
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8401 - loss: 0.3904 - val_accuracy: 0.8682 - val_loss: 0.3257
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8850 - loss: 0.2913 - val_accuracy: 0.8752 - val_loss: 0.3017
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9041 - loss: 0.2485 - val_accuracy: 0.8776 - val_loss: 0.2942
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.9138 - loss: 0.2223 - val_accuracy: 0.8646 - val_loss: 0.3125
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9138 - loss: 0.2167 - val_accuracy: 0.8796 - val_loss: 0.3006
Epoch 7/10
[1m625/625[0m 

In [24]:
loss, accuracy = model.evaluate(raw_test_ds.map(lambda x, y: (vectorize_layer(x), y)))

print(f"Test Accuracy: {accuracy:.4f}")


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8341 - loss: 0.4343
Test Accuracy: 0.8308


In [25]:
def predict_review(review_text):
    review_vectorized = vectorize_layer(tf.convert_to_tensor([review_text]))
    prediction = model.predict(review_vectorized)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment

print(predict_review("This movie was fantastic! I loved every moment."))
print(predict_review("The movie was terrible. I regret watching it."))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Negative


In [26]:
model.save("Text_classification_IMDB_model.h5")




In [27]:
model.save("Text_classification_IMDB_model.keras")


In [28]:
import tensorflow as tf

loaded_model = tf.keras.models.load_model("Text_classification_IMDB_model.keras")
loaded_model.summary()


  saveable.load_own_variables(weights_store.get(inner_path))


In [29]:
import tensorflow as tf

# Load model without the optimizer issue
loaded_model = tf.keras.models.load_model("Text_classification_IMDB_model.keras")

# Recompile the model (if needed)
loaded_model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

# Summary
loaded_model.summary()


In [30]:
model.save("Text_classification_IMDB_model.keras")
