In [1]:
!pip install tf-models-official


Collecting tf-models-official
  Downloading tf_models_official-2.18.0-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting Cython (from tf-models-official)
  Downloading Cython-3.0.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting ai-edge-litert>=1.0.1 (from tf-models-official)
  Downloading ai_edge_litert-1.2.0-cp311-cp311-manylinux_2_17_x86_64.whl.metadata (1.6 kB)
Collecting gin-config (from tf-models-official)
  Downloading gin_config-0.5.0-py3-none-any.whl.metadata (2.9 kB)
Collecting google-api-python-client>=1.6.7 (from tf-models-official)
  Downloading google_api_python_client-2.164.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting oauth2client (from tf-models-official)
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting opencv-python-headless (from tf-models-official)
  Downloading opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting py-cpuinf

In [2]:
import tensorflow as tf
import os
import urllib.request
import tarfile

# Define the dataset URL
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_path = "aclImdb_v1.tar.gz"

# Download the dataset
urllib.request.urlretrieve(url, dataset_path)

# Extract the dataset
with tarfile.open(dataset_path, "r:gz") as tar:
    tar.extractall()

# Verify the extracted files
os.listdir("aclImdb")

# Define dataset path
dataset_path = os.path.join(os.getcwd(), "aclImdb/")

# Function to count files in a directory
def count_files_in_directory(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        count += len(files)
    return count

# Count files in each class directory
train_neg = count_files_in_directory(os.path.join(dataset_path, "train/neg"))
train_pos = count_files_in_directory(os.path.join(dataset_path, "train/pos"))
train_unsup = count_files_in_directory(os.path.join(dataset_path, "train/unsup"))
test_neg = count_files_in_directory(os.path.join(dataset_path, "test/neg"))
test_pos = count_files_in_directory(os.path.join(dataset_path, "test/pos"))

# Print the counts
print("Original dataset structure:")
print(f"Training - Negative reviews: {train_neg} files")
print(f"Training - Positive reviews: {train_pos} files")
print(f"Training - Unsupervised reviews: {train_unsup} files")
print(f"Test - Negative reviews: {test_neg} files")
print(f"Test - Positive reviews: {test_pos} files")

# Calculate total files
total_train = train_neg + train_pos + train_unsup
total_test = test_neg + test_pos

print(f"\nTotal training files: {total_train}")
print(f"Total test files: {total_test}")

Original dataset structure:
Training - Negative reviews: 12500 files
Training - Positive reviews: 12500 files
Training - Unsupervised reviews: 50000 files
Test - Negative reviews: 12500 files
Test - Positive reviews: 12500 files

Total training files: 75000
Total test files: 25000


In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import os
import urllib.request
import tarfile

# Define the dataset URL
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_path = "aclImdb_v1.tar.gz"

# Download the dataset if not already downloaded
if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(url, dataset_path)

# Extract the dataset if not already extracted
if not os.path.exists("aclImdb"):
    with tarfile.open(dataset_path, "r:gz") as tar:
        tar.extractall()

# Verify the extracted files
print("Extracted directories:", os.listdir("aclImdb"))

# Define parameters
batch_size = 32
seed = 42

# Define dataset path
dataset_path = os.path.join(os.getcwd(), "aclImdb/")

# Load training and validation datasets
def load_dataset():
    # Load full training data
    train_ds_full = tf.keras.preprocessing.text_dataset_from_directory(
        os.path.join(dataset_path, "train"),
        batch_size=batch_size,
        label_mode='int',
        class_names=['neg', 'pos'],
        shuffle=True,
        seed=seed,
        validation_split=0.2,  # Add validation split
        subset='training'      # Specify this is training subset
    )

    # Create validation dataset
    val_ds = tf.keras.preprocessing.text_dataset_from_directory(
        os.path.join(dataset_path, "train"),
        batch_size=batch_size,
        label_mode='int',
        class_names=['neg', 'pos'],
        shuffle=True,
        seed=seed,
        validation_split=0.2,  # Same split
        subset='validation'    # Specify this is validation subset
    )

    # Load test data
    test_ds = tf.keras.preprocessing.text_dataset_from_directory(
        os.path.join(dataset_path, "test"),
        batch_size=batch_size,
        label_mode='int',
        class_names=['neg', 'pos'],
        shuffle=True,
        seed=seed
    )

    return train_ds_full, val_ds, test_ds

train_ds, val_ds, test_ds = load_dataset()

# Print dataset sizes
train_size = tf.data.experimental.cardinality(train_ds).numpy() * batch_size
val_size = tf.data.experimental.cardinality(val_ds).numpy() * batch_size
test_size = tf.data.experimental.cardinality(test_ds).numpy() * batch_size

print(f"Training dataset size: {train_size} samples")
print(f"Validation dataset size: {val_size} samples")
print(f"Test dataset size: {test_size} samples")

Extracted directories: ['imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']
Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Training dataset size: 20000 samples
Validation dataset size: 5024 samples
Test dataset size: 25024 samples


In [4]:
# Calculate dataset sizes
train_size = sum(1 for _ in train_ds)
val_size = sum(1 for _ in val_ds)
test_size = sum(1 for _ in test_ds)

# Get batch shape
for text_batch, label_batch in train_ds.take(1):
    batch_text_shape = text_batch.shape
    batch_label_shape = label_batch.shape

# Display information
print(f"Training dataset batches: {train_size}, samples: {train_size * batch_size}")
print(f"Validation dataset batches: {val_size}, samples: {val_size * batch_size}")
print(f"Test dataset batches: {test_size}, samples: {test_size * batch_size}")
print(f"Batch text shape: {batch_text_shape}")
print(f"Batch label shape: {batch_label_shape}")
print(f"Number of classes: {len(train_ds.class_names)}")
print(f"Class names: {train_ds.class_names}")

Training dataset batches: 625, samples: 20000
Validation dataset batches: 157, samples: 5024
Test dataset batches: 782, samples: 25024
Batch text shape: (32,)
Batch label shape: (32,)
Number of classes: 2
Class names: ['neg', 'pos']


In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras import layers
import tensorflow.keras.optimizers as optimizers

# Define the URLs for the BERT model and preprocessing
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# Load the BERT preprocessing model
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

# Function to preprocess the dataset
def preprocess_text(text, label):
    return bert_preprocess_model(text), label

# Apply preprocessing to the datasets
train_ds = train_ds.map(lambda text, label: preprocess_text(text, label), num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(lambda text, label: preprocess_text(text, label), num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.map(lambda text, label: preprocess_text(text, label), num_parallel_calls=tf.data.AUTOTUNE)

# Custom layer to wrap the BERT encoder
class BertEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, tfhub_handle_encoder, **kwargs):
        super(BertEncoderLayer, self).__init__(**kwargs)
        self.encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

    def call(self, inputs):
        return self.encoder(inputs)

# Function to build the classifier model
def build_classifier_model(tfhub_handle_encoder):
    input_word_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_type_ids')

    encoder_inputs = {
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids
    }

    bert_layer = BertEncoderLayer(tfhub_handle_encoder)
    encoder_outputs = bert_layer(encoder_inputs)

    pooled_output = encoder_outputs['pooled_output']

    # Add hidden layer
    hidden = tf.keras.layers.Dense(64, activation='relu')(pooled_output)
    dropout = tf.keras.layers.Dropout(0.3)(hidden)
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(dropout)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    return model

# Build the classifier model
model = build_classifier_model(tfhub_handle_encoder)

# Calculate steps_per_epoch
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
print(f"Steps per epoch: {steps_per_epoch}")

# Define number of training steps
epochs = 3
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)

# Create an optimizer with warmup
optimizer = optimizers.AdamW(
    learning_rate=1e-5,
    weight_decay=0.01,
    epsilon=1e-08
)

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

# Define callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
    tf.keras.callbacks.ModelCheckpoint(
        filepath='best_model.keras',
        save_best_only=True,
        monitor='val_accuracy'
    )
]

# Optional: Print model summary
model.summary()

Steps per epoch: 625


In [None]:
# Train the model
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks,
    steps_per_epoch=steps_per_epoch
)

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2042s[0m 3s/step - accuracy: 0.5596 - loss: 0.6944 - val_accuracy: 0.6840 - val_loss: 0.6078
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2024s[0m 3s/step - accuracy: 0.6725 - loss: 0.6073 - val_accuracy: 0.7192 - val_loss: 0.5727
Epoch 3/5
[1m348/625[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m11:56[0m 3s/step - accuracy: 0.7009 - loss: 0.5804

In [None]:
import tensorflow as tf
import os
from google.colab import files

# Define save path in Colab
save_dir = "/content/bert_finetuned_model"
os.makedirs(save_dir, exist_ok=True)

# Save in different formats
model.save(os.path.join(save_dir, "bert_finetuned.h5"))  # HDF5 format
model.save(os.path.join(save_dir, "bert_finetuned.keras"))  # Keras format
model.save(os.path.join(save_dir, "saved_model"))  # TensorFlow SavedModel format

# Download the HDF5 model to your PC
files.download(os.path.join(save_dir, "bert_finetuned.h5"))

# Download the Keras model to your PC
files.download(os.path.join(save_dir, "bert_finetuned.keras"))

# Zip the TensorFlow SavedModel (since folders can't be downloaded directly)
!zip -r /content/saved_model.zip /content/bert_finetuned_model/saved_model
files.download("/content/saved_model.zip")
