# 1. Imports

In [7]:
import os
os.environ["HF_HUB_DISABLE_TOKEN_WARNING"] = "1"

from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

import warnings
warnings.simplefilter("ignore")

import random  # Import random module for shuffling data
import torch  # Import PyTorch for tensor computations
import torch.nn as nn  # Import neural network modules
import torch.optim as optim  # Import optimization algorithms
from torch.utils.data import DataLoader, Dataset  # Import PyTorch dataset utilities
from datasets import load_dataset  # Import function to load datasets
from transformers import AutoTokenizer  # Import tokenizer for text processing
from torchinfo import summary  # Import module for model summary

print("Import completed.")

Import completed.


# 2. Load Data and Preprocess Text

In [8]:
# Fetch the IMDB dataset through the Hugging Face datasets library
DATASET_NAME = "imdb"
dataset = load_dataset(DATASET_NAME)

# Load the tokenizer that belongs to the bert-base-uncased checkpoint
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Tokenization callback that receives a batch of examples
def preprocess_text(examples):
    """
    Encode the "text" field of a batch with the BERT tokenizer.

    Each entry is truncated or padded to exactly 256 tokens, so every
    encoded sequence has the same length.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize the dataset, handing examples to preprocess_text in batches
dataset = dataset.map(function=preprocess_text, batched=True)

# 3. Prepare Datasets and Dataloaders

In [9]:
def prepare_data(split, num_samples=5000, seed=42, source=None):
    """
    Draws a reproducible random subset of one split and converts it to tensors.

    Args:
        split: Key of the split to sample from (e.g. "train" or "test").
        num_samples: Maximum number of examples to keep. If the split holds
            fewer examples, all of them are returned.
        seed: Seed for the private shuffling RNG. The default of 42 selects the
            same examples as seeding the global `random` module with 42 would.
        source: Mapping from split name to an indexable collection of examples,
            each providing "input_ids" and "label". Defaults to the
            module-level tokenized `dataset`.

    Returns:
        A list of (input_ids, label) tuples, both stored as torch.long tensors.
    """
    if source is None:
        source = dataset  # Fall back to the tokenized dataset built earlier in the notebook
    dataset_split = source[split]  # Select the dataset split (train or test)

    # A private Random instance keeps the sampling reproducible without
    # resetting the global `random` state for the rest of the notebook.
    rng = random.Random(seed)
    indices = list(range(len(dataset_split)))
    rng.shuffle(indices)

    processed_data = []
    for i in indices[:num_samples]:  # Slicing silently caps at the split size
        ex = dataset_split[i]
        processed_data.append((
            torch.tensor(ex["input_ids"], dtype=torch.long),  # Token ids
            torch.tensor(ex["label"], dtype=torch.long),  # Class label
        ))
    return processed_data

# Sample 5000 examples from each split: the training split first, then the test split
train_data, test_data = (prepare_data(split_name, 5000) for split_name in ("train", "test"))

# Thin Dataset wrapper around an in-memory collection of samples
class IMDBDataset(Dataset):
    """Dataset that serves already-prepared samples by index."""

    def __init__(self, data):
        """
        Keeps a reference to the prepared samples.
        """
        self.data = data

    def __getitem__(self, idx):
        """
        Looks up and returns the sample stored at position `idx`.
        """
        sample = self.data[idx]
        return sample

    def __len__(self):
        """
        Reports how many samples the dataset holds.
        """
        count = len(self.data)
        return count

# Wrap the sampled examples so that DataLoader can index them
train_dataset, test_dataset = IMDBDataset(train_data), IMDBDataset(test_data)

# Number of examples per batch, shared by both loaders
BATCH_SIZE = 32

# Training batches are shuffled; test batches keep their original order
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=BATCH_SIZE)

print("Data preparation completed.")

Data preparation completed.


# 4. Define the TextCNN Model

In [10]:
# Hyperparameters for the TextCNN model and its training run
VOCAB_SIZE = tokenizer.vocab_size  # Number of rows in the embedding table, read from the tokenizer
EMBEDDING_DIM = 128  # Length of each token's embedding vector
NUM_CLASSES = 2  # Number of output classes (positive/negative sentiment, labels 0 and 1)
FILTER_SIZES = [3, 4, 5]  # Convolution window heights in tokens; one conv layer is built per entry
NUM_FILTERS = 10  # Output channels of each convolutional layer
NUM_EPOCHS = 3  # Number of passes over the training data made by train_model()

class TextCNN(nn.Module):
    """
    Convolutional text classifier: embedding -> parallel convolutions ->
    max pooling over the sequence -> dropout -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_classes: Number of output logits.
        filter_sizes: Iterable of convolution window heights (in tokens); one
            convolution layer is created per entry.
        num_filters: Output channels of every convolution layer.
        dropout: Dropout probability applied to the pooled features
            (default 0.5, the value that used to be hard-coded).
        padding_idx: Optional token id whose embedding is fixed to zeros and
            receives no gradient updates (default None: no padding row).
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, filter_sizes, num_filters,
                 dropout=0.5, padding_idx=None):
        """
        Builds the embedding, convolutional, fully connected and dropout layers.
        It does not process input data but sets up the model structure.
        """
        super().__init__()
        filter_sizes = list(filter_sizes)  # Accept any iterable; len() is needed below
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # Each kernel spans the full embedding width, so it slides only along the sequence
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Computes class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len). seq_len must be at
                least the largest filter size, because the convolutions use
                no padding.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.
        """
        embedded = self.embedding(x).unsqueeze(1)  # (batch, 1, seq_len, embedding_dim)
        pooled = []
        for conv in self.convs:
            feature_map = torch.relu(conv(embedded)).squeeze(3)  # (batch, num_filters, positions)
            pooled.append(feature_map.max(dim=2).values)  # Strongest response per filter
        features = torch.cat(pooled, dim=1)  # (batch, len(filter_sizes) * num_filters)
        features = self.dropout(features)
        return self.fc(features)

# Seed PyTorch's global RNG so that weight initialization, DataLoader shuffling
# and dropout masks are repeatable from run to run. Some GPU kernels may still
# introduce small nondeterminism.
torch.manual_seed(42)

# Initialize the TextCNN model with predefined parameters
model = TextCNN(VOCAB_SIZE, EMBEDDING_DIM, NUM_CLASSES, FILTER_SIZES, NUM_FILTERS)

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check device availability
model.to(device)  # Move model to selected device; as the cell's last expression it also displays the architecture

TextCNN(
  (embedding): Embedding(30522, 128)
  (convs): ModuleList(
    (0): Conv2d(1, 10, kernel_size=(3, 128), stride=(1, 1))
    (1): Conv2d(1, 10, kernel_size=(4, 128), stride=(1, 1))
    (2): Conv2d(1, 10, kernel_size=(5, 128), stride=(1, 1))
  )
  (fc): Linear(in_features=30, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

# 5. Train and Evaluate the Model

In [11]:
# Training objective and parameter-update rule
criterion = nn.CrossEntropyLoss()  # Cross-entropy computed from the model outputs and integer labels
optimizer = optim.Adam(params=model.parameters(), lr=0.001)  # Adam over all model parameters

def train_model():
    """
    Runs NUM_EPOCHS passes over train_loader, updating the global model in place.

    After every epoch, prints the loss averaged over that epoch's batches.
    """
    model.train()  # Put the model in training mode
    for epoch in range(NUM_EPOCHS):
        total_loss = 0  # Sum of per-batch losses for this epoch
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()  # Clear gradients from the previous step
            loss = criterion(model(inputs), labels)  # Forward pass and loss
            loss.backward()  # Backpropagate
            optimizer.step()  # Apply the parameter update

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")

def evaluate_model():
    """
    Measures the global model's classification accuracy on test_loader and prints it.
    """
    model.eval()  # Put the model in evaluation mode
    correct, total = 0, 0  # Running counts of correct predictions and seen samples
    with torch.no_grad():  # No gradients are needed for evaluation
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            predictions = logits.argmax(dim=1)  # Index of the highest score per sample
            correct += torch.sum(predictions == labels).item()
            total += labels.shape[0]
    print(f"Test Accuracy: {correct / total:.4f}")

# Run training first, then report accuracy on the test subset.
# Both functions only print their results and return nothing.
train_model()
evaluate_model()

Epoch 1/3, Loss: 0.7829
Epoch 2/3, Loss: 0.6722
Epoch 3/3, Loss: 0.6306
Test Accuracy: 0.7124


# 6. Improving Model Performance

Some hints:
* Hyperparameter tuning, such as increasing or decreasing batch size
* Experiment with different filter sizes and number of filters
* Train for more epochs and observe whether test accuracy keeps improving or the model starts to overfit