# Clickbait Detection with Transformer Models

This project builds and evaluate transformer-based models for detecting clickbait headlines. Clickbait refers to content with misleading or sensationalized headlines designed primarily to attract attention and encourage visitors to click on a link, often at the expense of accuracy or quality. Detecting clickbait automatically is an important NLP task with applications in content moderation and media literacy.


In [None]:
# install packages
!pip install transformers datasets wandb



In [None]:


# Import PyTorch
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import random
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset

# For experiment tracking
import wandb

# Fix the random seed for reproducability
torch.random.manual_seed(8942764)
torch.cuda.manual_seed(8942764)
np.random.seed(8942764)

In [None]:


#set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Using device: {device}")


Using device: cuda


## Utility Functions
These functions help with model setup, data processing, training, and evaluation.



In [None]:
# Function to load the dataset
def load_data():
    """
    Load the clickbait dataset from Hugging Face

    Returns:
        dataset: A dataset dictionary containing train, validation, and test splits
    """

    dataset = load_dataset("christinacdl/clickbait_notclickbait_dataset",
                      data_files={"train": "train.json",
                                  "test": "test.json",
                                  "validation": "val.json"})
    return dataset



In [None]:
# Function to initialize and return instance of Autotokenizer with the given model name
def get_tokenizer(model_name):
    """
    Get the appropriate tokenizer for the given model name

    Args:
        model_name: Name of the pre-trained model (e.g., 'bert-base-uncased')

    Returns:
        tokenizer: The tokenizer for the specified model
    """
 
    # Load and return the pre-trained Autotokenizer for the specified model name

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return tokenizer

In [7]:
# Tokenization function for data processing
def tokenize(batch, tokenizer):
    """
    Transform text data to tokenized format for model input

    Args:
        batch: Batch of examples from the dataset
        tokenizer: Tokenizer to use for encoding

    Returns:
        Dict with tokenized inputs and labels
    """
    sentences = [x['text'] for x in batch]
    labels = torch.LongTensor([x['label'] for x in batch])
    new_batch = dict(tokenizer(sentences, padding=True, truncation=True, return_tensors="pt"))
    new_batch['label'] = labels
    return new_batch

In [None]:
# Function to initialize wandb for experiment tracking
def init_wandb(config, project_name):
    """Initialize wandb with given config"""
    
    wandb.init(
        project=project_name,
        config=config
    )
    return wandb.config

In [None]:
# Training function
def train(model,
          train_dataset,
          val_dataset,
          num_epochs,
          batch_size,
          optimizer_cls,
          lr,
          weight_decay,
          device,
          tokenizer,
          use_wandb=False):
    """
    Train the model and track with wandb if specified

    Args:
        model: Model to train
        train_dataset: Training dataset
        val_dataset: Validation dataset
        num_epochs: Number of epochs to train for
        batch_size: Batch size for training
        optimizer_cls: Name of optimizer to use ('SGD', 'Adam', 'AdamW')
        lr: Learning rate
        weight_decay: Weight decay for regularization
        device: Device to train on
        tokenizer: Tokenizer for processing inputs
        use_wandb: Whether to log metrics to wandb

    Returns:
        Tuple of (trained model, training history)
    """
    #  Set the model to training mode and move it to the specified device
    model.train()
    model.to(device)


    dataloader = DataLoader(train_dataset, batch_size, shuffle=True,
                          collate_fn=lambda batch: tokenize(batch, tokenizer))

    #Initialize the optimizers based on the optimizer_cls parameter, with the specified learning rates and weight decays.
    if optimizer_cls == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_cls == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_cls == 'AdamW':
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []

    lossfn = nn.CrossEntropyLoss()  # Using CrossEntropyLoss which expects logits

    global_step = 0

    for e in range(num_epochs):
        epoch_loss_history = []
        epoch_acc_history = []

        # Training loop
        model.train()
        for i, batch in enumerate(dataloader):
            batch = {k:v.to(device) for k,v in batch.items() if isinstance(v, torch.Tensor)}
            y = batch.pop('label')

            #  Implement forward pass and loss computation
            # 1. Pass the batch through the model to get logits
            logits = model(**batch)
            # 2. Calculate the loss using lossfn
            loss = lossfn(logits, y)
            # 3. Calculate the accuracy (percentage of correct predictions)
            acc = (logits.argmax(dim=-1) == y).float().mean().item()
            # 4. Append each epoch's loss and accuracy to epoch_loss_history and epoch_acc_history
            epoch_loss_history.append(loss.item())
            epoch_acc_history.append(acc)


            global_step += 1

            # Print every 100 steps
            if global_step % 100 == 0:
                print(f'Epoch: {e+1}, Step: {global_step}, Train Loss: {epoch_loss_history[-1]:.3e}, Train Accuracy: {epoch_acc_history[-1]:.3f}')

                # Log batch metrics to WandB
                if use_wandb:
                    wandb.log({
                        "global_step": global_step,  # Correct step tracking
                        "train_loss_step": epoch_loss_history[-1],  # Current batch loss
                        "train_accuracy_step": epoch_acc_history[-1],  # Current batch accuracy
                        "epoch": e + 1,
                    })

            # Implement backward pass and optimization step
            # 1. Zero the gradients
            optimizer.zero_grad()
            # 2. Backpropagate the loss
            loss.backward()
            # 3. Update the model parameters using the optimizer
            optimizer.step()

        # Evaluation on validation set
        # Set the model to Evaluation mode
        model.eval()
        val_loss, val_acc, _, _, _ = evaluate(model, val_dataset, batch_size, device, tokenizer)

        train_loss_history.append(np.mean(epoch_loss_history))
        train_acc_history.append(np.mean(epoch_acc_history))
        val_loss_history.append(val_loss)
        val_acc_history.append(val_acc)

        print(f'epoch: {e + 1}\t train_loss: {train_loss_history[-1]:.3e}\t train_accuracy:{train_acc_history[-1]:.3f}\t val_loss: {val_loss_history[-1]:.3e}\t val_accuracy:{val_acc_history[-1]:.3f}')

        # Log metrics to wandb if enabled
        if use_wandb:
            wandb.log({
                "epoch": e + 1,
                "train_loss": train_loss_history[-1],
                "train_accuracy": train_acc_history[-1],
                "val_loss": val_loss_history[-1],
                "val_accuracy": val_acc_history[-1]
            })

    return model, (train_loss_history, train_acc_history, val_loss_history, val_acc_history)


In [None]:
# Evaluation function
@torch.no_grad()
def evaluate(model, dataset, batch_size, device, tokenizer):
    """
    Evaluate model on dataset

    Args:
        model: Model to evaluate
        dataset: Dataset to evaluate on
        batch_size: Batch size for evaluation
        device: Device to run evaluation on
        tokenizer: Tokenizer for processing inputs

    Returns:
        Tuple of (loss, accuracy, predictions, labels, logits)
    """
    # Set the model to evaluation mode and move it to the specified device
    model.eval()
    model.to(device)

    dataloader = DataLoader(dataset, batch_size, shuffle=False,
                           collate_fn=lambda batch: tokenize(batch, tokenizer))
    lossfn = nn.CrossEntropyLoss()  # Using CrossEntropyLoss which expects logits

    loss_history = []
    acc_history = []
    all_preds = []
    all_labels = []
    all_logits = []

    for i, batch in enumerate(dataloader):
        batch = {k:v.to(device) for k,v in batch.items() if isinstance(v, torch.Tensor)}
        y = batch.pop('label')

        # Implement the evaluation loop
        # Loop through batches in the dataloader
        for k, v in batch.items():
            batch[k] = v.to(device)
        # 1. Get model predictions by passing in the batch (logits)
        logits = model(**batch)
        # 2. Calculate loss
        loss = lossfn(logits, y)
        # 3. Get the predictions from the logits in the variable pred
        pred = logits.argmax(dim=-1)


        acc = (pred == y).float().mean()

        all_preds.extend(pred.cpu().numpy())
        all_labels.extend(y.cpu().numpy())
        all_logits.extend(logits.cpu().numpy())

        loss_history.append(loss.item())
        acc_history.append(acc.item())

    # Calculate and return the evaluation metrics
    # Return the mean loss, mean accuracy, all predictions, all labels, and all logits
    return np.mean(loss_history), np.mean(acc_history), all_preds, all_labels, all_logits



In [None]:
# Function to load a test set and generate predictions
def predict_on_test_set(model, tokenizer, test_file_path, output_file_path, device):
    """
    Generate predictions on a test set and save to file

    Args:
        model: Trained model
        tokenizer: Tokenizer for the model
        test_file_path: Path to the test data file
        output_file_path: Path to save predictions
        device: Device to run inference on
    """
    # Load test data
    with open(test_file_path, 'r') as f:
        test_data = json.load(f)

    print(f"Loaded {len(test_data)} examples from {test_file_path}")

    # Make predictions
    #  Set the model to evaluation mode
    model.eval()
    model.to(device)

    # Initialize an empty list to store predictions
    predictions = []

    for item in test_data:
        # Tokenize the text and move to the correct device as variable inputs
        inputs = tokenizer(item['text'], padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get predictions
        #  Disable gradient propagation
        with torch.no_grad():
        #  Pass the inputs to the model to get logits
            logits = model(**inputs)
        #  Get the prediction from logits
            pred = logits.argmax(dim=-1).item()

        # Store prediction as a string (0 or 1)
        predictions.append(str(pred))

    # Write predictions to file - one prediction per line
    with open(output_file_path, 'w') as f:
        f.write('\n'.join(predictions))

    print(f"Predictions saved to {output_file_path}")

## Model Architecture
This section defines the model architecture for transformer-based text classification.


In [None]:
# Base Transformer Model class for text classification
class TransformerForTextClassification(nn.Module):
    def __init__(self, model_name, num_classes, freeze_base=False, hidden_size=128, num_layers=1):
        """
        Transformer model with a classification head

        Args:
            model_name: Name of the base transformer model (e.g., 'bert-base-uncased')
            num_classes: Number of output classes
            freeze_base: Whether to freeze the base model parameters
            hidden_size: Size of the hidden layers in the classifier
            num_layers: Number of hidden layers in the classifier
        """
        super().__init__()

        self.base_model = AutoModel.from_pretrained(model_name)


        # Freeze base model if specified
        self.base_model.requires_grad_(not freeze_base)

        if not freeze_base:
            self.unfreeze_top_k_layers(5)

        # Get the hidden size from the base model config
        base_hidden_size = self.base_model.config.hidden_size

        # Build classifier with variable number of hidden layers
        if num_layers == 1:
            self.classifier = nn.Sequential(
                # Build classifier with variable number of hidden layers
                # For num_layers=1, create a classifier with:
                #   - Linear layer from base_hidden_size to hidden_size
                torch.nn.Linear(base_hidden_size, hidden_size),
                #   - ReLU activation
                torch.nn.ReLU(),
                #   - Linear layer from hidden_size to num_classes
                torch.nn.Linear(hidden_size, num_classes)
            )
        elif num_layers == 2:
            self.classifier = nn.Sequential(
                #  For num_layers=2, add an additional hidden layer
                torch.nn.Linear(base_hidden_size, hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_size, hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_size, num_classes)
            )
        elif num_layers == 3:
            self.classifier = nn.Sequential(
                # For num_layers=3, add two additional hidden layers
                torch.nn.Linear(base_hidden_size, hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_size, hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_size, hidden_size),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_size, num_classes)
            )
        else:
            raise ValueError(f"Unsupported number of layers: {num_layers}")

    def unfreeze_top_k_layers(self, k=5):
        """
        Unfreezes the top k layers of a Transformer model (BERT or ModernBERT).

        Parameters:
            model: The Transformer model (e.g., BERT or ModernBERT).
            k: Number of top layers to unfreeze (default is 5).
        """
        # First, freeze all layers in the base model
        for param in self.base_model.parameters():
            param.requires_grad = False

        # Detect whether the model uses standard BERT or ModernBERT
        if hasattr(self.base_model, "encoder"):  # Standard BERT
            layers = self.base_model.encoder.layer
        elif hasattr(self.base_model, "layers"):  # ModernBERT
            layers = self.base_model.layers
        else:
            raise ValueError("Unrecognized model architecture: Cannot find encoder layers.")

        # Get the total number of layers
        total_layers = len(layers)

        # Unfreeze the last k layers
        for i in range(total_layers - k, total_layers):
            for param in layers[i].parameters():
                # Unfreeze this layer
                param.requires_grad = True

        print(f"Unfroze the last {k} layers out of {total_layers} total layers.")

    def forward(self, **base_model_kwargs):
        """Forward pass through the model"""
        outputs = self.base_model(**base_model_kwargs)
        # Use the pooled output for classification
        pooled_output = outputs.last_hidden_state[:, 0, :]

        # Return logits (not probabilities)
        logits = self.classifier(pooled_output)
        return logits

In [13]:
# Function to create model with specified architecture
def get_model(model_name, num_classes, freeze_base=False, hidden_size=128, num_layers=1):
    """Create model with specified architecture"""
    return TransformerForTextClassification(
        model_name=model_name,
        num_classes=num_classes,
        freeze_base=freeze_base,
        hidden_size=hidden_size,
        num_layers=num_layers
    )

### Dataset
We'll be using a dataset of headlines labeled as either clickbait (1) or not clickbait (0). The dataset comes from Hugging Face and includes training, validation, and test splits. You'll have an opportunity to explore the data distribution and characteristics before building your models.


In [14]:
# Load dataset
dataset = load_data()
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 43802
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8760
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2191
    })
})


In [15]:
# Look at some examples
print("\nExamples from training set:")
for i in range(3):
    print(f"Example {i}: {dataset['train'][i]}")

print("\nExamples from validation set:")
for i in range(3):
    print(f"Example {i}: {dataset['validation'][i]}")


Examples from training set:
Example 0: {'text': 'Alphabet Scraps Plan to Blanket Globe With Internet Balloons', 'label': 0}
Example 1: {'text': 'US Boy Scouts and hikers airlifted from wildfire in Utah', 'label': 0}
Example 2: {'text': "Here's What Happened When I Road Tripped Around Southern California For A Week", 'label': 1}

Examples from validation set:
Example 0: {'text': '27 Happy Gifts For People Who Love Jamaica', 'label': 1}
Example 1: {'text': 'How Adulthood Happens ', 'label': 1}
Example 2: {'text': 'President Donald Trump Has Historically Low Approval Ratings As He Nears 100-Day Mark', 'label': 0}


In [16]:
# Look at class distribution
train_labels = [example['label'] for example in dataset['train']]
val_labels = [example['label'] for example in dataset['validation']]
test_labels = [example['label'] for example in dataset['test']]

print("\nClass distribution:")
print(f"Training set: Clickbait: {train_labels.count(1)}, Not clickbait: {train_labels.count(0)}")
print(f"Validation set: Clickbait: {val_labels.count(1)}, Not clickbait: {val_labels.count(0)}")
print(f"Test set: Clickbait: {test_labels.count(1)}, Not clickbait: {test_labels.count(0)}")



Class distribution:
Training set: Clickbait: 16257, Not clickbait: 27545
Validation set: Clickbait: 813, Not clickbait: 1378
Test set: Clickbait: 3252, Not clickbait: 5508


## Explore tokenization

In [17]:
# Explore tokenization
bert_tokenizer = get_tokenizer("bert-base-uncased")
modernbert_tokenizer = get_tokenizer("answerdotai/ModernBERT-base")

In [18]:
print("\nTokenization examples:")
example_text = dataset['train'][8]['text']
print(f"Original text: '{example_text}'")
print(f"BERT tokenization: {bert_tokenizer.tokenize(example_text)}")
print(f"ModernBERT tokenization: {modernbert_tokenizer.tokenize(example_text)}")


Tokenization examples:
Original text: '15 Things You Never Noticed About Owning A Cat'
BERT tokenization: ['15', 'things', 'you', 'never', 'noticed', 'about', 'owning', 'a', 'cat']
ModernBERT tokenization: ['15', 'ĠThings', 'ĠYou', 'ĠNever', 'ĠNot', 'iced', 'ĠAbout', 'ĠOw', 'ning', 'ĠA', 'ĠCat']


##  Model Selection

compare the performance of two different transformer architectures for the clickbait detection task:

1. **BERT (bert-base-uncased)**: A widely-used transformer model developed by Google that has been pre-trained on a large corpus of English text.

2. **ModernBERT (answerdotai/ModernBERT-base)**: A more recent transformer variant that has been trained on newer text data and may have better performance on contemporary language patterns.

train and evaluate both models with the same baseline configuration to determine which architecture provides a stronger foundation for our clickbait detection system.

This comparison will help us understand:
- Which model better captures the linguistic patterns characteristic of clickbait
- Whether the newer ModernBERT has advantages over the classic BERT architecture for this specific application

select the better-performing model to use as the foundation for further refinement.

In [None]:
def run_model_selection():
    """Run model selection task comparing BERT and RoBERTa"""

    # Define models to compare
    model_configs = [
        {
            "name": "bert-base-uncased",
            "display_name": "BERT",
            "freeze_base": True
        },
        {
            "name": "answerdotai/ModernBERT-base",
            "display_name": "ModernBERT",
            "freeze_base": True
        }
    ]

    # Training parameters
    train_params = {
        "num_epochs": 3,
        "batch_size": 32,
        "optimizer_cls": "Adam",
        "lr": 1e-3,
        "weight_decay": 1e-4,
        "hidden_size": 128,
        "num_layers": 1
    }

    results = []

    for config in model_configs:
        model_name = config["name"]
        display_name = config["display_name"]
        print(f"\n{'='*50}")
        print(f"Training and evaluating {display_name} model")
        print(f"{'='*50}")

        tokenizer = get_tokenizer(model_name)

        # Initialize wandb
        wandb_config = {**config, **train_params}
        init_wandb(wandb_config, "clickbait-detection-task1")

        # Create model
        model = get_model(
            model_name=model_name,
            num_classes=2,
            freeze_base=config["freeze_base"],
            hidden_size=train_params["hidden_size"],
            num_layers=train_params["num_layers"]
        )

        # Print the number of trainable parameters in the model
        num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Model trainable parameters: {num_params}")

        # Train model
        model, logs = train(
            model=model,
            train_dataset=dataset['train'],
            val_dataset=dataset['validation'],
            num_epochs=train_params["num_epochs"],
            batch_size=train_params["batch_size"],
            optimizer_cls=train_params["optimizer_cls"],
            lr=train_params["lr"],
            weight_decay=train_params["weight_decay"],
            device=device,
            tokenizer=tokenizer,
            use_wandb=True
        )

        # Evaluate on validation set
        val_loss, val_acc, _, _, _ = evaluate(
            model=model,
            dataset=dataset['validation'],
            batch_size=train_params["batch_size"],
            device=device,
            tokenizer=tokenizer
        )

        # Record results
        results.append({
            "model_name": model_name,
            "display_name": display_name,
            "val_accuracy": val_acc,
            "val_loss": val_loss,
            "logs": logs,
            "model": model,
            "tokenizer": tokenizer
        })

        wandb.finish()

    # Compare results
    print("\nModel Selection Results:")
    print(f"{'Model':<10} {'Validation Accuracy':<20} {'Validation Loss':<15}")
    print("-" * 45)

    for result in results:
        print(f"{result['display_name']:<10} {result['val_accuracy']:.4f}{' '*15} {result['val_loss']:.4f}")

    return results


In [20]:
# Run model selection
task1_results = run_model_selection()


Training and evaluating BERT model


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmariamsu[0m ([33mmariamsu-carnegie-mellon-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Model trainable parameters: 98690
Epoch: 1, Step: 100, Train Loss: 3.538e-01, Train Accuracy: 0.844
Epoch: 1, Step: 200, Train Loss: 2.389e-01, Train Accuracy: 0.906
Epoch: 1, Step: 300, Train Loss: 2.750e-01, Train Accuracy: 0.938
Epoch: 1, Step: 400, Train Loss: 1.916e-01, Train Accuracy: 0.938
Epoch: 1, Step: 500, Train Loss: 1.992e-01, Train Accuracy: 0.969
Epoch: 1, Step: 600, Train Loss: 3.457e-01, Train Accuracy: 0.844
Epoch: 1, Step: 700, Train Loss: 3.239e-01, Train Accuracy: 0.844
Epoch: 1, Step: 800, Train Loss: 1.829e-01, Train Accuracy: 0.938
Epoch: 1, Step: 900, Train Loss: 3.100e-01, Train Accuracy: 0.875
Epoch: 1, Step: 1000, Train Loss: 2.143e-01, Train Accuracy: 0.844
Epoch: 1, Step: 1100, Train Loss: 2.096e-01, Train Accuracy: 0.906
Epoch: 1, Step: 1200, Train Loss: 2.914e-01, Train Accuracy: 0.906
Epoch: 1, Step: 1300, Train Loss: 3.348e-01, Train Accuracy: 0.844
epoch: 1	 train_loss: 2.907e-01	 train_accuracy:0.884	 val_loss: 2.819e-01	 val_accuracy:0.895
Epoch: 2,

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
train_accuracy,▁▇█
train_accuracy_step,▂▅▆▆▇▂▂▆▃▂▅▅▂▃▃▆▇▅▁▇▇▅▂▂▃▃▃█▆▅▃▅▆▆▆▅▂▅▁▁
train_loss,█▃▁
train_loss_step,▅▃▃▂▂▅▄▂▄▃▂▄▄▄▄▃▃▂█▂▁▃▅▄▅▆▃▂▂▃▃▅▂▂▂▄▃▂▄▅
val_accuracy,▆▁█
val_loss,█▂▁

0,1
epoch,3.0
global_step,4100.0
train_accuracy,0.89525
train_accuracy_step,0.8125
train_loss,0.26298
train_loss_step,0.4047
val_accuracy,0.89662
val_loss,0.27334



Training and evaluating ModernBERT model


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Model trainable parameters: 98690
Epoch: 1, Step: 100, Train Loss: 6.226e-01, Train Accuracy: 0.750
Epoch: 1, Step: 200, Train Loss: 5.580e-01, Train Accuracy: 0.719
Epoch: 1, Step: 300, Train Loss: 2.003e-01, Train Accuracy: 0.969
Epoch: 1, Step: 400, Train Loss: 3.300e-01, Train Accuracy: 0.906
Epoch: 1, Step: 500, Train Loss: 2.981e-01, Train Accuracy: 0.844
Epoch: 1, Step: 600, Train Loss: 2.607e-01, Train Accuracy: 0.844
Epoch: 1, Step: 700, Train Loss: 3.093e-01, Train Accuracy: 0.906
Epoch: 1, Step: 800, Train Loss: 1.684e-01, Train Accuracy: 0.938
Epoch: 1, Step: 900, Train Loss: 4.787e-01, Train Accuracy: 0.781
Epoch: 1, Step: 1000, Train Loss: 5.047e-01, Train Accuracy: 0.781
Epoch: 1, Step: 1100, Train Loss: 1.758e-01, Train Accuracy: 0.938
Epoch: 1, Step: 1200, Train Loss: 1.091e-01, Train Accuracy: 0.969
Epoch: 1, Step: 1300, Train Loss: 1.455e-01, Train Accuracy: 0.938
epoch: 1	 train_loss: 2.988e-01	 train_accuracy:0.881	 val_loss: 2.708e-01	 val_accuracy:0.893
Epoch: 2,

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
train_accuracy,▁▇█
train_accuracy_step,▂▁▇▆▄▄▆▆▃▃▆▇▆▃▃▆▅▆█▄▆▃▃█▇▆▆▆▆▆▆▅▆▃▅█▆▄▄▄
train_loss,█▂▁
train_loss_step,█▇▃▄▄▃▄▂▆▇▂▂▂▇▆▄▄▃▁▅▃▄▅▂▃▄▃▂▂▂▄▄▃▆▇▁▂▄▆▄
val_accuracy,▁▆█
val_loss,█▁▁

0,1
epoch,3.0
global_step,4100.0
train_accuracy,0.90056
train_accuracy_step,0.84375
train_loss,0.25759
train_loss_step,0.33301
val_accuracy,0.90166
val_loss,0.25353



Model Selection Results:
Model      Validation Accuracy  Validation Loss
---------------------------------------------
BERT       0.8966                0.2733
ModernBERT 0.9017                0.2535


In [None]:

# 1. Find the model with the highest validation accuracy in task1_results
best_result = max(task1_results, key=lambda x: x['val_accuracy'])

# 2. Extract the model, model_name, and tokenizer from the best result

# 3. Print information about which model will be used for further tasks


best_model = best_result['model']
best_model_name = best_result['display_name']
best_tokenizer = best_result['tokenizer']

print(f"\nBest model from Task 1: {best_model_name}")
print(f"Using {best_model_name} for further tasks")


Best model from Task 1: ModernBERT
Using ModernBERT for further tasks


##  Hyperparameter Tuning

Now that we have selected the best model architecture, let's tune its hyperparameters to optimize performance.

In this task, you'll experiment with different hyperparameter configurations to find the best model. You should explore variations in:

- **Hidden layer sizes**: Try different sizes for the hidden layers in your classifier (e.g., 64, 128, 256, 512)
- **Number of hidden layers**: Experiment with adding more layers to your classifier (e.g., 1, 2, 3 layers)
- **Batch sizes**: Test different batch sizes (e.g., 16, 32, 64) - note that larger batch sizes may cause memory issues
- **Learning rates**: Try different learning rates (e.g., 1e-3, 5e-4, 1e-4)
- **Freezing base parameters**: Experiment with keeping the whole base model frozen vs unfreezing just the top 5 layers of the base model.
- **Optimizer**: You can try different optimizers like Adam, AdamW, or SGD

You should run at least 5 different hyperparameter configurations and track their performance using wandb. Below is a template for setting up your experiments.


In [None]:
# Define hyperparameters configurations to test

hp_configs = [
    # Configuration 1 (baseline)
    {
        "config_name": "Baseline",
        "hidden_size": 128,
        "num_layers": 1,
        "batch_size": 32,
        "optimizer": "Adam",
        "learning_rate": 1e-3,
        "weight_decay": 1e-4,
        "freeze_base": True,
        "num_epochs": 5
    },

    # Configuration 2 
    {
        "config_name": "Config 2",
        "hidden_size": 512,
        "num_layers": 3,
        "batch_size": 16,
        "optimizer": "Adam",
        "freeze_base": False,
        "learning_rate": 5e-4,
        "weight_decay": 1e-4,  
        "num_epochs": 5        
    },

    # Configuration 3 
    {
        "config_name": "Config 3",
        "hidden_size": 256,
        "num_layers": 2,
        "batch_size": 32,
        "optimizer": "AdamW",
        "freeze_base": True,
        "learning_rate": 2e-5,
        "weight_decay": 1e-4,  
        "num_epochs": 5        
    },

    # Configuration 4 
    {
        "config_name": "Config 4",
        "hidden_size": 128,
        "num_layers": 1,
        "batch_size": 128,
        "optimizer": "Adam",
        "freeze_base": True,
        "learning_rate": 2e-3,
        "weight_decay": 1e-4,  
        "num_epochs": 5        
    },

    # Configuration 5 
    {
        "config_name": "Config 5",
        "hidden_size": 256,
        "num_layers": 2,
        "batch_size": 64,
        "optimizer": "AdamW",
        "freeze_base": True,
        "learning_rate": 1e-4,
        "weight_decay": 1e-4,  
        "num_epochs": 5        
    },

    # Configuration 6 
    {
        "config_name": "Config 6",
        "hidden_size": 384,
        "num_layers": 2,
        "batch_size": 32,
        "optimizer": "Adam",
        "freeze_base": False,
        "learning_rate": 8e-4,
        "weight_decay": 1e-4,  
        "num_epochs": 5       }
]


In [None]:
def run_hyperparameter_tuning(model_name, base_tokenizer):
    """
    Run hyperparameter tuning experiments

    Args:
        model_name: Name of the model to use
        base_tokenizer: Tokenizer for the model

    Returns:
        List of experiment results
    """
    print(f"\n{'='*50}")
    print(f"Running Hyperparameter Tuning for {model_name}")
    print(f"{'='*50}")

    results = []
    best_val_acc = 0
    best_config_idx = 0
    best_model = None

    # For each configuration in hp_configs list
    for i, config in enumerate(hp_configs):
        print(f"\nRunning experiment {i+1}/{len(hp_configs)}: {config['config_name']}")

        # Initialize wandb for this experiment
        wandb_config = {**config, "model_name": model_name}
        init_wandb(wandb_config, "clickbait-detection-task2")

        # Create model with this configuration
        model = get_model(
            model_name=model_name,
            num_classes=2,
            freeze_base=config["freeze_base"],
            hidden_size=config["hidden_size"],
            num_layers=config["num_layers"]
        )

        # Train model
        model, logs = train(
            model=model,
            train_dataset=dataset['train'],
            val_dataset=dataset['validation'],
            num_epochs=config["num_epochs"],
            batch_size=config["batch_size"],
            optimizer_cls=config["optimizer"],
            lr=config["learning_rate"],
            weight_decay=config["weight_decay"],
            device=device,
            tokenizer=base_tokenizer,
            use_wandb=True
        )

        # Evaluate on validation set
        val_loss, val_acc, _, _, _ = evaluate(
            model=model,
            dataset=dataset['validation'],
            batch_size=config["batch_size"],
            device=device,
            tokenizer=base_tokenizer
        )

        # Log final validation metrics
        wandb.log({
            "final_val_loss": val_loss,
            "final_val_accuracy": val_acc
        })

        # Finish wandb run
        wandb.finish()

        # Record results
        results.append({
            "config_name": config["config_name"],
            "hidden_size": config["hidden_size"],
            "num_layers": config["num_layers"],
            "batch_size": config["batch_size"],
            "learning_rate": config["learning_rate"],
            "weight_decay": config["weight_decay"],
            "optimizer": config["optimizer"],
            "val_loss": val_loss,
            "val_accuracy": val_acc,
            "model": model
        })

        # Keep track of best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_config_idx = i
            best_model = model

    # Display results in a table
    print("\nHyperparameter Tuning Results:")
    print("-" * 120)
    print(f"{'Config':<10} {'Hidden Size':<12} {'Layers':<8} {'Batch Size':<12} {'Learning Rate':<14} {'Weight Decay':<14} {'Optimizer':<10} {'Val Accuracy':<15}")
    print("-" * 120)

    for result in results:
        print(f"{result['config_name']:<10} {result['hidden_size']:<12} {result['num_layers']:<8} {result['batch_size']:<12} {result['learning_rate']:<14} {result['weight_decay']:<14} {result['optimizer']:<10} {result['val_accuracy']:.4f}")

    print("-" * 120)
    print(f"Best configuration: {results[best_config_idx]['config_name']} with validation accuracy: {best_val_acc:.4f}")

    return results, best_model, best_config_idx

In [24]:
# Run Task 2: Hyperparameter Tuning
# Run all hyperparameter experiments
best_model_name = 'answerdotai/ModernBERT-base'
tuning_results, best_tuned_model, best_config_idx = run_hyperparameter_tuning(best_model_name, best_tokenizer)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Running Hyperparameter Tuning for answerdotai/ModernBERT-base

Running experiment 1/6: Baseline


Epoch: 1, Step: 100, Train Loss: 4.093e-01, Train Accuracy: 0.844
Epoch: 1, Step: 200, Train Loss: 2.605e-01, Train Accuracy: 0.875
Epoch: 1, Step: 300, Train Loss: 3.315e-01, Train Accuracy: 0.875
Epoch: 1, Step: 400, Train Loss: 2.737e-01, Train Accuracy: 0.938
Epoch: 1, Step: 500, Train Loss: 1.764e-01, Train Accuracy: 0.969
Epoch: 1, Step: 600, Train Loss: 2.086e-01, Train Accuracy: 0.906
Epoch: 1, Step: 700, Train Loss: 1.704e-01, Train Accuracy: 0.969
Epoch: 1, Step: 800, Train Loss: 2.068e-01, Train Accuracy: 0.906
Epoch: 1, Step: 900, Train Loss: 3.047e-01, Train Accuracy: 0.844
Epoch: 1, Step: 1000, Train Loss: 3.256e-01, Train Accuracy: 0.906
Epoch: 1, Step: 1100, Train Loss: 1.592e-01, Train Accuracy: 0.969
Epoch: 1, Step: 1200, Train Loss: 3.329e-01, Train Accuracy: 0.875
Epoch: 1, Step: 1300, Train Loss: 2.820e-01, Train Accuracy: 0.906
epoch: 1	 train_loss: 2.995e-01	 train_accuracy:0.882	 val_loss: 2.771e-01	 val_accuracy:0.889
Epoch: 2, Step: 1400, Train Loss: 3.119e-01

0,1
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆█████████
final_val_accuracy,▁
final_val_loss,▁
global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train_accuracy,▁▅▇██
train_accuracy_step,▃▄▄▅▅▄▅▇▃▇▄▁▁▄▅▂▆▄▆▄▄▄▄▃█▆▄▄▆▄▅▃▅▇▅▄▂▆▄▄
train_loss,█▄▂▂▁
train_loss_step,▆▄▄▃▃▃▅▅▂▅▅▂▃▂▅▆▄▄▆██▆▃▄▃▁▄█▂▂▅▃▅▄▅▅▆▂▃▃
val_accuracy,▁█▇█▅
val_loss,█▁▁▁▄

0,1
epoch,5.0
final_val_accuracy,0.89577
final_val_loss,0.26298
global_step,6800.0
train_accuracy,0.90568
train_accuracy_step,0.875
train_loss,0.24475
train_loss_step,0.21955
val_accuracy,0.89577
val_loss,0.26298


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Running experiment 2/6: Config 2


Unfroze the last 5 layers out of 22 total layers.
Epoch: 1, Step: 100, Train Loss: 5.473e-01, Train Accuracy: 0.688
Epoch: 1, Step: 200, Train Loss: 2.790e-01, Train Accuracy: 0.938
Epoch: 1, Step: 300, Train Loss: 4.257e-01, Train Accuracy: 0.938
Epoch: 1, Step: 400, Train Loss: 4.118e-01, Train Accuracy: 0.875
Epoch: 1, Step: 500, Train Loss: 3.524e-01, Train Accuracy: 0.875
Epoch: 1, Step: 600, Train Loss: 1.811e-01, Train Accuracy: 0.875
Epoch: 1, Step: 700, Train Loss: 1.238e-01, Train Accuracy: 0.938
Epoch: 1, Step: 800, Train Loss: 1.998e-01, Train Accuracy: 0.875
Epoch: 1, Step: 900, Train Loss: 2.880e-01, Train Accuracy: 0.875
Epoch: 1, Step: 1000, Train Loss: 7.005e-02, Train Accuracy: 0.938
Epoch: 1, Step: 1100, Train Loss: 3.176e-01, Train Accuracy: 0.938
Epoch: 1, Step: 1200, Train Loss: 4.648e-01, Train Accuracy: 0.812
Epoch: 1, Step: 1300, Train Loss: 9.019e-02, Train Accuracy: 1.000
Epoch: 1, Step: 1400, Train Loss: 2.994e-01, Train Accuracy: 0.875
Epoch: 1, Step: 1500,

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▃▃▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆█████
final_val_accuracy,▁
final_val_loss,▁
global_step,▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇████
train_accuracy,▁▄▆▇█
train_accuracy_step,▆▆▃█▆▆██▁▆▆▃▆▆▆█▆▃▆▆▃█▃█▆▆█▃█▁▁█▃█▆█▆▁▃█
train_loss,█▅▃▂▁
train_loss_step,█▅▃▃▃▂▅▃▄▆▃▄▄▅▄█▃▅▂▅▅▂▆▁▂▇▂▆▄▅█▂▆▄▂▄▂▄▂▃
val_accuracy,▄▁▂█▂
val_loss,█▅▅▁▇

0,1
epoch,5.0
final_val_accuracy,0.90827
final_val_loss,0.23648
global_step,13600.0
train_accuracy,0.92857
train_accuracy_step,1.0
train_loss,0.1822
train_loss_step,0.06915
val_accuracy,0.90827
val_loss,0.23648


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Running experiment 3/6: Config 3


Epoch: 1, Step: 100, Train Loss: 7.049e-01, Train Accuracy: 0.469
Epoch: 1, Step: 200, Train Loss: 4.139e-01, Train Accuracy: 0.906
Epoch: 1, Step: 300, Train Loss: 4.832e-01, Train Accuracy: 0.875
Epoch: 1, Step: 400, Train Loss: 3.730e-01, Train Accuracy: 0.906
Epoch: 1, Step: 500, Train Loss: 3.548e-01, Train Accuracy: 0.781
Epoch: 1, Step: 600, Train Loss: 2.887e-01, Train Accuracy: 0.875
Epoch: 1, Step: 700, Train Loss: 2.835e-01, Train Accuracy: 0.906
Epoch: 1, Step: 800, Train Loss: 2.769e-01, Train Accuracy: 0.875
Epoch: 1, Step: 900, Train Loss: 3.454e-01, Train Accuracy: 0.875
Epoch: 1, Step: 1000, Train Loss: 3.618e-01, Train Accuracy: 0.812
Epoch: 1, Step: 1100, Train Loss: 3.907e-01, Train Accuracy: 0.781
Epoch: 1, Step: 1200, Train Loss: 2.479e-01, Train Accuracy: 0.906
Epoch: 1, Step: 1300, Train Loss: 1.870e-01, Train Accuracy: 0.969
epoch: 1	 train_loss: 3.891e-01	 train_accuracy:0.837	 val_loss: 3.152e-01	 val_accuracy:0.872
Epoch: 2, Step: 1400, Train Loss: 2.837e-01

0,1
epoch,▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
final_val_accuracy,▁
final_val_loss,▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇████
train_accuracy,▁▆▇██
train_accuracy_step,▁▇▇▇▇▇███▆██▇▇█▇▇▇▇█▇▇▇▆▇▆█▇██▇▆▇▇▆█▅▆▇▇
train_loss,█▃▂▁▁
train_loss_step,█▅▄▄▃▂▂▄▂▄▁▂▂▂▂▂▄▂▂▃▃▄▂▃▂▁▄▁▂▂▃▄▃▁▄▂▁▃▂▃
val_accuracy,▁▁█▇█
val_loss,█▆▃▂▁

0,1
epoch,5.0
final_val_accuracy,0.88943
final_val_loss,0.26758
global_step,6800.0
train_accuracy,0.89628
train_accuracy_step,0.90625
train_loss,0.267
train_loss_step,0.28062
val_accuracy,0.88943
val_loss,0.26758


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Running experiment 4/6: Config 4


Epoch: 1, Step: 100, Train Loss: 3.084e-01, Train Accuracy: 0.898
Epoch: 1, Step: 200, Train Loss: 3.420e-01, Train Accuracy: 0.859
Epoch: 1, Step: 300, Train Loss: 2.110e-01, Train Accuracy: 0.930
epoch: 1	 train_loss: 3.123e-01	 train_accuracy:0.875	 val_loss: 2.602e-01	 val_accuracy:0.898
Epoch: 2, Step: 400, Train Loss: 3.537e-01, Train Accuracy: 0.883
Epoch: 2, Step: 500, Train Loss: 2.079e-01, Train Accuracy: 0.938
Epoch: 2, Step: 600, Train Loss: 2.732e-01, Train Accuracy: 0.891
epoch: 2	 train_loss: 2.644e-01	 train_accuracy:0.898	 val_loss: 2.566e-01	 val_accuracy:0.896
Epoch: 3, Step: 700, Train Loss: 2.822e-01, Train Accuracy: 0.875
Epoch: 3, Step: 800, Train Loss: 2.406e-01, Train Accuracy: 0.906
Epoch: 3, Step: 900, Train Loss: 2.138e-01, Train Accuracy: 0.938
Epoch: 3, Step: 1000, Train Loss: 1.652e-01, Train Accuracy: 0.945
epoch: 3	 train_loss: 2.555e-01	 train_accuracy:0.901	 val_loss: 2.453e-01	 val_accuracy:0.906
Epoch: 4, Step: 1100, Train Loss: 2.984e-01, Train Acc

0,1
epoch,▁▁▁▁▃▃▃▃▅▅▅▅▅▆▆▆▆█████
final_val_accuracy,▁
final_val_loss,▁
global_step,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
train_accuracy,▁▇▇██
train_accuracy_step,▄▁▇▃▇▄▂▅▇█▄▄▂▅▄▅▇
train_loss,█▃▂▁▁
train_loss_step,▆█▃█▃▅▅▄▃▁▆▄▆▃▄▄▄
val_accuracy,▂▁▇█▆
val_loss,█▇▂▁▂

0,1
epoch,5.0
final_val_accuracy,0.90472
final_val_loss,0.24354
global_step,1700.0
train_accuracy,0.90432
train_accuracy_step,0.92969
train_loss,0.24654
train_loss_step,0.23768
val_accuracy,0.90472
val_loss,0.24354


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Running experiment 5/6: Config 5


Epoch: 1, Step: 100, Train Loss: 4.817e-01, Train Accuracy: 0.812
Epoch: 1, Step: 200, Train Loss: 3.799e-01, Train Accuracy: 0.828
Epoch: 1, Step: 300, Train Loss: 3.112e-01, Train Accuracy: 0.891
Epoch: 1, Step: 400, Train Loss: 1.875e-01, Train Accuracy: 0.953
Epoch: 1, Step: 500, Train Loss: 2.163e-01, Train Accuracy: 0.938
Epoch: 1, Step: 600, Train Loss: 3.074e-01, Train Accuracy: 0.906
epoch: 1	 train_loss: 3.396e-01	 train_accuracy:0.863	 val_loss: 2.820e-01	 val_accuracy:0.883
Epoch: 2, Step: 700, Train Loss: 3.181e-01, Train Accuracy: 0.844
Epoch: 2, Step: 800, Train Loss: 3.654e-01, Train Accuracy: 0.875
Epoch: 2, Step: 900, Train Loss: 3.766e-01, Train Accuracy: 0.844
Epoch: 2, Step: 1000, Train Loss: 3.602e-01, Train Accuracy: 0.906
Epoch: 2, Step: 1100, Train Loss: 1.983e-01, Train Accuracy: 0.906
Epoch: 2, Step: 1200, Train Loss: 2.563e-01, Train Accuracy: 0.906
Epoch: 2, Step: 1300, Train Loss: 3.797e-01, Train Accuracy: 0.844
epoch: 2	 train_loss: 2.752e-01	 train_accu

0,1
epoch,▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
final_val_accuracy,▁
final_val_loss,▁
global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train_accuracy,▁▆▇██
train_accuracy_step,▁▂▅▇▇▅▂▄▂▅▅▅▂▆▅▄▅▅▃▆▄▆▂▄▄▃▄▄██▆▅▅▄
train_loss,█▃▂▁▁
train_loss_step,█▆▅▃▃▅▅▆▆▆▃▄▆▂▄▄▄▄▅▄▄▂▅▄▃▄▅▅▁▁▃▃▄▅
val_accuracy,▁▆▇█▇
val_loss,█▅▄▁▃

0,1
epoch,5.0
final_val_accuracy,0.89988
final_val_loss,0.25529
global_step,3400.0
train_accuracy,0.9053
train_accuracy_step,0.875
train_loss,0.24497
train_loss_step,0.32453
val_accuracy,0.89988
val_loss,0.25529


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc



Running experiment 6/6: Config 6


Unfroze the last 5 layers out of 22 total layers.
Epoch: 1, Step: 100, Train Loss: 1.910e-01, Train Accuracy: 0.938
Epoch: 1, Step: 200, Train Loss: 3.633e-01, Train Accuracy: 0.875
Epoch: 1, Step: 300, Train Loss: 2.174e-01, Train Accuracy: 0.938
Epoch: 1, Step: 400, Train Loss: 2.679e-01, Train Accuracy: 0.906
Epoch: 1, Step: 500, Train Loss: 2.001e-01, Train Accuracy: 0.969
Epoch: 1, Step: 600, Train Loss: 7.827e-02, Train Accuracy: 1.000
Epoch: 1, Step: 700, Train Loss: 1.167e-01, Train Accuracy: 1.000
Epoch: 1, Step: 800, Train Loss: 2.795e-01, Train Accuracy: 0.875
Epoch: 1, Step: 900, Train Loss: 2.278e-01, Train Accuracy: 0.938
Epoch: 1, Step: 1000, Train Loss: 1.819e-01, Train Accuracy: 0.938
Epoch: 1, Step: 1100, Train Loss: 2.109e-01, Train Accuracy: 0.906
Epoch: 1, Step: 1200, Train Loss: 1.507e-01, Train Accuracy: 0.938
Epoch: 1, Step: 1300, Train Loss: 2.233e-01, Train Accuracy: 0.938
epoch: 1	 train_loss: 2.475e-01	 train_accuracy:0.908	 val_loss: 2.402e-01	 val_accuracy

0,1
epoch,▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆█████████
final_val_accuracy,▁
final_val_loss,▁
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train_accuracy,▁▄▆▇█
train_accuracy_step,▆▃▆▇█▃▆▆▅▆▁▆▆▅▁▂▇▇▃▆▅▆▃▅▇▇▆▆▂█▆█▆▃▆▆▅▇▇▃
train_loss,█▅▃▂▁
train_loss_step,▃▃▂▅▄▄▃▆▄▄▇█▄▃▃▂▄▂▃▆▂▅▂▅▂▆▃▅▃▃▃▁▃▃▄▃▃▃▁▃
val_accuracy,▂█▁▃█
val_loss,█▁█▃▆

0,1
epoch,5.0
final_val_accuracy,0.90845
final_val_loss,0.23733
global_step,6800.0
train_accuracy,0.92879
train_accuracy_step,0.875
train_loss,0.18423
train_loss_step,0.17457
val_accuracy,0.90845
val_loss,0.23733



Hyperparameter Tuning Results:
------------------------------------------------------------------------------------------------------------------------
Config     Hidden Size  Layers   Batch Size   Learning Rate  Weight Decay   Optimizer  Val Accuracy   
------------------------------------------------------------------------------------------------------------------------
Baseline   128          1        32           0.001          0.0001         Adam       0.8958
Config 2   512          3        16           0.0005         0.0001         Adam       0.9083
Config 3   256          2        32           2e-05          0.0001         AdamW      0.8894
Config 4   128          1        128          0.002          0.0001         Adam       0.9047
Config 5   256          2        64           0.0001         0.0001         AdamW      0.8999
Config 6   384          2        32           0.0008         0.0001         Adam       0.9085
-----------------------------------------------------------

##  Final Evaluation and Error Analysis


1. Train the best model configuration from Task 2 on the combined training and validation data
2. Evaluate this final model on the test set
3. Generate predictions for the provided held-out test set
4. Perform detailed error analysis to understand the model's strengths and weaknesses

In [None]:

# Replace parameters with those from  best configuration 
best_config = {
    "hidden_size":384 ,   
    "num_layers": 2,       
    "batch_size": 32,      
    "optimizer": "Adam",   
    "learning_rate": 8e-4, 
    "weight_decay": 1e-4,
    "freeze_base": False,    
    "num_epochs": 1
}

In [26]:
print("\nTraining final model with best configuration...")
final_model = get_model(
    model_name=best_model_name,
    num_classes=2,
    freeze_base=best_config["freeze_base"],
    hidden_size=best_config["hidden_size"],
    num_layers=best_config["num_layers"]
)


Training final model with best configuration...
Unfroze the last 5 layers out of 22 total layers.


In [27]:
final_model, _ = train(
    model=final_model,
    train_dataset=dataset['train'],
    val_dataset=dataset['validation'],
    num_epochs=best_config["num_epochs"],
    batch_size=best_config["batch_size"],
    optimizer_cls=best_config["optimizer"],
    lr=best_config["learning_rate"],
    weight_decay=best_config["weight_decay"],
    device=device,
    tokenizer=best_tokenizer,
    use_wandb=False
)

Epoch: 1, Step: 100, Train Loss: 2.865e-01, Train Accuracy: 0.875
Epoch: 1, Step: 200, Train Loss: 3.345e-01, Train Accuracy: 0.906
Epoch: 1, Step: 300, Train Loss: 3.551e-01, Train Accuracy: 0.875
Epoch: 1, Step: 400, Train Loss: 1.288e-01, Train Accuracy: 0.969
Epoch: 1, Step: 500, Train Loss: 2.033e-01, Train Accuracy: 0.938
Epoch: 1, Step: 600, Train Loss: 1.679e-01, Train Accuracy: 0.938
Epoch: 1, Step: 700, Train Loss: 2.392e-01, Train Accuracy: 0.906
Epoch: 1, Step: 800, Train Loss: 2.109e-01, Train Accuracy: 0.875
Epoch: 1, Step: 900, Train Loss: 2.334e-01, Train Accuracy: 0.906
Epoch: 1, Step: 1000, Train Loss: 2.780e-01, Train Accuracy: 0.906
Epoch: 1, Step: 1100, Train Loss: 1.648e-01, Train Accuracy: 0.938
Epoch: 1, Step: 1200, Train Loss: 2.269e-01, Train Accuracy: 0.938
Epoch: 1, Step: 1300, Train Loss: 1.634e-01, Train Accuracy: 0.938
epoch: 1	 train_loss: 2.508e-01	 train_accuracy:0.906	 val_loss: 2.308e-01	 val_accuracy:0.905


In [28]:
print("\nEvaluating final model on test set...")
test_loss, test_acc, test_preds, test_labels, test_logits = evaluate(
    final_model,
    dataset['test'],
    batch_size=32,
    device=device,
    tokenizer=best_tokenizer
)

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")



Evaluating final model on test set...
Test Accuracy: 0.9120
Test Loss: 0.2250


In [30]:
# Generate predictions for the held-out test set (test-DIST.json)
test_file_path = '/kaggle/input/second-test-data-dist/second-test-data-DIST.json'
if os.path.exists(test_file_path):
    print(f"\nFound test file: {test_file_path}")
    print("Generating predictions for the held-out test set...")

    predict_on_test_set(
        model=final_model,
        tokenizer=best_tokenizer,
        test_file_path=test_file_path,
        output_file_path='test-results.txt',
        device=device
    )

    # Verify the test-results.txt file was created successfully
    if os.path.exists('test-results.txt'):
        with open('test-results.txt', 'r') as f:
            predictions = f.read().strip().split('\n')
        print(f"SUCCESS: Created test-results.txt with {len(predictions)} predictions")
        print(f"Sample predictions (first 5): {predictions[:5] if len(predictions) >= 5 else predictions}")
    else:
        print("ERROR: Failed to create test-results.txt. Please check for errors.")
else:
    print(f"\nERROR: Test file {test_file_path} not found!")
    print("You need this file for your submission. Please make sure it's in your working directory.")
    print("If you're working in Colab, upload the test-DIST.json file to your session.")



Found test file: /kaggle/input/second-test-data-dist/second-test-data-DIST.json
Generating predictions for the held-out test set...
Loaded 882 examples from /kaggle/input/second-test-data-dist/second-test-data-DIST.json
Predictions saved to test-results.txt
SUCCESS: Created test-results.txt with 882 predictions
Sample predictions (first 5): ['0', '1', '1', '1', '1']


## Error Analysis


In [None]:
#  Look at individual examples in the valid split of the dataset

In [51]:
def analyze_validation_examples(model, validation_dataset, device, tokenizer):
    """
    Analyze the model's performance on the validation set,
    focusing on misclassified examples and error patterns.
    """
    print("Starting error analysis...")
    
    #validation examples to work with
    val_texts = [data['text'] for data in validation_dataset]
    val_labels = [data['label'] for data in validation_dataset]

    model.eval()

    predictions = []
    confidences = []
    probabilities = []
    
    # making predictions on validation set in batches
    batch_size = 32
    for i in range(0, len(val_texts), batch_size):
        batch_texts = val_texts[i:min(i+batch_size, len(val_texts))]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        #  predictions
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)
            conf_values, _ = torch.max(probs, dim=1)
        
        # results
        predictions.extend(preds.cpu().numpy())
        confidences.extend(conf_values.cpu().numpy())
        probabilities.extend(probs.cpu().numpy())
    
    # results df
    results_df = pd.DataFrame({
        "text": val_texts,
        "true_label": val_labels,
        "pred_label": predictions,
        "confidence": confidences,
        "prob_class_0": [p[0] for p in probabilities],
        "prob_class_1": [p[1] for p in probabilities]
    })
    # label predictions
    results_df["correct"] = results_df["true_label"] == results_df["pred_label"]

    # Calculate metrics
    accuracy = results_df["correct"].mean()
    misclassified = results_df[~results_df["correct"]]
    false_positives = misclassified[misclassified["pred_label"] == 1]  # Non-clickbait predicted as clickbait
    false_negatives = misclassified[misclassified["pred_label"] == 0]  # Clickbait predicted as non-clickbait
    
    # Print metrics
    print(f"\nValidation Metrics:")
    print(f"Total examples: {len(results_df)}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Total misclassified: {len(misclassified)} ({len(misclassified)/len(results_df):.4f})")
    print(f"False positives (Non-clickbait → Clickbait): {len(false_positives)}")
    print(f"False negatives (Clickbait → Non-clickbait): {len(false_negatives)}")
    
    # Confusion matrix
    cm = confusion_matrix(results_df["true_label"], results_df["pred_label"])
    print("\nConfusion Matrix:")
    print(cm)
    
    # Classification report
    print("\nClassification Report:")
    print(classification_report(results_df["true_label"], results_df["pred_label"]))
    
    return results_df, misclassified, false_positives, false_negatives

In [52]:
results_df, misclassified, false_positives, false_negatives = analyze_validation_examples(
    model = best_model, 
    validation_dataset = dataset['validation'], 
    device=device, 
    tokenizer=best_tokenizer)

Starting error analysis...

Validation Metrics:
Total examples: 2191
Accuracy: 0.9014
Total misclassified: 216 (0.0986)
False positives (Non-clickbait → Clickbait): 71
False negatives (Clickbait → Non-clickbait): 145

Confusion Matrix:
[[1307   71]
 [ 145  668]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1378
           1       0.90      0.82      0.86       813

    accuracy                           0.90      2191
   macro avg       0.90      0.89      0.89      2191
weighted avg       0.90      0.90      0.90      2191



In [76]:
def analyze_error_patterns(results_df, misclassified, false_positives, false_negatives):
    """
    Analyze patterns in misclassified examples
    """
    print("\n" + "="*50)
    print("Error Pattern Analysis")
    print("="*50)
    
    # text length
    results_df["char_length"] = results_df["text"].apply(len)
    results_df["word_count"] = results_df["text"].apply(lambda x: len(x.split()))
    
    #  text length vs errors
    print("\nText Length Analysis:")
    correct_word_count = results_df[results_df["correct"]]["word_count"].mean()
    incorrect_word_count = results_df[~results_df["correct"]]["word_count"].mean()
    print(f"Average word count for correct predictions: {correct_word_count:.2f}")
    print(f"Average word count for incorrect predictions: {incorrect_word_count:.2f}")
    
   
    # confidence vs errors
    print("\nConfidence Analysis:")
    correct_conf = results_df[results_df["correct"]]["confidence"].mean()
    incorrect_conf = results_df[~results_df["correct"]]["confidence"].mean()
    print(f"Average confidence (correct predictions): {correct_conf:.4f}")
    print(f"Average confidence (incorrect predictions): {incorrect_conf:.4f}")
    
    #  high-confidence errors
    high_conf_threshold = 0.9
    high_conf_errors = misclassified[misclassified["confidence"] >= high_conf_threshold]
    # High confidence false positives (non-clickbait predicted as clickbait)
    high_conf_fp = high_conf_errors[high_conf_errors["pred_label"] == 1].sort_values("confidence", ascending=False)
    
    print(f"\nHigh Confidence False Positives (Non-clickbait as Clickbait, confidence ≥ {high_conf_threshold}):")
    if len(high_conf_fp) > 0:
        for i, row in enumerate(high_conf_fp.head(5).itertuples(), 1):
            print(f"{i}. \"{row.text}\"")
            print(f"   Confidence: {row.confidence:.4f}")
    
    # High confidence false negatives (clickbait predicted as non-clickbait)
    high_conf_fn = high_conf_errors[high_conf_errors["pred_label"] == 0].sort_values("confidence", ascending=False)
    
    print(f"\nHigh Confidence False Negatives (Clickbait as Non-clickbait, confidence ≥ {high_conf_threshold}):")
    if len(high_conf_fn) > 0:
        for i, row in enumerate(high_conf_fn.head(5).itertuples(), 1):
            print(f"{i}. \"{row.text}\"")
            print(f"   Confidence: {row.confidence:.4f}")

    def extract_words(texts):
        words = []
        for text in texts:
            tokens = re.findall(r'\b\w+\b', text.lower())
            words.extend([token for token in tokens if len(token) > 2])  # Filter for very short words
        return Counter(words)
    
    fp_words = extract_words(false_positives["text"])
    fn_words = extract_words(false_negatives["text"])
    
    # common errors
    print("\nMost common words in False Positives (Non-clickbait as Clickbait):")
    for word, count in fp_words.most_common(10):
        print(f"  '{word}': {count}")
    
    print("\nMost common words in False Negatives (Clickbait as Non-clickbait):")
    for word, count in fn_words.most_common(10):
        print(f"  '{word}': {count}")

        
     # word Analysis
    results_df["first_word"] = results_df["text"].apply(lambda x: x.split()[0].lower() if len(x.split()) > 0 else "")
    
    # error rates for common starting words
    starting_words = Counter(results_df["first_word"])
    common_starters = [word for word, count in starting_words.items() if count >= 10]
    
    starter_error_rates = {}
    for word in common_starters:
        word_examples = results_df[results_df["first_word"] == word]
        error_rate = 1 - word_examples["correct"].mean()
        starter_error_rates[word] = (error_rate, len(word_examples))
    
    # starting words with high error rates
    print("\nStarting words with highest error rates:")
    sorted_starters = sorted(starter_error_rates.items(), key=lambda x: x[1][0], reverse=True)
    for word, (error_rate, count) in sorted_starters[:5]:
        print(f"  '{word}': {error_rate:.4f} error rate (from {count} examples)")    
    return starter_error_rates

In [77]:
starter_error_rates = analyze_error_patterns(results_df, misclassified, false_positives, false_negatives)


Error Pattern Analysis

Text Length Analysis:
Average word count for correct predictions: 10.01
Average word count for incorrect predictions: 11.24

Confidence Analysis:
Average confidence (correct predictions): 0.9293
Average confidence (incorrect predictions): 0.7798

High Confidence False Positives (Non-clickbait as Clickbait, confidence ≥ 0.9):
1. "5 Andra Day Songs You Should Know"
   Confidence: 0.9995
2. "7 Dinners Under $10 You Should Make This Week"
   Confidence: 0.9959
3. "How Bethpage Black Was Mastered (For a Day) By a Club Pro"
   Confidence: 0.9956
4. "How To Beat The DEATH GAME In Funhouse"
   Confidence: 0.9942
5. "Watch This Guy Dynamite A Christmas Tree Just In Time For The Holidays"
   Confidence: 0.9931

High Confidence False Negatives (Clickbait as Non-clickbait, confidence ≥ 0.9):
1. "Francois Hollande Will Not Seek Re-Election as French President"
   Confidence: 0.9896
2. "Germany hunts possible accomplices of Berlin suspect, arrests in Tunisia"
   Confidence: 