# Overview
This notebook contains the starter code for the 11-411/11-611 Clickbait Detection homework. It has minimal text so you can easily copy it to **handin.py** when you submit.  Please read all the comments in the code as there is important information in them.

In [None]:
# Install the core numerical dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook afterwards.
%pip install torch numpy

In [1]:
#This code block has just standard setup code for running in Python

# Import PyTorch
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
import numpy as np

# Fix the random seed for reproducibility.
# A single named constant keeps the torch (CPU), torch (CUDA) and NumPy
# generators on the same seed and gives later cells one value to re-use.
SEED = 8942764
torch.random.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

# Pick the fastest backend available on this machine so that the same code
# runs unchanged on Colab, on an Apple Silicon laptop and on a plain CPU box:
#   'cuda:0' - Colab or a machine with access to an Nvidia GPU
#   'mps'    - Apple Silicon machine with a GPU (this should be about 3-4 times
#              faster than running on a plain CPU)
#   'cpu'    - everything else
# To force a particular device, assign it explicitly after this block,
# e.g.  device = 'cpu'
_mps_backend = getattr(torch.backends, 'mps', None)  # not present in older PyTorch builds
if torch.cuda.is_available():
  device = 'cuda:0'
elif _mps_backend is not None and _mps_backend.is_available():
  device = 'mps'
else:
  device = 'cpu'

# Note that in handin.py the following install step needs to be removed.
# If you are going to run this on your personal machine, install these packages
# from the shell/terminal instead to update your Python libraries.
# %pip (rather than !pip) installs into the environment of the running kernel.

%pip install transformers datasets

from transformers import AutoTokenizer, BertModel
from datasets import load_dataset




Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
Downloading regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl (284 kB)
Downloading safetensors-0.4.5-cp311-cp311-m

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load the data set from the huggingface repositories
# (the summary displayed below lists three splits - train, validation and test -
# each with a 'label' and a 'text' feature)

dataset = load_dataset("christinacdl/clickbait_notclickbait_dataset")
# a bare name as the last expression makes the notebook display the summary
dataset


Generating train split: 100%|██████████| 43802/43802 [00:00<00:00, 565720.62 examples/s]
Generating validation split: 100%|██████████| 2191/2191 [00:00<00:00, 478382.10 examples/s]
Generating test split: 100%|██████████| 8760/8760 [00:00<00:00, 901602.45 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 43802
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 2191
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 8760
    })
})

In [3]:
# initialize pretrained BERT tokenizer
# NOTE: the fine-tuning cell builds the BERT model from the same checkpoint name
# ('bert-base-uncased'); keep the two in sync so the token ids produced here
# match the vocabulary of the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [4]:
# Optional: peek at one raw example from each split (one example per line).
print(
    *(dataset[split][row] for split, row in (("train", 8), ("validation", 6), ("test", 0))),
    sep='\n',
)

{'label': 1, 'text': '15 Things You Never Noticed About Owning A Cat'}
{'label': 1, 'text': 'It\'s Been 9 Years Since J.T. Died On "Degrassi" And We\'re Still Not Over It'}
{'label': 1, 'text': 'ABC News quiz: How much can you recall?'}


In [5]:
# The dataset has 3 splits (train, validation and test); every example carries
# a text and a label, and examples can be accessed like a Python dict.

# Take one training headline and tokenize it once.
sample_text = dataset['train'][8]['text']
sample_tokens = tokenizer.tokenize(sample_text)

# The original sentence.
print('Original: ', sample_text)

# The sentence split into tokens.
print('Tokenized: ', sample_tokens)

# The tokens mapped to their ids in the tokenizer's vocabulary.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

Original:  15 Things You Never Noticed About Owning A Cat
Tokenized:  ['15', 'things', 'you', 'never', 'noticed', 'about', 'owning', 'a', 'cat']
Token IDs:  [2321, 2477, 2017, 2196, 4384, 2055, 19273, 1037, 4937]


In [6]:
#code for tokenizing training data
def tokenize(batch):
  '''
  Collate a list of raw examples into one tokenized batch.

  args:
  - batch (list of dict): examples of the form
      [
        {'text': <str>, 'label': <int>, ...},
        ...
      ]
    Only the 'text' and 'label' keys are read.

  returns:
  - dict holding every entry produced by the module-level `tokenizer`
    (called with padding=True, truncation=True, return_tensors="pt")
    plus a 'label' entry: a torch.LongTensor with one label per example.
  '''
  texts = [example['text'] for example in batch]
  label_tensor = torch.LongTensor([example['label'] for example in batch])

  # Pad to the longest text in this batch and return PyTorch tensors.
  encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

  collated = dict(encoded)
  collated['label'] = label_tensor
  return collated

In [7]:
# This cell defines evaluate(), which scores a trained model on a dataset, and
# train(), which fits the model and calls evaluate() on the validation set after every epoch.
# You probably should not be making any changes to this code.
# During training, it will print some progress messages.

@torch.no_grad()
def evaluate(model, dataset, batch_size, device, collate_fn=None):
  '''
  Compute the mean NLL loss and the accuracy of `model` over `dataset`.

  Both metrics are aggregated per example rather than per batch, so a smaller
  final batch does not get more weight than the full-sized ones.

  args:
  - model (nn.Module): called as model(**batch) with the 'label' entry removed;
                       its output is passed to nn.NLLLoss together with the labels.
  - dataset: anything a torch DataLoader can iterate; every collated batch must be
             a dict containing a 'label' tensor (non-tensor entries are dropped).
  - batch_size (int): number of examples per batch.
  - device: device the model and the batch tensors are moved to.
  - collate_fn (callable): [default None] forwarded to the DataLoader.

  returns:
  - (loss, accuracy) as numpy float64 scalars; both are nan for an empty dataset.

  The model is left in eval mode and on `device` when this returns.
  '''
  model = model.eval().to(device)
  dataloader = DataLoader(dataset, batch_size, shuffle=False, collate_fn=collate_fn)
  # Summed (not averaged) per batch so the totals can be normalised once at the end.
  lossfn = nn.NLLLoss(reduction='sum')

  total_loss = 0.0
  total_correct = 0
  num_examples = 0
  for batch in dataloader:
    batch = {k:v.to(device) for k,v in batch.items() if isinstance(v, torch.Tensor)}
    y = batch.pop('label')

    logits = model(**batch)
    total_loss += lossfn(logits, y).item()
    total_correct += (logits.argmax(1) == y).sum().item()
    num_examples += y.numel()

  if num_examples == 0:
    # Nothing to average over.
    return np.float64('nan'), np.float64('nan')
  # numpy scalars keep `.item()` available to callers.
  return np.float64(total_loss / num_examples), np.float64(total_correct / num_examples)

def train(model,
          train_dataset,
          val_dataset,
          num_epochs,
          batch_size,
          optimizer_cls,
          lr,
          weight_decay,
          device,
          collate_fn=None):
  '''
  Fit `model` on `train_dataset` and evaluate it on `val_dataset` after every epoch.

  args:
  - model (nn.Module): called as model(**batch) with the 'label' entry removed;
                       its output is passed to nn.NLLLoss together with the labels.
  - train_dataset: training data, iterated in shuffled batches.
  - val_dataset: validation data, passed to evaluate() at the end of each epoch.
  - num_epochs (int): number of passes over `train_dataset`.
  - batch_size (int): batch size for both training and validation.
  - optimizer_cls (str): 'SGD' or 'Adam'.
  - lr (float): learning rate.
  - weight_decay (float): weight decay handed to the optimizer.
  - device: device the model and the batch tensors are moved to.
  - collate_fn (callable): [default None] forwarded to the DataLoaders.

  returns:
  - (model, (train_loss_history, train_acc_history, val_loss_history, val_acc_history))
    where each history is a list with one float per epoch.

  raises:
  - ValueError: if `optimizer_cls` is not one of the supported names.
  '''
  optimizer_factories = {'SGD': torch.optim.SGD, 'Adam': torch.optim.Adam}
  if optimizer_cls not in optimizer_factories:
    # Fail early instead of hitting a NameError on `optimizer` mid-training.
    raise ValueError(
        f'Unsupported optimizer_cls {optimizer_cls!r}; expected one of {sorted(optimizer_factories)}')

  model = model.train().to(device)
  dataloader = DataLoader(train_dataset, batch_size, shuffle=True,
                          collate_fn=collate_fn)
  optimizer = optimizer_factories[optimizer_cls](model.parameters(), lr, weight_decay=weight_decay)

  train_loss_history = []
  train_acc_history = []
  val_loss_history = []
  val_acc_history = []

  lossfn = nn.NLLLoss()
  for e in range(num_epochs):
    # evaluate() at the end of the previous epoch switched the model to eval mode;
    # switch back so that every epoch trains under the same mode.
    model.train()

    epoch_loss_history = []
    epoch_acc_history = []
    for i, batch in enumerate(dataloader):
      batch = {k:v.to(device) for k,v in batch.items() if isinstance(v, torch.Tensor)}
      y = batch.pop('label')

      logits = model(**batch)
      loss = lossfn(logits, y)

      pred = logits.argmax(1)
      acc = (pred == y).float().mean()

      epoch_loss_history.append(loss.item())
      epoch_acc_history.append(acc.item())

      # Running averages over the current epoch, reported every 100 batches.
      if (i % 100 == 0):
        print(f'epoch: {e}\t iter: {i}\t train_loss: {np.mean(epoch_loss_history):.3e}\t train_accuracy:{np.mean(epoch_acc_history):.3f}')
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    val_loss, val_acc = evaluate(model, val_dataset, batch_size, device, collate_fn=collate_fn)

    # Store plain Python floats in all four histories.
    train_loss_history.append(float(np.mean(epoch_loss_history)))
    train_acc_history.append(float(np.mean(epoch_acc_history)))
    val_loss_history.append(float(val_loss))
    val_acc_history.append(float(val_acc))
    print(f'epoch: {e}\t train_loss: {train_loss_history[-1]:.3e}\t train_accuracy:{train_acc_history[-1]:.3f}\t val_loss: {val_loss_history[-1]:.3e}\t val_accuracy:{val_acc_history[-1]:.3f}')

  return model, (train_loss_history, train_acc_history, val_loss_history, val_acc_history)


In [8]:
# This code defines the text classification class using BERT.
# The classifier is defined on top of the final layer of BERT.
# The classifier has 1 hidden layer with 128 hidden nodes, though we have found that
# using a smaller number of hidden nodes does not make much difference.

class BertForTextClassification(nn.Module):
  '''Pretrained BERT encoder followed by a small MLP classification head.'''

  def __init__(self, bert_pretrained_config_name, num_classes, freeze_bert=False):
    '''
    Build the encoder and the classification head.
    args:
    - bert_pretrained_config_name (str): model name from huggingface hub
    - num_classes (int): number of classes in the classification task
    - freeze_bert (bool): [default False] If true, gradients are not computed for
                          BERT's parameters; only the head is trainable.
    '''
    super().__init__()

    encoder = BertModel.from_pretrained(bert_pretrained_config_name)
    encoder.requires_grad_(not freeze_bert)
    self.bert = encoder

    # hidden_size -> 128 -> num_classes, ending in log-probabilities
    # (the form nn.NLLLoss expects).
    head_layers = [
        nn.Linear(encoder.config.hidden_size, 128),
        nn.ReLU(),
        nn.Linear(128, num_classes),
        nn.LogSoftmax(dim=-1),
    ]
    self.classifier = nn.Sequential(*head_layers)

  def forward(self, **bert_kwargs):
    '''
    Run BERT on the given keyword arguments and classify its pooled output.
    Returns the output of the head, i.e. log-softmax scores over the last dimension.
    '''
    pooled = self.bert(**bert_kwargs).pooler_output
    return self.classifier(pooled)

In [9]:
# Fine-tuning of the classifier happens in this cell.
# We train with batch size 32 for 5 epochs.
# Every 100 batches you see the running training loss and training accuracy
# of the current epoch.

# A larger batch size may cause memory issues with the GPU, so we suggest you do not increase it.

# At the end of each epoch, you also see validation loss and validation accuracy.
# Change the device as described above if you will not be using a GPU.

# If you run this block multiple times for some reason, uncomment the following lines to set
# the random seed for reproducibility

#torch.random.manual_seed(8942764)
#torch.cuda.manual_seed(8942764)
#np.random.seed(8942764)

bert_cls = BertForTextClassification('bert-base-uncased', 2, freeze_bert=True)

# Only parameters that still require gradients are counted.
trainable_param_count = sum(p.numel() for p in bert_cls.parameters() if p.requires_grad)
print(f'num_trainable_params={trainable_param_count}\n')

bert_cls, bert_cls_logs = train(
    bert_cls,
    dataset['train'],
    dataset['validation'],
    num_epochs=5,
    batch_size=32,
    optimizer_cls='Adam',
    lr=1e-3,
    weight_decay=1e-4,
    device=device,
    collate_fn=tokenize,
)

# Run the test split (from huggingface) through the trained model and report
# test loss and test accuracy.
print('\n')
print('Starting test run')
test_loss, test_acc = evaluate(
    bert_cls,
    dataset['test'],
    batch_size=32,
    device=device,
    collate_fn=tokenize,
)
print(f'Test Complete.\t Test Loss: {test_loss:.3e}\t Test Accuracy: {test_acc:.3f}')


num_trainable_params=98690

epoch: 0	 iter: 0	 train_loss: 7.324e-01	 train_accuracy:0.375
epoch: 0	 iter: 100	 train_loss: 5.528e-01	 train_accuracy:0.714
epoch: 0	 iter: 200	 train_loss: 4.880e-01	 train_accuracy:0.764
epoch: 0	 iter: 300	 train_loss: 4.554e-01	 train_accuracy:0.787
epoch: 0	 iter: 400	 train_loss: 4.356e-01	 train_accuracy:0.800
epoch: 0	 iter: 500	 train_loss: 4.198e-01	 train_accuracy:0.810
epoch: 0	 iter: 600	 train_loss: 4.096e-01	 train_accuracy:0.818
epoch: 0	 iter: 700	 train_loss: 3.986e-01	 train_accuracy:0.824
epoch: 0	 iter: 800	 train_loss: 3.925e-01	 train_accuracy:0.828
epoch: 0	 iter: 900	 train_loss: 3.878e-01	 train_accuracy:0.831
epoch: 0	 iter: 1000	 train_loss: 3.838e-01	 train_accuracy:0.834
epoch: 0	 iter: 1100	 train_loss: 3.790e-01	 train_accuracy:0.836
epoch: 0	 iter: 1200	 train_loss: 3.751e-01	 train_accuracy:0.839
epoch: 0	 iter: 1300	 train_loss: 3.716e-01	 train_accuracy:0.841
epoch: 0	 train_loss: 3.693e-01	 train_accuracy:0.842	 val_l

KeyboardInterrupt: 