In [34]:
!pip install torchmetrics



In [35]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision

In [36]:
FILL_IN = "FILL_IN"

In [37]:
# You can do this or just add the zip file I give you
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
!unzip -o smsspamcollection.zip

--2023-12-07 01:33:07--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip.1’

smsspamcollection.z     [ <=>                ] 198.65K  --.-KB/s    in 0.1s    

2023-12-07 01:33:07 (1.37 MB/s) - ‘smsspamcollection.zip.1’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [38]:
!head -10 SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam	H

In [39]:
# Loop through the data and make a dataframe with two columns
# label is 0/1 if Not Spam / Spam
df = pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=['label', 'text'])

# Map 'ham' to 0 and 'spam' to 1 in the 'label' column
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Display the first 10 rows of the DataFrame
print(df.head(10))

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
5      1  FreeMsg Hey there darling it's been 3 week's n...
6      0  Even my brother is not like to speak with me. ...
7      0  As per your request 'Melle Melle (Oru Minnamin...
8      1  WINNER!! As a valued network customer you have...
9      1  Had your mobile 11 months or more? U R entitle...


In [40]:
text = df.text.values
labels = df.label.values

In [41]:
# Get the tokenizer for BERT, using 'bert-based-uncased'
# Set do_lower_case = True

# Specify the BERT model name
model_name = 'bert-base-uncased'

# Create the tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [42]:
def print_rand_sentence():
    '''Displays the tokens and respective IDs of a random text sample'''
    index = random.randint(0, len(text)-1)

    # Tokenize a random sentence's text and display the token / token id
    # You might need to use "tabulate" on a certain numpy array
    FILL_IN
    random_sentence = text[index]
    # Tokenize the random sentence
    tokens = tokenizer.tokenize(random_sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Display the token and token ID information using tabulate
    table = zip(tokens, token_ids)
    headers = ['Tokens', 'Token IDs']
    print(tabulate(table, headers=headers, tablefmt="fancy_grid"))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ mm       │        3461 │
├──────────┼─────────────┤
│ umm      │       26114 │
├──────────┼─────────────┤
│ ##a      │        2050 │
├──────────┼─────────────┤
│ ask      │        3198 │
├──────────┼─────────────┤
│ va       │       12436 │
├──────────┼─────────────┤
│ ##va     │        3567 │
├──────────┼─────────────┤
│ also     │        2036 │
├──────────┼─────────────┤
│ to       │        2000 │
├──────────┼─────────────┤
│ come     │        2272 │
├──────────┼─────────────┤
│ tell     │        2425 │
├──────────┼─────────────┤
│ him      │        2032 │
├──────────┼─────────────┤
│ can      │        2064 │
├──────────┼─────────────┤
│ play     │        2377 │
├──────────┼─────────────┤
│ later    │        2101 │
├──────────┼─────────────┤
│ together │        2362 │
╘══════════╧═════════════╛


In [43]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
    # Use the tokenizer to preprocess text
    # add_special_tokens = True, let the max_length = 32, pad_to_max_length = True, return_tensors = 'pt'
    # Look up tokenizer.encode_plus
  encoding_dict = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=32,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
  return encoding_dict

# For each sentence, loop and preprocess it
# Put the input ids in the token_id array
# Put the attention_masks in the attention_masks array

for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])

# Concatenate all the elements of token_id into a tensor
token_id = torch.cat(token_id, dim=0)
# Concatenate all the elements of attention_masks
attention_masks = torch.cat(attention_masks, dim=0)
# Make a tensor out of the labels
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [44]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample'''
    index = random.randint(0, len(text) - 1)
    # This should take a random row of token_id and display the tokens, token ids, and attention masks 1/0 values
    # You might need to use "tabulate" on a certain numpy array
    random_token_id = token_id[index]
    random_attention_mask = attention_masks[index]
    # Convert token IDs to tokens using the tokenizer's decode method
    tokens = tokenizer.decode(random_token_id.tolist(), skip_special_tokens=False)

    # Create a table using tabulate
    table_data = list(zip(tokens.split(), random_token_id.tolist(), random_attention_mask.tolist()))
    headers = ["Tokens", "Token IDs", "Attention Masks"]
    table = tabulate(table_data, headers, tablefmt="fancy_grid")

    # Display the table
    print(table)


print_rand_sentence_encoding()

╒══════════╤═════════════╤═══════════════════╕
│ Tokens   │   Token IDs │   Attention Masks │
╞══════════╪═════════════╪═══════════════════╡
│ [CLS]    │         101 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ i        │        1045 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ want     │        2215 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ to       │        2000 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ be       │        2022 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ there    │        2045 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ so       │        2061 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ i        │        1045 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ can      │        2064 │                 1 │
├──────────┼─────────────┼───────────────────┤
│ kiss     │ 

In [45]:
val_ratio = 0.2
# Pick a recommended batch size from https://arxiv.org/pdf/1810.04805.pdf
batch_size = 32

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    range(len(text)),
    test_size=val_ratio,
    stratify=labels,
    random_state=0
)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], attention_masks[train_idx], labels[train_idx])

val_set = TensorDataset(token_id[val_idx], attention_masks[val_idx], labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

### Load specific versions of the model

In [46]:
# Load the BertForSequenceClassification model
# Do not ouput the attentions and all hidden states
from transformers import BertForSequenceClassification, AdamW

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2, output_attentions=False, output_hidden_states=False)

# See for the optimizer and some learning rates: https://arxiv.org/pdf/1810.04805.pdf
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Set the model to the right device

In [47]:
# device = torch.device('mps') if (
#     torch.backends.mps.is_available() and torch.backends.mps.is_built()
# ) else torch.device('cpu')

# If on GPU, do as below
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [48]:
_ = model.to(device)

# Recommended number of epochs: See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

In [49]:
# Print all the layers of this BERT model and the number of parameters per layer
fine_tune_bert = True

total_parameters = 0
for name, param in model.named_parameters():
    if fine_tune_bert:
        print(name, param.shape)
        total_parameters += param.numel()

# Loop through all the parameters
# if fine_tune_bert is off, just fine tune the classifer head; otherwise, all parameters
# Also, print out all the parameter names and the number of elements for that parameter
# Finally, accumulate the total number of parameters
# You should get about 110 M

assert(total_parameters == 109483778)

bert.embeddings.word_embeddings.weight torch.Size([30522, 768])
bert.embeddings.position_embeddings.weight torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight torch.Size([2, 768])
bert.embeddings.LayerNorm.weight torch.Size([768])
bert.embeddings.LayerNorm.bias torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias torch.Size([768])
bert.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.output.dense.bias torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
bert.encoder

### Train the model

In [50]:
# Use torchmetrics to set up accuracy, recall, precision, and auroc
# Put these all on cpu as AUROC does not work on mps due to some bug
import torchmetrics

accuracy = torchmetrics.Accuracy(task='binary').cpu()
recall = torchmetrics.Recall(task='binary').cpu()
precision = torchmetrics.Precision(task='binary').cpu()
auroc = torchmetrics.AUROC(task='binary').cpu()

In [57]:
# Main training / validation loop
for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):

        # map each element of the batch to device
        # Optimize over the batch
        # Do the forward call, etc
        # Remember to zero out the gradients as necessary

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()

        # Forward pass
        train_output = model(b_input_ids.to(device), attention_mask=b_input_mask.to(device), labels=b_labels)

        # Backward pass
        loss = train_output.loss
        loss.backward()

        # Anything else
        optimizer.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_auroc = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids.to(device), token_type_ids=None, attention_mask=b_input_mask.to(device), labels = b_labels)

        # Calculate validation metrics
        labels = b_labels.to('cpu')
        predicted_labels = torch.argmax(eval_output.logits, dim=1).to('cpu')

        val_accuracy.append(accuracy(predicted_labels, labels))
        val_recall.append(recall(predicted_labels, labels))
        val_precision.append(precision(predicted_labels, labels))
        val_auroc.append(auroc(predicted_labels, labels))

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)))
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))
    print('\t - Validation AUROC: {:.4f}\n'.format(sum(val_auroc)/len(val_auroc)))

Epoch:  50%|█████     | 1/2 [00:09<00:09,  9.53s/it]


	 - Train loss: 0.0025
	 - Validation Accuracy: 0.9911
	 - Validation Precision: 0.9514
	 - Validation Recall: 0.9181
	 - Validation AUROC: 0.9438



Epoch: 100%|██████████| 2/2 [00:19<00:00,  9.52s/it]


	 - Train loss: 0.0019
	 - Validation Accuracy: 0.9884
	 - Validation Precision: 0.9143
	 - Validation Recall: 0.9312
	 - Validation AUROC: 0.9479






### Test on a specific sentence, see the outcome

In [56]:
new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
    output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
Predicted Class:  Spam


### Questions

Question 1: Run the above by fine tuning bert and the classfier head and by not doing this (using BERT as a feature encoder). What is the gap between this?

Solution:

If we don't tune the BERT part of the model, that is, we use BERT as a feature encoder and only tune the classification layer, the model performance drops.

One the validation set:

- If we tune BERT, Train loss: 0.0019. If not, Train loss: 0.6829.
- If we tune BERT, Accuracy: 0.9884. If not, Accuracy: 0.8654.
- If we tune BERT, Precision: 0.9143. If not, Precision: 0.
- If we tune BERT, Recall: 0.9312. If not, Recall: 0.
- If we tune BERT, AUROC: 0.9479. If not, AUROC: 0.4852.

The AUROC drops almost half. If we do not tune BERT and only tune the classification layer, the model is not complex enough to capture the general pattern in the data. The precision and recall are nearly zero. So, the model may resort to always predict the mojority class.

In [70]:
model2 = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2, output_attentions=False, output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
# Step 2: Modify the last classification layer for binary classification (spam or not spam)
num_classes = 2
# model2.classifier = torch.nn.Sequential(
#     torch.nn.Linear(model.config.hidden_size, num_classes),
#     torch.nn.ReLU()
# )
# Freeze the BERT layers
for param in model2.bert.parameters():
    param.requires_grad = False

# Set device
model2.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [72]:
# Main training / validation loop
for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model2.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):

        # map each element of the batch to device
        # Optimize over the batch
        # Do the forward call, etc
        # Remember to zero out the gradients as necessary

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()

        # Forward pass
        train_output = model2(b_input_ids.to(device), attention_mask=b_input_mask.to(device), labels=b_labels)

        # Backward pass
        loss = train_output.loss
        loss.backward()

        # Anything else
        optimizer.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model2.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_auroc = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model2(b_input_ids.to(device), token_type_ids=None, attention_mask=b_input_mask.to(device), labels = b_labels)

        # Calculate validation metrics
        labels = b_labels.to('cpu')
        predicted_labels = torch.argmax(eval_output.logits, dim=1).to('cpu')

        val_accuracy.append(accuracy(predicted_labels, labels))
        val_recall.append(recall(predicted_labels, labels))
        val_precision.append(precision(predicted_labels, labels))
        val_auroc.append(auroc(predicted_labels, labels))

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)))
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))
    print('\t - Validation AUROC: {:.4f}\n'.format(sum(val_auroc)/len(val_auroc)))

Epoch:  50%|█████     | 1/2 [00:03<00:03,  3.32s/it]


	 - Train loss: 0.4443
	 - Validation Accuracy: 0.8654
	 - Validation Precision: 0.0000
	 - Validation Recall: 0.0000
	 - Validation AUROC: 0.4852



Epoch: 100%|██████████| 2/2 [00:06<00:00,  3.30s/it]


	 - Train loss: 0.4432
	 - Validation Accuracy: 0.8654
	 - Validation Precision: 0.0000
	 - Validation Recall: 0.0000
	 - Validation AUROC: 0.4852




