# Evaluate the BERTje/BERTweet models

This notebook evaluates the models (BERTje fine-tuned, BERTje + SVM, BERTweet fine-tuned and BERTweet + SVM) on the development dataset. For each model it writes a predictions file in the same format as the gold-standard file, mapping each tweet id to its predicted label. These files can then be scored (using the `scoring_dalc.py` script) to obtain evaluation results such as the F1-score.

This notebook can be run locally or on a hosted service such as Google Colab.

It requires all the models to be saved beforehand: the fine-tuned models as `model.safetensors` (+ `config.json`) files and the SVMs as Python pickle files.

In [1]:
# Install dependencies
!pip uninstall accelerate transformers -y
!pip install -U accelerate>=0.21.0
!pip3 install transformers
!pip3 install datasets
!pip3 install pandas
!pip3 install torch
!pip3 install scikit-learn
!pip3 install numpy
!pip3 install nltk emoji==0.6.0

[0mFound existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.39.1
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [2]:
# Install the BertTweet Repo, needed for the tweet normalizer
from os import path
from os import path
if not path.exists('./BERTweet/'):
  !git clone https://github.com/VinAIResearch/BERTweet/
import sys
sys.path.append('/content/BERTweet')

Cloning into 'BERTweet'...
remote: Enumerating objects: 113, done.[K
remote: Counting objects: 100% (113/113), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 113 (delta 61), reused 13 (delta 6), pack-reused 0[K
Receiving objects: 100% (113/113), 46.29 KiB | 846.00 KiB/s, done.
Resolving deltas: 100% (61/61), done.


In [3]:
# Use BERTweet to normalize all tweets, so that they match the BERTweet style
from TweetNormalizer import normalizeTweet


def _normalize_tweet_bertweet(tweet):
  """Add a BERTweet-style ``normalized_text`` entry to one tweet record.

  The ``text`` entry is passed through BERTweet's ``normalizeTweet`` and every
  occurrence of ``URL`` in the result is rewritten to ``HTTPURL``, the URL
  placeholder BERTweet uses. According to the original author's note, user
  names in this data are already ``@USER`` and URLs are already ``URL``.

  NOTE(review): the rewrite is a plain substring replacement, so it would also
  touch ``URL`` inside a longer token — confirm this cannot occur in the data.

  Args:
    tweet: Mapping-like record with a ``text`` entry; it is updated in place.

  Returns:
    The same ``tweet`` object, now carrying ``normalized_text``.
  """
  bertweet_text = normalizeTweet(tweet['text'])
  tweet['normalized_text'] = bertweet_text.replace('URL', 'HTTPURL')
  return tweet


In [4]:
# Create the tokenizers (one per base model)
from transformers import AutoTokenizer

# Each tokenizer is loaded by model id; bt_* is paired with the BERTweet models
# and bertje_* with the BERTje models in the cells below.
bt_tokenizer  = AutoTokenizer.from_pretrained('vinai/bertweet-base')
bertje_tokenizer = AutoTokenizer.from_pretrained('GroNLP/bert-base-dutch-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
# Load and prepare the data
import pandas as pd

# Read the development tweets; the path is relative to the current working directory
raw_data = pd.read_csv('dev_data_text.csv')

# Normalize the data for bertje and bertweet
# Row-wise apply: every row gets a 'normalized_text' entry derived from its 'text'
data_norm_bt = raw_data.apply(_normalize_tweet_bertweet, axis=1)
# BERTje uses the text as-is; this is an alias of raw_data, not a copy
data_norm_bertje = raw_data # TODO: does this need to be normalized?

# Plain Python lists (one entry per tweet) that are fed to the tokenizers later
X_bt = data_norm_bt['normalized_text'].tolist()
X_bertje = data_norm_bertje['text'].tolist()

In [6]:
# SKIP if not using google drive/colab
# Load the models from google drive
# NOTE: %cd changes the kernel's working directory, so any relative path used
# after this cell (or in an earlier cell that is re-run) resolves from the
# models folder below.

from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My\ Drive/Uni-23-24/Machine\ Learning\ Project/Final\ Assignment/models/


Mounted at /content/gdrive
/content/gdrive/My Drive/Uni-23-24/Machine Learning Project/Final Assignment/models


In [11]:
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn.functional as F

def batch_predictions(data, model, model_tokenizer, batch_size=8):
  """Predict a class index for every text in ``data``, one batch at a time.

  Args:
    data: Sliceable sequence of texts (e.g. a list of tweets).
    model: Sequence-classification model. It is put in eval mode, called with
      the tokenizer output as keyword arguments, and its result must expose
      the logits under ``['logits']``.
    model_tokenizer: Tokenizer, called with a list of texts and
      ``padding=True, truncation=True, return_tensors='pt'``.
    batch_size: Number of texts per forward pass; must be >= 1.

  Returns:
    A list with ``len(data)`` Python ints: the argmax class index per text.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f'batch_size must be >= 1, got {batch_size}')

  model.eval()

  # Run the inputs on whatever device the model lives on, so the function also
  # works when the caller moved the model to a GPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  res = []

  # torch.no_grad() is only used as a performance improvement here
  with torch.no_grad():
    for i in tqdm(range(0, len(data), batch_size)):
      batch_data = list(data[i:i+batch_size])
      batch_tok = model_tokenizer(batch_data, padding=True, truncation=True, return_tensors='pt')
      batch_tok = {key: value.to(device) for key, value in batch_tok.items()}
      batch_res = model(**batch_tok)

      # softmax is monotonic, so the argmax of the raw logits already is the
      # predicted class; .tolist() yields plain Python ints.
      res.extend(torch.argmax(batch_res['logits'], dim=1).tolist())

  return res

In [8]:
# Save output as specified by the gold file format
def save_predictions(predictions, path, ids=None):
  """Write predictions to a CSV with columns ``id`` and ``abusive_offensive_not``.

  Args:
    predictions: Sequence of predicted labels, one per tweet.
    path: Destination of the CSV file.
    ids: Optional sequence of tweet ids, aligned with ``predictions``. When
      omitted the ids are numbered 1..len(predictions), i.e. the predictions
      are assumed to be in the same order as a gold file numbered from 1.

  Raises:
    ValueError: If ``ids`` is given but its length differs from ``predictions``.
  """
  if ids is None:
    ids = range(1, len(predictions) + 1)
  elif len(ids) != len(predictions):
    raise ValueError(
      f'got {len(ids)} ids for {len(predictions)} predictions; they must align one-to-one'
    )

  df = pd.DataFrame({'id': list(ids), 'abusive_offensive_not': predictions})
  # index=False: the row index is not part of the output file
  df.to_csv(path, index=False)

### Evaluate BERTweet fine-tuned on the DEV dataset

In [9]:
# Load the fine-tuned BERTweet model
from transformers import AutoModelForSequenceClassification
bt_model_path = '/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bert_tweet_finetuned_1_model' # change to your local models path

# Load the weights from the safetensors file in that folder, with a 3-class head
bt_model = AutoModelForSequenceClassification.from_pretrained(bt_model_path, use_safetensors=True, num_labels=3)

In [12]:
# Make predictions in batches
# bt_pred holds one integer class index per entry of X_bt
bt_pred = batch_predictions(X_bt, bt_model, bt_tokenizer)
# Show the first ten predictions as a quick sanity check
print(bt_pred[:10])

  0%|          | 0/151 [00:00<?, ?it/s]

[1, 0, 1, 0, 1, 0, 1, 2, 1, 1]


In [13]:
# Turn the classes into labels
# NOTE(review): this index -> label mapping is hard-coded; it presumably mirrors
# the label encoding used when the model was fine-tuned — confirm.
bt_class_to_label = {0: "ABUSIVE", 1: "NOT", 2: "OFFENSIVE"}
bt_pred_labels = [bt_class_to_label[class_] for class_ in bt_pred]

In [14]:
# Save the predictions from the fine-tuned BERTweet model

# Output CSV; the same path is passed to the scoring function afterwards
BT_PREDICTIONS_OUTPUT_FILE = '/content/bt_finetuned_dev_pred.csv'

save_predictions(bt_pred_labels, BT_PREDICTIONS_OUTPUT_FILE)

In [15]:
# Get the classification report
# NOTE(review): the reports recorded in this notebook show a different `support`
# column for every model although all of them are scored against the same gold
# file. Support should depend on the gold labels only, so the (predictions, gold)
# argument order may be swapped relative to what compute_macro_f1 expects —
# confirm against scoring_dalc.py.
from scoring_dalc import compute_macro_f1

compute_macro_f1(BT_PREDICTIONS_OUTPUT_FILE, '/content/dev_data_abusive_offensive_gold.csv')

              precision    recall  f1-score   support

     ABUSIVE       0.48      0.68      0.56       170
         NOT       0.88      0.83      0.86       807
   OFFENSIVE       0.36      0.31      0.33       228

    accuracy                           0.71      1205
   macro avg       0.57      0.61      0.58      1205
weighted avg       0.72      0.71      0.72      1205



### Evaluate BERTje fine-tuned on the DEV dataset

In [17]:
# Load the fine-tuned BERTje model
# Relies on AutoModelForSequenceClassification imported in an earlier cell
bertje_model_path = '/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje_finetuned' # change to your local models path
# Load the weights from the safetensors file in that folder, with a 3-class head
bertje_model = AutoModelForSequenceClassification.from_pretrained(bertje_model_path, use_safetensors=True, num_labels=3)

In [18]:
# Make the predictions in batches
# bertje_pred holds one integer class index per entry of X_bertje
bertje_pred = batch_predictions(X_bertje, bertje_model, bertje_tokenizer)
# Show the first ten predictions as a quick sanity check
print(bertje_pred[:10])

  0%|          | 0/151 [00:00<?, ?it/s]

[2, 2, 0, 0, 1, 2, 2, 2, 2, 1]


In [20]:
# Turn the classes into labels
# NOTE(review): this index -> label mapping is hard-coded; it presumably mirrors
# the label encoding used when the model was fine-tuned — confirm.
bertje_class_to_label = {0: "ABUSIVE", 1: "NOT", 2: "OFFENSIVE"}
bertje_pred_labels = [bertje_class_to_label[class_] for class_ in bertje_pred]
# Show the first ten labels as a quick sanity check
print(bertje_pred_labels[:10])

['OFFENSIVE', 'OFFENSIVE', 'ABUSIVE', 'ABUSIVE', 'NOT', 'OFFENSIVE', 'OFFENSIVE', 'OFFENSIVE', 'OFFENSIVE', 'NOT']


In [21]:
# Save the predictions from the fine-tuned BERTje model
# Output CSV; the same path is passed to the scoring function afterwards
BERTJE_PRED_OUTPUT_FILE = '/content/bertje_finetuned_dev_pred.csv'

save_predictions(bertje_pred_labels, BERTJE_PRED_OUTPUT_FILE)

In [22]:
# Get the classification report
# NOTE(review): the `support` column of the recorded reports differs between
# models although the gold file is the same, which suggests the (predictions,
# gold) argument order may be swapped — confirm against scoring_dalc.py.
from scoring_dalc import compute_macro_f1

compute_macro_f1(BERTJE_PRED_OUTPUT_FILE, '/content/dev_data_abusive_offensive_gold.csv')

              precision    recall  f1-score   support

     ABUSIVE       0.49      0.67      0.57       180
         NOT       0.90      0.86      0.88       800
   OFFENSIVE       0.45      0.39      0.42       225

    accuracy                           0.74      1205
   macro avg       0.61      0.64      0.62      1205
weighted avg       0.75      0.74      0.75      1205



### Evaluate BERTweet SVM on DEV dataset

In [23]:
import torch
import numpy as np


def batch_embeddings(data, model, model_tokenizer, batch_size=8, mask_padding=False):
  """Embed every text in ``data`` by mean-pooling the model's last hidden state.

  Args:
    data: Sliceable sequence of texts (e.g. a list of tweets).
    model: Encoder model. It is put in eval mode, called with the tokenizer
      output as keyword arguments, and its result must expose
      ``last_hidden_state``.
    model_tokenizer: Tokenizer, called with a list of texts and
      ``padding=True, truncation=True, return_tensors="pt"``.
    batch_size: Number of texts per forward pass; must be >= 1.
    mask_padding: When False (default, the original behaviour) the mean is
      taken over all token positions, padding included, so a text's embedding
      depends on the longest text in its batch. When True, padding positions
      are excluded using the tokenizer's ``attention_mask``. Keep this equal
      to the setting used to create the features a downstream classifier was
      trained on.

  Returns:
    A numpy array with one row per text. For empty ``data`` the array has
    zero rows.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f'batch_size must be >= 1, got {batch_size}')

  model.eval()

  # Run the inputs on whatever device the model lives on
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  pooled_batches = []

  with torch.no_grad():
    # Process data in batches
    for start in tqdm(range(0, len(data), batch_size)):
      batch_texts = list(data[start:start+batch_size])
      batch_tokenized = model_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
      batch_tokenized = {key: value.to(device) for key, value in batch_tokenized.items()}

      hidden = model(**batch_tokenized).last_hidden_state

      if mask_padding and 'attention_mask' in batch_tokenized:
        # Average over real tokens only; clamp guards against an all-padding row
        mask = batch_tokenized['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
      else:
        pooled = hidden.mean(dim=1)

      # .cpu() is required before .numpy() when the model runs on a GPU
      pooled_batches.append(pooled.cpu().numpy())

  if not pooled_batches:
    # np.concatenate rejects an empty list; return an array with zero rows instead
    hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
    return np.empty((0, hidden_size), dtype=np.float32)

  # Concatenate embeddings from all batches
  return np.concatenate(pooled_batches, axis=0)

In [24]:
# Get the embeddings from BERTweet
from transformers import AutoModel

# Base (not fine-tuned) encoder, loaded by model id
bt_embeddings_model = AutoModel.from_pretrained('vinai/bertweet-base')

# One embedding row per entry of X_bt; these are the features fed to the SVM
X_bt_svm = batch_embeddings(X_bt, bt_embeddings_model, bt_tokenizer)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

  0%|          | 0/151 [00:00<?, ?it/s]

In [25]:
# Load the SVM model
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load pickles that you created yourself or otherwise trust.
import pickle

with open('/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertweet_svm/model.pkl', 'rb') as infile:
  bt_svm_model = pickle.load(infile)

In [26]:
# Create predictions from the BERTweet embeddings with the unpickled SVM
bt_svm_pred = bt_svm_model.predict(X_bt_svm)

In [27]:
# Save predictions
# Output CSV; the same path is passed to the scoring function afterwards
BT_SVM_OUTPUT_FILE = '/content/bt_svm_dev_pred.csv'
save_predictions(bt_svm_pred, BT_SVM_OUTPUT_FILE)

In [28]:
# Get the classification report (compute_macro_f1 is imported in an earlier cell)
# NOTE(review): the `support` column of the recorded reports differs between
# models although the gold file is the same, which suggests the (predictions,
# gold) argument order may be swapped — confirm against scoring_dalc.py.
compute_macro_f1(BT_SVM_OUTPUT_FILE, '/content/dev_data_abusive_offensive_gold.csv')

              precision    recall  f1-score   support

     ABUSIVE       0.44      0.43      0.44       247
         NOT       0.80      0.80      0.80       772
   OFFENSIVE       0.24      0.25      0.25       186

    accuracy                           0.64      1205
   macro avg       0.49      0.49      0.49      1205
weighted avg       0.64      0.64      0.64      1205



### Evaluate BERTje SVM on DEV dataset

In [29]:
# Get the embeddings from BERTje
from transformers import AutoModel

# Base (not fine-tuned) encoder, loaded by model id
bertje_embeddings_model = AutoModel.from_pretrained('GroNLP/bert-base-dutch-cased')

# One embedding row per entry of X_bertje; these are the features fed to the SVM
X_bertje_svm = batch_embeddings(X_bertje, bertje_embeddings_model, bertje_tokenizer)

model.safetensors:   0%|          | 0.00/437M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/151 [00:00<?, ?it/s]

In [30]:
# Load the SVM model
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load pickles that you created yourself or otherwise trust.
import pickle

with open('/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje_svm/model.pkl', 'rb') as infile:
  bertje_svm_model = pickle.load(infile)

In [31]:
# Create predictions from the BERTje embeddings with the unpickled SVM
bertje_svm_pred = bertje_svm_model.predict(X_bertje_svm)
# Show the first ten predictions as a quick sanity check
print(bertje_svm_pred[:10])

['NOT' 'ABUSIVE' 'ABUSIVE' 'ABUSIVE' 'NOT' 'ABUSIVE' 'ABUSIVE' 'NOT'
 'OFFENSIVE' 'NOT']


In [32]:
# Save predictions
# Output CSV; the same path is passed to the scoring function afterwards
BERTJE_SVM_OUTPUT_FILE = '/content/bertje_svm_dev_pred.csv'
save_predictions(bertje_svm_pred, BERTJE_SVM_OUTPUT_FILE)

In [33]:
# Get the classification report (compute_macro_f1 is imported in an earlier cell)
# NOTE(review): the `support` column of the recorded reports differs between
# models although the gold file is the same, which suggests the (predictions,
# gold) argument order may be swapped — confirm against scoring_dalc.py.
compute_macro_f1(BERTJE_SVM_OUTPUT_FILE, '/content/dev_data_abusive_offensive_gold.csv')

              precision    recall  f1-score   support

     ABUSIVE       0.52      0.51      0.52       246
         NOT       0.84      0.82      0.83       784
   OFFENSIVE       0.27      0.30      0.28       175

    accuracy                           0.68      1205
   macro avg       0.54      0.54      0.54      1205
weighted avg       0.69      0.68      0.69      1205

