# Create predictions using the fine-tuned BERTje model

This notebook loads our fine-tuned BERTje model, predicts a class for every text in `test_data_text_final.csv`, and saves the predicted labels to a CSV file.

In [None]:
# Install dependencies
!pip uninstall accelerate transformers -y
!pip install -U accelerate>=0.21.0
!pip3 install transformers
!pip3 install datasets
!pip3 install pandas
!pip3 install torch
!pip3 install scikit-learn
!pip3 install numpy
!pip3 install nltk emoji==0.6.0

[0mFound existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.39.1
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
# Build the tokenizer from the BERTje base checkpoint on the Hugging Face Hub
from transformers import AutoTokenizer

bertje_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='GroNLP/bert-base-dutch-cased',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Read the input texts from disk
import pandas as pd

raw_data = pd.read_csv(filepath_or_buffer='test_data_text_final.csv')

# No normalization step is applied for BERTje here: this name is an alias of
# raw_data (same object, not a copy).
data_norm_bertje = raw_data

# Plain Python list holding the values of the `text` column
X_bertje = data_norm_bertje.loc[:, 'text'].tolist()

In [None]:
# SKIP if not using google drive/colab
# Mount Google Drive and switch to the folder that holds the models

# Make the Drive contents available under /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')
# Change the working directory to the models folder (spaces in the path are
# backslash-escaped for the %cd magic)
%cd /content/gdrive/My\ Drive/Uni-23-24/Machine\ Learning\ Project/Final\ Assignment/models/


Mounted at /content/gdrive
/content/gdrive/My Drive/Uni-23-24/Machine Learning Project/Final Assignment/models


In [None]:
import numpy as np
from tqdm import tqdm
import torch
import torch.nn.functional as F

def _model_device(model):
  """Return the device `model` lives on, or None if it cannot be determined."""
  # Hugging Face models expose a `device` attribute; plain torch modules do not.
  device = getattr(model, 'device', None)
  if device is not None:
    return device
  try:
    return next(model.parameters()).device
  except (AttributeError, StopIteration, TypeError):
    return None


def batch_predictions(data, model, model_tokenizer, batch_size=8):
  """Predict a class index for every item of `data`, one batch at a time.

  Args:
    data: Sliceable sequence of inputs; each slice `data[i:i+batch_size]` is
      handed to `model_tokenizer` as one batch.
    model: Callable model that accepts the tokenizer output as keyword
      arguments and returns a mapping with a `'logits'` entry whose second
      dimension indexes the classes.
    model_tokenizer: Callable invoked with `padding=True, truncation=True,
      return_tensors='pt'`; must return a mapping of tensors.
    batch_size: Number of items per forward pass. Must be at least 1.

  Returns:
    A list with one predicted class index (Python int) per item of `data`,
    in input order. Empty when `data` is empty.

  Raises:
    ValueError: If `batch_size` is smaller than 1.
  """
  # A negative step would make range() yield nothing and silently return no
  # predictions at all, so reject it explicitly.
  if batch_size < 1:
    raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

  n_samples = len(data)
  res = []
  device = _model_device(model)

  # Switch off dropout and other training-only behaviour.
  model.eval()

  # Gradients are not needed for inference; disabling them saves time and memory.
  with torch.no_grad():
    for i in tqdm(range(0, n_samples, batch_size)):
      batch_data = data[i:i+batch_size]
      batch_tok = model_tokenizer(batch_data, padding=True, truncation=True, return_tensors='pt')

      # The tokenizer creates CPU tensors; move them next to the model so the
      # forward pass also works when the model has been placed on a GPU.
      if device is not None:
        batch_tok = {
            name: value.to(device) if hasattr(value, 'to') else value
            for name, value in batch_tok.items()
        }

      batch_res = model(**batch_tok)

      # Softmax is monotonic, so the argmax of the raw logits is already the
      # predicted class; no need to compute probabilities first.
      pred_classes = torch.argmax(batch_res['logits'], dim=1)

      res.extend(pred_classes.tolist())

  return res

In [None]:
# Save output as specified by the gold file format
def save_predictions(predictions, path):
  """Write `predictions` to `path` as a CSV file without an index column.

  The file has two columns: `id`, numbering the rows from 1, and
  `abusive_offensive_not`, holding the predictions in their given order.
  """
  row_ids = list(range(1, len(predictions) + 1))
  columns = {'id': row_ids, 'abusive_offensive_not': predictions}
  pd.DataFrame(columns).to_csv(path, index=False)

### Create predictions with the fine-tuned BERTje model on the test dataset

In [None]:
from transformers import AutoModelForSequenceClassification

# Location of the fine-tuned BERTje checkpoint
bertje_model_path = '/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje_finetuned' # change to your local models path

# Load the checkpoint from safetensors weights with a three-class
# classification head
bertje_model = AutoModelForSequenceClassification.from_pretrained(
    bertje_model_path,
    num_labels=3,
    use_safetensors=True,
)

In [None]:
# Run batched inference over all texts, then preview the first ten class ids
bertje_pred = batch_predictions(
    data=X_bertje,
    model=bertje_model,
    model_tokenizer=bertje_tokenizer,
)
print(bertje_pred[:10])

100%|██████████| 409/409 [02:13<00:00,  3.05it/s]

[1, 1, 1, 1, 1, 2, 1, 1, 1, 1]





In [None]:
# Map every predicted class id to its label string (ids follow list order)
bertje_class_to_label = dict(enumerate(["ABUSIVE", "NOT", "OFFENSIVE"]))
bertje_pred_labels = list(map(bertje_class_to_label.__getitem__, bertje_pred))
print(bertje_pred_labels[:10])

['NOT', 'NOT', 'NOT', 'NOT', 'NOT', 'OFFENSIVE', 'NOT', 'NOT', 'NOT', 'NOT']


In [None]:
# Write the predicted labels to a CSV file
save_predictions(
    predictions=bertje_pred_labels,
    path='/content/bertje_finetuned_final_pred.csv',
)