# BERTje finetuned on DALC dataset

This notebook fine-tunes the BERTje model on the DALC dataset to create a model that can classify whether (Dutch) tweets are abusive, offensive or neither.

The notebook can be run locally or on a service like Google Colab.

In [None]:
# Install dependencies
!pip uninstall accelerate transformers -y
!pip install -U accelerate>=0.21.0
!pip3 install transformers
!pip3 install datasets
!pip3 install pandas
!pip3 install torch
!pip3 install scikit-learn
!pip3 install numpy
!pip3 install nltk emoji==0.6.0

[0mFound existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.39.1
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
# Install BERTje Repo, needed for tweet normalizer
from os import path
if not path.exists('./BERTje/'):
  !git clone https://github.com/wietsedv/bertje
import sys
sys.path.append('/content/BERTje')

Cloning into 'bertje'...
remote: Enumerating objects: 190, done.[K
remote: Counting objects: 100% (190/190), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 190 (delta 53), reused 105 (delta 11), pack-reused 0[K
Receiving objects: 100% (190/190), 269.05 KiB | 8.41 MiB/s, done.
Resolving deltas: 100% (53/53), done.


In [None]:
# Tokenizer setup
from transformers import AutoTokenizer

# Load the tokenizer that ships with the pretrained BERTje checkpoint.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

def tokenize(tweet):
  """Tokenize the ``text`` field of a single dataset row.

  The text is handed to the tokenizer as-is (no normalization happens in this
  function). ``padding='max_length'`` together with ``truncation=True`` makes
  every row come out at one fixed length.

  Args:
    tweet: mapping with a ``'text'`` entry.

  Returns:
    The tokenizer output for that text.
  """
  raw_text = tweet['text']
  return tokenizer(raw_text, padding='max_length', truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Metric callback passed to the Trainer via `compute_metrics`
def compute_metrics(p):
  """Score a batch of predictions against the reference labels.

  Args:
    p: a pair that unpacks into ``(predictions, labels)``. The predictions are
      reduced with ``argmax`` over axis 1, so they are presumably per-class
      scores of shape (n_samples, n_classes) -- TODO confirm against caller.

  Returns:
    dict with ``accuracy`` plus macro-averaged ``precision``, ``recall`` and
    ``f1``.
  """
  scores, labels = p
  predicted = np.argmax(scores, axis=1)

  # Accuracy takes no averaging mode; the other three are macro-averaged.
  metrics = {"accuracy": accuracy_score(y_true=labels, y_pred=predicted)}
  macro_scorers = {"precision": precision_score, "recall": recall_score, "f1": f1_score}
  for metric_name, scorer in macro_scorers.items():
    metrics[metric_name] = scorer(y_true=labels, y_pred=predicted, average="macro")

  return metrics

In [None]:
# Map the string class of a row to its integer id
def transform_labels(tweet, encoder):
  """Add an integer ``label`` entry to a dataset row.

  Args:
    tweet: mapping with an ``'abusive_offensive_not'`` entry; it is modified
      in place.
    encoder: object whose ``transform`` accepts a list of raw labels and
      returns their encoded values in the same order.

  Returns:
    The same ``tweet`` mapping, now carrying ``'label'``.
  """
  raw_label = tweet['abusive_offensive_not']
  # `transform` works on sequences, so wrap the single value and unwrap the result.
  tweet['label'] = encoder.transform([raw_label])[0]
  return tweet

In [None]:
# Prepare the dataset
from datasets import load_dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder

# Fixed seed so the splits below are replicable
SEED = 42

# Read the CSV; everything lands in a single 'train' split
dataset = load_dataset('csv', data_files='train_data_offensive_abusive_taskC.csv')

# Tokenize every row (the tokenizer outputs are added as new columns)
dataset = dataset.map(tokenize)

# Fit the label encoder on the string classes and show the id -> class mapping
le = LabelEncoder().fit(dataset['train']['abusive_offensive_not'])

print(le.classes_)
print(le.inverse_transform([0, 1, 2]))

# Add the integer `label` column
dataset = dataset.map(transform_labels, fn_kwargs={'encoder': le})

# Drop the raw columns; only the tokenizer outputs and `label` remain
dataset = dataset.remove_columns(['id', 'text', 'abusive_offensive_not'])

# Show what the data looks like now
print(dataset)

# First hold out 20% as the test set, then hold out 20% of the remainder
# as the validation set
train_test_ds = dataset['train'].train_test_split(test_size=0.2, seed=SEED)
train_valid_ds = train_test_ds['train'].train_test_split(test_size=0.2, seed=SEED)

# Collect the three splits under their final names
ds = DatasetDict({
  'train': train_valid_ds['train'],
  'validation': train_valid_ds['test'],
  'test': train_test_ds['test'],
})

# Show the shape of the splits and a few training rows
print(ds)
print(ds['train'][:5])

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6817 [00:00<?, ? examples/s]

['ABUSIVE' 'NOT' 'OFFENSIVE']
['ABUSIVE' 'NOT' 'OFFENSIVE']


Map:   0%|          | 0/6817 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 6817
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 4362
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1091
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1364
    })
})
{'input_ids': [[1, 0, 7127, 23317, 2123, 15090, 131, 21416, 16119, 10532, 5108, 18442, 23917, 13261, 10132, 24397, 20347, 11281, 10476, 10393, 13644, 28693, 13458, 132, 11, 9837, 25108, 12181, 13604, 16058, 10532, 5108, 15638, 10391, 13903, 10516, 3654, 11, 13277, 13132, 10532, 8399, 8, 16038, 8, 13, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 

In [None]:
# Shuffle each split with the shared seed and bind it to its own name
train_data, validation_data, test_data = (
  ds[split_name].shuffle(seed=SEED)
  for split_name in ('train', 'validation', 'test')
)

In [None]:
# Model
from transformers import AutoModelForSequenceClassification, TrainingArguments, set_seed

# Seed the random number generators *before* the model is built: the
# classification head is newly initialized (it is not part of the pretrained
# checkpoint), so without this its starting weights differ from run to run.
set_seed(SEED)

# Three output labels: ABUSIVE, NOT, OFFENSIVE
model = AutoModelForSequenceClassification.from_pretrained("GroNLP/bert-base-dutch-cased", num_labels=3)

training_args = TrainingArguments(
    output_dir = 'test_trainer_1',
    logging_dir = 'logs',
    # Evaluate on the validation set at the end of every epoch.
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=0.00005,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0,
    num_train_epochs=10,
    # Use the notebook-wide seed for training as well.
    seed=SEED,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import gc
import os

import torch
from transformers import Trainer

# Free unreferenced Python objects and cached GPU memory before training starts
gc.collect()
torch.cuda.empty_cache()

# Train on the training split; the validation split is scored with
# `compute_metrics` as configured in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.663963,0.747021,0.631176,0.565908,0.582059
2,0.549900,0.821339,0.738772,0.650783,0.553801,0.577196
3,0.549900,0.989229,0.716774,0.606021,0.625548,0.61337
4,0.192600,1.366849,0.726856,0.628682,0.620198,0.618796
5,0.192600,1.565971,0.737855,0.611276,0.59669,0.602429
6,0.065700,1.693851,0.741522,0.632534,0.601688,0.613627
7,0.065700,1.705678,0.749771,0.638692,0.630192,0.634068
8,0.026100,1.74797,0.744271,0.632973,0.614531,0.622552
9,0.026100,1.784793,0.742438,0.633683,0.611386,0.620743
10,0.019300,1.817853,0.738772,0.632144,0.606608,0.616193


TrainOutput(global_step=2730, training_loss=0.15747094442556192, metrics={'train_runtime': 1410.1372, 'train_samples_per_second': 30.933, 'train_steps_per_second': 1.936, 'total_flos': 1.147700728129536e+16, 'train_loss': 0.15747094442556192, 'epoch': 10.0})

In [None]:
# Evaluate the fine-tuned model on the held-out test split.
model.eval()
x = trainer.predict(test_data)
# Display the result (predictions, label ids and test metrics) as the cell output.
x

PredictionOutput(predictions=array([[-3.8626356 ,  6.893009  , -3.876535  ],
       [-3.9970782 ,  6.6821146 , -3.4193232 ],
       [ 5.919577  , -2.8035722 , -3.3443756 ],
       ...,
       [ 3.8499227 , -4.411129  ,  0.7895871 ],
       [-4.563892  ,  0.43073633,  4.331287  ],
       [ 0.47292158,  1.579824  , -2.5964873 ]], dtype=float32), label_ids=array([1, 1, 0, ..., 0, 1, 1]), metrics={'test_loss': 1.5934430360794067, 'test_accuracy': 0.7727272727272727, 'test_precision': 0.6579035951403377, 'test_recall': 0.6344914582486131, 'test_f1': 0.644524911797626, 'test_runtime': 14.021, 'test_samples_per_second': 97.283, 'test_steps_per_second': 6.134})

In [None]:
# Save the model
import os

from google.colab import drive
drive.mount('/content/gdrive')

# Absolute target folder on the mounted Drive. Written as a plain Python
# string: shell-style `\ ` escapes are not needed here and would end up as
# literal backslashes in the directory name.
MODEL_DIR = '/content/gdrive/My Drive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje'
os.makedirs(MODEL_DIR, exist_ok=True)

# Keep the working directory on the model folder, as before.
os.chdir(MODEL_DIR)

# Save the bare model in a subfolder ...
model.save_pretrained(os.path.join(MODEL_DIR, "bertje_finetuned"))

# ... and the Trainer's copy in the model folder itself. An absolute path is
# used because a relative "gdrive/My Drive/..." path would be resolved against
# the working directory and create a nested, wrongly named directory.
trainer.save_model(MODEL_DIR)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje
