# Train an SVM on word embeddings generated by BERTje

This notebook trains a Support Vector Machine on the word embeddings generated by the BERTje model. These embeddings are used as the input features for the SVM, which classifies whether tweets are abusive, offensive or not.

The notebook can be run locally or on a service like Google Colab.

In [None]:
# Install dependencies
!pip uninstall accelerate transformers -y
!pip install -U accelerate>=0.21.0
!pip3 install transformers
!pip3 install datasets
!pip3 install pandas
!pip3 install torch
!pip3 install scikit-learn
!pip3 install numpy
!pip3 install nltk emoji==0.6.0

[0mFound existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.39.1
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
# Load the data
import pandas as pd

dataset = pd.read_csv("train_data_offensive_abusive_taskC.csv")

# Split data 80 / 20
dataset_len = len(dataset)

# Shuffle the DataFrame to ensure randomness (fixed seed keeps the split reproducible)
dataset = dataset.sample(frac=1, random_state=42)

# Use ONE split index for both parts. Truncating 80% and 20% independently
# (int(n*0.8) and int(n*0.2)) can leave a row that belongs to neither split.
test_len = int(dataset_len * 0.2)

# Split the data in train and test: the first `test_len` shuffled rows are the
# test set, every remaining row is training data.
train = dataset.iloc[test_len:]
train_X = train['text'].tolist()
train_y = train['abusive_offensive_not'].tolist()

test = dataset.head(test_len)
test_X = test['text'].tolist()
test_y = test['abusive_offensive_not'].tolist()

# The two splits must partition the dataset exactly.
assert len(train) + len(test) == dataset_len

In [None]:
import torch
import numpy as np
from tqdm import tqdm
# AutoModel was used without being imported, and the tokenizer that get_embeddings
# relies on was never created; both made the notebook fail on a fresh kernel.
from transformers import AutoModel, AutoTokenizer

# Get the embeddings from BERTje
MODEL_NAME = "GroNLP/bert-base-dutch-cased"

# Tokenizer and model are loaded from the same checkpoint so the vocabulary matches.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# The model is only used as a frozen feature extractor; make inference mode explicit
# so dropout stays disabled while embeddings are computed.
model.eval()

def get_embeddings(data, batch_size=8):
  """Embed texts with the module-level `tokenizer` and `model`.

  Each text is represented by the mean of its last-layer token vectors.
  Padding positions are excluded from the mean (via the attention mask), so a
  text's embedding does not depend on the other texts in its batch.

  Args:
    data: Sequence of strings to embed.
    batch_size: Number of texts sent through the model at once (default 8).

  Returns:
    A numpy array with one row per input text and one column per hidden
    dimension. Empty input yields an array with zero rows.
  """
  num_samples = len(data)
  if num_samples == 0:
      # np.concatenate raises on an empty list; return an empty matrix instead.
      return np.empty((0, model.config.hidden_size), dtype=np.float32)

  embeddings = []

  # Process data in batches
  for i in tqdm(range(0, num_samples, batch_size)):
      batch_texts = list(data[i:i+batch_size])
      batch_tokenized = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")

      with torch.no_grad():
          batch_outputs = model(**batch_tokenized)
          hidden_states = batch_outputs.last_hidden_state

          # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
          token_mask = batch_tokenized["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
          summed = (hidden_states * token_mask).sum(dim=1)
          # clamp guards against division by zero for an all-padding row.
          token_counts = token_mask.sum(dim=1).clamp(min=1)

          batch_embeddings = (summed / token_counts).cpu().numpy()
          embeddings.append(batch_embeddings)

  # Concatenate embeddings from all batches
  return np.concatenate(embeddings, axis=0)


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Embed the training texts; one row of `embeddings_train` per entry of `train_X`.
# This is the slow step of the notebook (the recorded run took about 5.5 minutes).
embeddings_train = get_embeddings(train_X)

100%|██████████| 682/682 [05:37<00:00,  2.02it/s]


In [None]:
# Embed the held-out test texts with the same function used for the training texts,
# so both feature matrices are produced identically.
embeddings_test = get_embeddings(test_X)

100%|██████████| 171/171 [01:27<00:00,  1.96it/s]


In [None]:
# Mount Google Drive when running on Colab. The google.colab package only exists
# there, so the import is guarded to keep this cell from crashing on a local run.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pickle

def save_embeddings(path, data):
  """Pickle `data` to the file at `path` and print a confirmation.

  Args:
    path: Destination file; opened in binary write mode, so an existing file
      is overwritten.
    data: Object to serialize (here: the embedding matrix).
  """
  with open(path, mode='wb') as sink:
      pickle.Pickler(sink).dump(data)

  print("Embeddings saved successfully.")

# Persist both embedding matrices to Google Drive, train first and then test.
for target_path, vectors in (
    ("/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje_svm/bertje_embeddings_train.pkl", embeddings_train),
    ("/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje_svm/bertje_embeddings_test.pkl", embeddings_test),
):
    save_embeddings(target_path, vectors)

Embeddings saved successfully.
Embeddings saved successfully.


In [None]:
from sklearn.svm import SVC

# The SVM: a linear-kernel support vector classifier fitted on the BERTje
# embeddings (features) and the training labels (targets).
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X=embeddings_train, y=train_y)

In [None]:
from sklearn.metrics import f1_score

# Classify the held-out embeddings with the fitted SVM.
predicted = svm_classifier.predict(X=embeddings_test)

# Macro averaging: the unweighted mean of the per-class F1 scores.
f1score = f1_score(y_true=test_y, y_pred=predicted, average='macro')

print(f1score)

0.5561369727148254


In [None]:
# Save the sklearn SVM model
import pickle

# Serialize the fitted classifier next to the saved embeddings on Google Drive.
model_path = '/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertje_svm/model.pkl'
with open(model_path, mode='wb') as model_file:
  pickle.dump(svm_classifier, model_file)