# Train an SVM on word embeddings generated by BERTweet

This notebook trains a Support Vector Machine (SVM) on the word embeddings generated by the BERTweet model. These embeddings are used as the input features for the SVM, which classifies whether tweets are abusive, offensive, or neither.

The notebook can be run locally or on a service like Google Colab.

In [None]:
# Install dependencies
!pip uninstall accelerate transformers -y
!pip install -U accelerate>=0.21.0
!pip3 install transformers
!pip3 install datasets
!pip3 install pandas
!pip3 install torch
!pip3 install scikit-learn
!pip3 install numpy
!pip3 install nltk emoji==0.6.0

[0mFound existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.39.1
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
# Install the BertTweet Repo, needed for the tweet normalizer
from os import path
from os import path
if not path.exists('./BERTweet/'):
  !git clone https://github.com/VinAIResearch/BERTweet/
import sys
sys.path.append('/content/BERTweet')

In [None]:
# Use BERTweet to normalize all tweets, so that they match the BERTweet style
from TweetNormalizer import normalizeTweet

import re

# Matches the dataset's stand-alone `URL` placeholder only. The word boundaries keep the
# pattern from touching `HTTPURL` (already in BERTweet style) or words that merely
# contain the letters "URL".
_URL_PLACEHOLDER = re.compile(r'\bURL\b')

def normalize_tweet(tweet):
  """Add a BERTweet-style `normalized_text` entry to a tweet row.

  Args:
    tweet: a mapping-like row (e.g. a pandas Series) with a `text` entry.

  Returns:
    The same `tweet` object, with `normalized_text` set.
  """
  # In our data usernames are already normalized to @USER so this will not change anything.
  # URLs are already replaced by URL, but BERTweet uses HTTPURL instead of URL, so the
  # placeholder has to be renamed; the rest is done by normalizeTweet from BERTweet.
  normalized = normalizeTweet(tweet['text'])
  # A plain str.replace('URL', 'HTTPURL') would also rewrite any HTTPURL token already
  # present in the normalized text into HTTPHTTPURL, hence the token-level substitution.
  tweet['normalized_text'] = _URL_PLACEHOLDER.sub('HTTPURL', normalized)
  return tweet

In [None]:
# Build the tokenizer that matches the pretrained BERTweet checkpoint
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'vinai/bertweet-base',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

In [None]:
# Load the data
import pandas as pd

dataset = pd.read_csv("train_data_offensive_abusive_taskC.csv")

# Shuffle the DataFrame (fixed seed, so the split below is reproducible)
dataset = dataset.sample(frac=1, random_state=42)

# Normalize all the tweets, the normalized tweet text
# is saved to the `normalized_text` column
dataset = dataset.apply(normalize_tweet, axis=1)

# Split data 80 / 20.
# A single split index is used so that every row ends up in exactly one of the two sets.
# Truncating 80% and 20% independently can leave one row in neither set when the dataset
# length is not a multiple of 5.
dataset_len = len(dataset)
test_size = int(dataset_len * 0.2)

test = dataset.iloc[:test_size]
test_X = test['normalized_text'].tolist()
test_y = test['abusive_offensive_not'].tolist()

train = dataset.iloc[test_size:]
train_X = train['normalized_text'].tolist()
train_y = train['abusive_offensive_not'].tolist()

assert len(train) + len(test) == dataset_len, "train/test split lost or duplicated rows"

# Tokenize the data
# NOTE: embeddings cached on disk are only valid for the split they were computed from.
train_X_tokenized = tokenizer(train_X, padding='max_length', truncation=True, return_tensors='pt')
test_X_tokenized = tokenizer(test_X, padding='max_length', truncation=True, return_tensors='pt')

In [None]:
# Get the embeddings from bertweet
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained('vinai/bertweet-base')
model.eval()  # inference only: make sure dropout is disabled

# Run the model in mini-batches: pushing the whole training set through in a single
# forward pass has to hold the hidden states of every tweet in memory at once.
BATCH_SIZE = 32
num_train_rows = train_X_tokenized['input_ids'].shape[0]

train_embedding_batches = []
with torch.no_grad():
    for start in range(0, num_train_rows, BATCH_SIZE):
        batch = {name: tensor[start:start + BATCH_SIZE] for name, tensor in train_X_tokenized.items()}
        batch_outputs = model(**batch)
        # Sentence embedding = mean of the last hidden state over the sequence dimension.
        # NOTE(review): the inputs are padded to max length and no attention mask is applied
        # here, so padding positions are part of the mean. Changing the pooling would have
        # to be done for the train and test embeddings together.
        train_embedding_batches.append(batch_outputs.last_hidden_state.mean(dim=1))

train_X_embeddings = torch.cat(train_embedding_batches).numpy()

In [None]:
# Optional: write the training embeddings to Google Drive so a later session can reload them
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

np.save(
    '/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertweet_svm/embeddings/train_X_v1_embeddings.npy',
    train_X_embeddings,
)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load previously saved training embeddings from Google Drive instead of recomputing them.
# Only the embedding computation and the save cell can be skipped this way: the data
# loading cell must still be run, because the SVM is fitted on `train_y` and evaluated
# against `test_y`, both of which are created there.
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

train_X_embeddings = np.load('/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertweet_svm/embeddings/train_X_v1_embeddings.npy')

Mounted at /content/gdrive


In [None]:
# Fit a linear-kernel SVM on the BERTweet embeddings
from sklearn.svm import SVC

# `fit` returns the estimator itself, so construction and training can be chained
svm_model = SVC(kernel='linear').fit(train_X_embeddings, train_y)

In [None]:
# Get the embeddings for the test data
import torch
from transformers import AutoModel

# The cell that creates `model` may have been skipped when the training embeddings were
# loaded from disk, so load the model here if it does not exist yet.
try:
    model
except NameError:
    model = AutoModel.from_pretrained('vinai/bertweet-base')
model.eval()  # inference only: make sure dropout is disabled

# Run the model in mini-batches instead of one forward pass over the whole test set,
# which would have to hold the hidden states of every tweet in memory at once.
BATCH_SIZE = 32
num_test_rows = test_X_tokenized['input_ids'].shape[0]

test_embedding_batches = []
with torch.no_grad():
    for start in range(0, num_test_rows, BATCH_SIZE):
        batch = {name: tensor[start:start + BATCH_SIZE] for name, tensor in test_X_tokenized.items()}
        batch_outputs = model(**batch)
        # Same pooling as used for the training embeddings: mean of the last hidden state
        # over the sequence dimension.
        test_embedding_batches.append(batch_outputs.last_hidden_state.mean(dim=1))

test_X_embeddings = torch.cat(test_embedding_batches).numpy()

In [None]:
# Optional: write the test embeddings to Google Drive so a later session can reload them
# Note: the google colab code can be removed when running locally
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

np.save(
    '/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertweet_svm/embeddings/test_X_v1_embeddings.npy',
    test_X_embeddings,
)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load previously saved test embeddings instead of recomputing them
# (replaces the two cells above: computing and saving the test embeddings).
# numpy is imported here so this cell does not depend on one of the skipped cells;
# Google Drive must already be mounted at /content/gdrive.
import numpy as np

test_X_embeddings = np.load('/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertweet_svm/embeddings/test_X_v1_embeddings.npy')

In [None]:
from sklearn.metrics import f1_score

# Predict labels for the held-out tweets and report the macro-averaged F1 score
predictions = svm_model.predict(test_X_embeddings)
f1 = f1_score(y_true=test_y, y_pred=predictions, average='macro')
print("F1 Score:", f1)

F1 Score: 0.4784933927431371


In [None]:
# Persist the fitted SVM to Google Drive as a pickle file
import pickle

with open('/content/gdrive/MyDrive/Uni-23-24/Machine Learning Project/Final Assignment/models/bertweet_svm/model.pkl', 'wb') as model_file:
  # Serialize to bytes first, then write them out in one go
  model_file.write(pickle.dumps(svm_model))