In this notebook I use the HuggingFace library to experiment with CANINE and BERT on a hate speech detection task, to check how CANINE performs on noisy datasets.

# Installs/Imports

In [None]:
! pip install transformers datasets

In [None]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import get_scheduler
from datasets import load_dataset, load_metric

# Dataset

In [None]:
# Load the "hate_speech_offensive" dataset. The training cell reads its "tweet"
# and "class" columns and carves a test set out of its 'train' split.
dataset = load_dataset("hate_speech_offensive")

def tokenize_function(tokenizer, input_field, max_length=None):
    """Build a batch-tokenizing callable suitable for ``Dataset.map(..., batched=True)``.

    Args:
        tokenizer: Callable applied to the text batch; it is invoked with
            ``padding="max_length"`` and ``truncation=True``.
        input_field: Key of the text column to read from each batch.
        max_length: Optional cap forwarded to the tokenizer. When ``None``
            (default) the argument is not passed at all, so the tokenizer's own
            maximum length applies, exactly as before this parameter existed.

    Returns:
        A function mapping a batch (dict of columns) to the tokenizer's output.
    """
    # Only forward max_length when requested so the default call is unchanged.
    extra_kwargs = {} if max_length is None else {"max_length": max_length}

    def _tokenize(examples):
        return tokenizer(
            examples[input_field],
            padding="max_length",
            truncation=True,
            **extra_kwargs,
        )

    return _tokenize

# CANINE

In [None]:

from transformers import CanineTokenizer, CanineForSequenceClassification

canine_tokenizer = CanineTokenizer.from_pretrained("google/canine-s")
# num_labels=3: presumably one output per class (hate speech / offensive / neither).
canine = CanineForSequenceClassification.from_pretrained("google/canine-s", num_labels=3)

# Optional: freeze every parameter whose name does not contain 'classifier',
# so that only the classification head is trained. Uncomment to enable.
# for name, param in canine.named_parameters():
#     if 'classifier' not in name:
#         param.requires_grad = False

# Smoke test: a single forward pass on a toy sentence to check that the model
# returns a loss and logits. Nothing is back-propagated here, so run it
# without building an autograd graph.
inputs = canine_tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
with torch.no_grad():
    outputs = canine(**inputs, labels=labels)
print(outputs)
loss = outputs.loss
logits = outputs.logits


# BERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# num_labels=3: presumably one output per class (hate speech / offensive / neither).
bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Optional: freeze every parameter whose name does not contain 'classifier',
# so that only the classification head is trained. Uncomment to enable.
# for name, param in bert.named_parameters():
#     if 'classifier' not in name:
#         param.requires_grad = False

# Smoke test: a single forward pass on a toy sentence to check that the model
# returns a loss and logits. Nothing is back-propagated here, so run it
# without building an autograd graph.
inputs = bert_tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
with torch.no_grad():
    outputs = bert(**inputs, labels=labels)
print(outputs)
loss = outputs.loss
logits = outputs.logits


# Training

In [None]:
# Select model
model = canine
tokenizer = canine_tokenizer

# Parameters
# NOTE(review): 3e-4 is applied to all parameters because the freezing snippets
# in the model cells are disabled; full fine-tuning of BERT-style encoders
# usually uses roughly 2e-5 to 5e-5 -- confirm this value is intended.
learning_rate = 3e-4
n_epochs = 20
batch_size = 4
seed = 0  # makes the train/test split and the batch shuffling repeatable
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Seed torch's global RNG (used for DataLoader shuffling and dropout).
# The classification head was initialised earlier, in the model cell, so its
# initial weights are not controlled by this seed.
torch.manual_seed(seed)

# Dataset parameter
dataset_input_field = "tweet"
dataset_label_field = "class"
dataset_remove_columns = ['count','hate_speech_count', 'offensive_language_count', 'neither_count', 'tweet']

# Tokenizer wrangling: tokenize the text column, drop everything the model does
# not consume, and expose the target under the name "labels".
tokenized_datasets = dataset.map(tokenize_function(tokenizer, dataset_input_field), batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(dataset_remove_columns)
tokenized_datasets = tokenized_datasets.rename_column(dataset_label_field, "labels")
tokenized_datasets.set_format("torch")

# Hold out 10% of 'train' as a test set; seeded so every run evaluates on the same examples.
tokenized_datasets = tokenized_datasets['train'].train_test_split(test_size=0.1, seed=seed) # Remove if already split into train/test


# Loaders
train_dataloader = torch.utils.data.DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(tokenized_datasets["test"], batch_size=batch_size)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# The linear schedule is stepped once per batch, so it spans every batch of every epoch.
n_training_steps = n_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=n_training_steps)
acc = load_metric("accuracy")


In [None]:
# Remove wandb dependency if not needed
import wandb
wandb.init(project="nlpmva")
wandb.watch(model)

# The try/finally closes the wandb run even when training is interrupted or
# raises; otherwise the run stays open in the notebook kernel.
try:
    for epoch in range(n_epochs):
        metrics = {}

        # Train
        model.train()
        for batch in tqdm(train_dataloader, desc=f"epoch {epoch + 1}/{n_epochs}"):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            # Compute metrics (on the same forward pass that was used for the update)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            acc.add_batch(predictions=predictions, references=batch["labels"])
        # `acc` is shared by both phases: compute() is called at the end of each
        # phase, before the other phase starts adding batches.
        metrics['train_acc'] = acc.compute()

        # Test
        model.eval()
        with torch.no_grad():
            for batch in test_dataloader:
                batch = {name: tensor.to(device) for name, tensor in batch.items()}
                outputs = model(**batch)
                # Compute metrics
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                acc.add_batch(predictions=predictions, references=batch["labels"])
        metrics['test_acc'] = acc.compute()

        # log metrics
        wandb.log(metrics)
finally:
    wandb.finish()