# Necessary Imports


In [None]:
%pip install transformers --quiet
%pip install sentencepiece --quiet  # required dependency for AfriBERTA model

In [None]:
# Import necessary libraries
import subprocess
from pathlib import Path

import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

# Prefer the GPU whenever CUDA is usable, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Set up data

In [None]:
# Fetch the AfriSenti SemEval-2023 repository that holds the train/dev/test TSV files.
# The clone is skipped when a checkout is already present, so re-running the notebook
# no longer stops on git's "destination path already exists" error.
AFRISENTI_REPO_URL = "https://github.com/afrisenti-semeval/afrisent-semeval-2023.git"
AFRISENTI_DIR = Path("afrisent-semeval-2023")

if not (AFRISENTI_DIR / ".git").exists():
    # check=True turns a failed clone into an exception instead of a silent no-op
    subprocess.run(["git", "clone", AFRISENTI_REPO_URL, str(AFRISENTI_DIR)], check=True)

fatal: destination path 'afrisent-semeval-2023' already exists and is not an empty directory.


# Custom Dataset class


In [None]:
class TweetDataset(Dataset):
  """Tab-separated tweet corpus exposed as a PyTorch dataset of tokenized examples.

  The file is read with pandas and must provide a `tweet` and a `label` column.
  Labels are collapsed to binary ids: "negative" -> 0, every other value -> 1.
  """

  def __init__(self, file_path, tokenizer, max_length=512):
    """Load the TSV file and binarize its labels.

    Args:
      file_path: path of the tab-separated file to read.
      tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that accepts the
        keyword arguments used in `__getitem__` and returns a mapping containing
        'input_ids' and 'attention_mask' tensors.
      max_length: length every example is truncated/padded to. Defaults to 512.
    """
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.data = pd.read_csv(file_path, sep='\t')

    # transform the labels into binary integers (0 or 1), using custom-defined function
    self.data["label"] = self.data["label"].apply(self._convert_label)

  def _convert_label(self, sentiment):
    """Return 0 for the string "negative" and 1 for any other value."""
    return 0 if sentiment == "negative" else 1

  def __len__(self):
    """Return the number of rows read from the file."""
    return len(self.data)

  def __getitem__(self, idx):
    """Tokenize row `idx` and return its tensors.

    Returns:
      dict with 1-D 'input_ids' and 'attention_mask' tensors and a scalar
      `torch.long` 'label' tensor.
    """
    # Positional access keeps the lookup correct even if the frame's index is not 0..n-1
    tweet = str(self.data['tweet'].iloc[idx])
    label = int(self.data['label'].iloc[idx])

    # Tokenize the tweet; `padding='max_length'` is the supported spelling of the
    # deprecated `pad_to_max_length=True`
    encoding = self.tokenizer(
      tweet,
      add_special_tokens=True,
      truncation=True,
      max_length=self.max_length,
      padding='max_length',
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt'
    )

    # return_tensors='pt' yields a leading batch axis of size 1; flatten() drops it
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }



In [None]:
# Seed PyTorch's global random number generator for reproducibility
seed_val = 42
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7f5391275cd0>

# Data Loading

In [None]:
# Hugging Face checkpoint name; the tokenizer is loaded from it right away
PRETRAINED_MODEL = "castorini/afriberta_large"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

# Split files from the `ibo` folder of the cloned AfriSenti repository
# (dev.tsv serves as the validation split)
train_data_path, test_data_path, val_data_path = (
    'afrisent-semeval-2023/data/ibo/train.tsv',
    'afrisent-semeval-2023/data/ibo/test.tsv',
    'afrisent-semeval-2023/data/ibo/dev.tsv',
)



In [None]:
# Build one TweetDataset per split, all sharing the same tokenizer
train_data, test_data, val_data = (
    TweetDataset(split_path, tokenizer)
    for split_path in (train_data_path, test_data_path, val_data_path)
)

In [None]:
# Wrap each split in a DataLoader; only the training split is shuffled
BATCH_SIZE = 16
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
val_dataloader, test_dataloader = (
    DataLoader(split, shuffle=False, batch_size=BATCH_SIZE)
    for split in (val_data, test_data)
)

# Defining the Model

In [None]:
# Load the pretrained checkpoint with a two-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=2,
)
# Module.to() moves the parameters in place; as the last expression of the cell
# it also echoes the module's architecture below the cell
model.to(device)

Some weights of the model checkpoint at castorini/afriberta_large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at castorini/afriberta_large and are newly initialized: ['classifier.out_proj.weight', 'clas

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(70006, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-9): 10 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, b

In [None]:
# set optimizer and loss function
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# pinned to the defaults of the transformers implementation (1e-6 and 0.0); the one
# remaining difference is that PyTorch always applies Adam bias correction, which the
# previous `correct_bias=False` setting switched off.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# The model emits two logits per example and the labels are integer class ids, so the
# matching stand-alone criterion is cross-entropy (BCEWithLogitsLoss expects float
# targets shaped like the logits). The training loop itself reads `outputs.loss`.
criterion = torch.nn.CrossEntropyLoss()



In [None]:
# train the model
def _forward_batch(model, batch, device):
    """Move one collated batch to `device` and run a labelled forward pass.

    Args:
        model: classifier called as `model(input_ids, attention_mask=..., labels=...)`
            whose output exposes `.loss` and `.logits`.
        batch: mapping with 'input_ids', 'attention_mask' and 'label' tensors.
        device: `torch.device` the tensors are moved to.

    Returns:
        Tuple `(outputs, labels)`, with `labels` already on `device`.
    """
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    return outputs, labels


def _train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Returns:
        Tuple `(mean_batch_loss, accuracy)` as Python floats.

    Raises:
        ValueError: if `dataloader` yields no examples.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch in dataloader:
        optimizer.zero_grad()
        outputs, labels = _forward_batch(model, batch, device)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        preds = outputs.logits.argmax(dim=1)
        # .item() keeps the counters plain ints instead of device tensors
        n_correct += (preds == labels).sum().item()
        n_seen += labels.size(0)

    if n_seen == 0:
        raise ValueError("dataloader yielded no examples")
    return loss_sum / len(dataloader), n_correct / n_seen


def _evaluate(model, dataloader, device):
    """Score `model` on `dataloader` without tracking gradients.

    Returns:
        Tuple `(mean_batch_loss, accuracy, predictions, true_labels)` where the last
        two are flat lists of integer class ids in dataloader order.

    Raises:
        ValueError: if `dataloader` yields no examples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            outputs, labels = _forward_batch(model, batch, device)

            loss_sum += outputs.loss.item()
            preds = outputs.logits.argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    if n_seen == 0:
        raise ValueError("dataloader yielded no examples")
    return loss_sum / len(dataloader), n_correct / n_seen, predictions, true_labels


epochs = 5
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # Training pass
    train_loss, train_acc = _train_one_epoch(model, train_dataloader, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}")

    # Evaluation pass on the validation split
    eval_loss, eval_acc, predictions, true_labels = _evaluate(model, val_dataloader, device)

    # calculate evaluation metrics
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
    print(f"Epoch {epoch+1}/{epochs}: Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, Eval Loss: {eval_loss:.3f}, Eval Acc: {eval_acc:.3f}")



Epoch 1/5
----------
Epoch 1/5, Train Loss: 0.390, Train Acc: 0.830
Precision: 0.908, Recall: 0.919, F1: 0.914
Epoch 1/5: Train Loss: 0.390, Train Acc: 0.830, Eval Loss: 0.310, Eval Acc: 0.871
Epoch 2/5
----------




Epoch 2/5, Train Loss: 0.221, Train Acc: 0.912
Precision: 0.910, Recall: 0.933, F1: 0.921
Epoch 2/5: Train Loss: 0.221, Train Acc: 0.912, Eval Loss: 0.301, Eval Acc: 0.881
Epoch 3/5
----------




Epoch 3/5, Train Loss: 0.110, Train Acc: 0.961
Precision: 0.917, Recall: 0.924, F1: 0.921
Epoch 3/5: Train Loss: 0.110, Train Acc: 0.961, Eval Loss: 0.394, Eval Acc: 0.882
Epoch 4/5
----------




Epoch 4/5, Train Loss: 0.058, Train Acc: 0.982
Precision: 0.918, Recall: 0.922, F1: 0.920
Epoch 4/5: Train Loss: 0.058, Train Acc: 0.982, Eval Loss: 0.491, Eval Acc: 0.880
Epoch 5/5
----------




Epoch 5/5, Train Loss: 0.029, Train Acc: 0.991
Precision: 0.914, Recall: 0.929, F1: 0.921
Epoch 5/5: Train Loss: 0.029, Train Acc: 0.991, Eval Loss: 0.619, Eval Acc: 0.882
