# Necessary Imports


In [1]:
# Install the third-party packages this notebook needs into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may resolve to newer releases
# than the ones these recorded outputs were produced with — consider pinning.
%pip install transformers --quiet
%pip install sentencepiece --quiet  # required dependency for AfriBERTA model

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, f1_score

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Setup data

In [3]:
%git clone https://github.com/afrisenti-semeval/afrisent-semeval-2023.git

Cloning into 'afrisent-semeval-2023'...
remote: Enumerating objects: 890, done.[K
remote: Counting objects: 100% (319/319), done.[K
remote: Compressing objects: 100% (177/177), done.[K
remote: Total 890 (delta 162), reused 278 (delta 133), pack-reused 571[K
Receiving objects: 100% (890/890), 24.04 MiB | 13.34 MiB/s, done.
Resolving deltas: 100% (426/426), done.


# Custom Dataset class


In [4]:
class TweetDataset(Dataset):
  """Tab-separated tweet file exposed as a binary-sentiment torch ``Dataset``.

  The file is read with pandas (``sep='\\t'``) and must provide a ``tweet``
  column and a ``label`` column. Labels are collapsed to integers:
  ``"negative"`` becomes 0 and every other value becomes 1.

  NOTE(review): any non-"negative" label (e.g. a "neutral" class, if the data
  contains one) is mapped to 1 — confirm this is the intended binarisation.

  Args:
    file_path: path to the TSV file to load.
    tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that accepts
      the keyword arguments used in ``__getitem__`` and returns a mapping with
      ``input_ids`` and ``attention_mask`` tensors.
    max_length: length every example is truncated/padded to. Defaults to 512.
  """

  def __init__(self, file_path, tokenizer, max_length=512):
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.data = pd.read_csv(file_path, sep='\t')

    # transform the string labels into binary integers (0 or 1)
    self.data["label"] = self.data["label"].apply(self._convert_label)

  def _convert_label(self, sentiment):
    """Return 0 for the string "negative" and 1 for anything else."""
    return 0 if sentiment == "negative" else 1

  def __len__(self):
    """Number of rows (tweets) in the loaded file."""
    return len(self.data)

  def __getitem__(self, idx):
    """Tokenize the tweet at position ``idx``.

    Returns:
      dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
      tokenizer output flattened) and a scalar ``label`` tensor of dtype long.
    """
    # Positional access: DataLoader samplers yield positions, not index labels.
    tweet = str(self.data['tweet'].iloc[idx])
    label = int(self.data['label'].iloc[idx])

    # Calling the tokenizer directly with `padding='max_length'` replaces the
    # deprecated `encode_plus(..., pad_to_max_length=True)` spelling.
    encoding = self.tokenizer(
      tweet,
      add_special_tokens=True,
      truncation=True,
      max_length=self.max_length,
      padding='max_length',
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt'
    )

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }



In [5]:
# Set the random seed for reproducibility.
# torch.manual_seed returns the global Generator object, which is why this cell
# displays a `torch._C.Generator` repr as its output.
seed_val = 42
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7f47578bdd70>

# Data Loading

In [6]:
# Checkpoint name shared by the tokenizer and the classifier
PRETRAINED_MODEL = "castorini/afriberta_large"

# Train / dev / test TSV files inside the cloned data repository
train_data_path, val_data_path, test_data_path = (
    'afrisent-semeval-2023/data/hau/train.tsv',
    'afrisent-semeval-2023/data/hau/dev.tsv',
    'afrisent-semeval-2023/data/hau/test.tsv',
)

# Tokenizer matching the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



In [7]:
# Build one TweetDataset per split, all sharing the same tokenizer
train_data, val_data, test_data = (
    TweetDataset(split_path, tokenizer)
    for split_path in (train_data_path, val_data_path, test_data_path)
)

In [8]:
# Batch the splits: only the training split is shuffled, the evaluation
# splits keep file order so predictions line up with the rows.
train_dataloader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
val_dataloader, test_dataloader = (
    DataLoader(dataset=eval_split, batch_size=16, shuffle=False)
    for eval_split in (val_data, test_data)
)

### Defining the accuracy function

# Defining the Model

In [9]:
# Initialize the model: load the pretrained checkpoint with a sequence-classification
# head sized for two labels (num_labels=2), then move it to the selected device.
# The head's weights are newly initialized (see the loader warning in the output),
# so the model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)
# Last expression of the cell: its return value is what renders the module summary below.
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/503M [00:00<?, ?B/s]

Some weights of the model checkpoint at castorini/afriberta_large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at castorini/afriberta_large and are newly initialized: ['classifier.dense.bias', 'classifie

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(70006, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-9): 10 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, b

In [10]:
# set optimizer and loss function
# NOTE(review): `AdamW` here is the class imported from `transformers`, and
# `correct_bias` is an argument of that implementation; the call is kept as-is so
# the optimisation behaviour is unchanged — confirm it exists in the installed version.
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)

# The model is built with num_labels=2 and the dataset yields integer class ids
# (dtype long), so cross-entropy over the two logits is the matching criterion.
# BCEWithLogitsLoss expects float targets with the same shape as the logits and
# would raise on `criterion(logits, labels)` with this setup.
criterion = torch.nn.CrossEntropyLoss()



In [11]:
# train the model
def _forward_batch(model, batch, device):
    """Move one batch to `device`, run the model with labels, and pick the top class.

    Args:
        model: classifier whose output exposes `.loss` and `.logits`.
        batch: dict with 'input_ids', 'attention_mask' and 'label' tensors.
        device: torch device the tensors are moved to.

    Returns:
        (loss, preds, labels): the loss tensor reported by the model, the
        predicted class index per example, and the labels on `device`.
    """
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    _, preds = torch.max(outputs.logits, dim=1)
    return outputs.loss, preds, labels


def train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Returns:
        (mean_loss, accuracy) as Python floats: the loss averaged over batches
        and the fraction of correctly classified examples. Both are 0.0 when the
        loader yields no batches.
    """
    model.train()
    running_loss, n_correct, n_seen = 0.0, 0, 0

    for batch in dataloader:
        optimizer.zero_grad()
        loss, preds, labels = _forward_batch(model, batch, device)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # .item() keeps the counter a plain int instead of a device tensor
        n_correct += (preds == labels).sum().item()
        n_seen += labels.size(0)

    return running_loss / max(len(dataloader), 1), n_correct / max(n_seen, 1)


def evaluate(model, dataloader, device):
    """Score `model` on `dataloader` without computing gradients.

    Returns:
        (mean_loss, accuracy, predictions, true_labels): the first two are
        Python floats; the last two are flat lists with one entry per example,
        in loader order.
    """
    model.eval()
    running_loss, n_correct, n_seen = 0.0, 0, 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            loss, preds, labels = _forward_batch(model, batch, device)

            running_loss += loss.item()
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    mean_loss = running_loss / max(len(dataloader), 1)
    accuracy = n_correct / max(n_seen, 1)
    return mean_loss, accuracy, predictions, true_labels


# NOTE(review): the recorded run shows eval loss rising from epoch 2 onwards while
# train loss keeps falling — consider keeping the checkpoint with the best
# validation score rather than the last one.
epochs = 5
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # Training pass
    train_loss, train_acc = train_one_epoch(model, train_dataloader, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}")

    # Validation pass
    eval_loss, eval_acc, predictions, true_labels = evaluate(model, val_dataloader, device)

    # calculate evaluation metrics
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
    print(f"Epoch {epoch+1}/{epochs}: Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, Eval Loss: {eval_loss:.3f}, Eval Acc: {eval_acc:.3f}")



Epoch 1/5
----------
Epoch 1/5, Train Loss: 0.418, Train Acc: 0.806
Precision: 0.840, Recall: 0.950, F1: 0.891
Epoch 1/5: Train Loss: 0.418, Train Acc: 0.806, Eval Loss: 0.353, Eval Acc: 0.846
Epoch 2/5
----------




Epoch 2/5, Train Loss: 0.251, Train Acc: 0.900
Precision: 0.854, Recall: 0.933, F1: 0.892
Epoch 2/5: Train Loss: 0.251, Train Acc: 0.900, Eval Loss: 0.382, Eval Acc: 0.849
Epoch 3/5
----------




Epoch 3/5, Train Loss: 0.130, Train Acc: 0.953
Precision: 0.879, Recall: 0.893, F1: 0.886
Epoch 3/5: Train Loss: 0.130, Train Acc: 0.953, Eval Loss: 0.487, Eval Acc: 0.847
Epoch 4/5
----------




Epoch 4/5, Train Loss: 0.059, Train Acc: 0.980
Precision: 0.904, Recall: 0.834, F1: 0.868
Epoch 4/5: Train Loss: 0.059, Train Acc: 0.980, Eval Loss: 0.653, Eval Acc: 0.830
Epoch 5/5
----------




Epoch 5/5, Train Loss: 0.035, Train Acc: 0.989
Precision: 0.894, Recall: 0.869, F1: 0.881
Epoch 5/5: Train Loss: 0.035, Train Acc: 0.989, Eval Loss: 0.740, Eval Acc: 0.844
