# Necessary Imports


In [1]:
# Install the Hugging Face `transformers` package that the tokenizer, model and optimizer are imported from.
# NOTE(review): neither package is version-pinned, so a fresh run may resolve to
# different releases than the ones that produced the saved outputs — consider pinning.
%pip install transformers --quiet
%pip install sentencepiece --quiet  # required dependency for AfriBERTA model

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, f1_score

# Prefer a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Setup data

In [3]:
%git clone https://github.com/afrisenti-semeval/afrisent-semeval-2023.git

Cloning into 'afrisent-semeval-2023'...
remote: Enumerating objects: 890, done.[K
remote: Counting objects: 100% (327/327), done.[K
remote: Compressing objects: 100% (178/178), done.[K
remote: Total 890 (delta 170), reused 285 (delta 140), pack-reused 563[K
Receiving objects: 100% (890/890), 24.04 MiB | 19.92 MiB/s, done.
Resolving deltas: 100% (426/426), done.


# Custom Dataset class


In [4]:
class TweetDataset(Dataset):
  """Tab-separated tweet corpus exposed as a PyTorch ``Dataset`` of tokenized examples.

  The file at ``file_path`` is read with pandas and must contain a ``tweet``
  column (the text) and a ``label`` column (the sentiment string). Labels are
  binarised on load: ``"negative"`` becomes 0 and every other value becomes 1.

  Args:
    file_path: Path to the tab-separated file.
    tokenizer: Callable tokenizer in the Hugging Face style; it is invoked as
      ``tokenizer(text, ...)`` and must return a mapping with ``input_ids``
      and ``attention_mask`` tensors.
    max_length: Length every example is truncated and padded to. Defaults to
      512, the value that was previously hard-coded.
  """

  def __init__(self, file_path, tokenizer, max_length=512):
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.data = pd.read_csv(file_path, sep='\t')

    # transform the labels into binary integers (0 or 1), using custom-defined function
    self.data["label"] = self.data["label"].apply(self._convert_label)

  def _convert_label(self, sentiment):
    """Map a sentiment string to 0 (``"negative"``) or 1 (any other value)."""
    return 0 if sentiment == "negative" else 1

  def __len__(self):
    """Number of rows (tweets) in the loaded file."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the ``idx``-th example as a dict of 1-D tensors.

    Keys: ``input_ids`` and ``attention_mask`` (each of length ``max_length``)
    and ``label`` (a scalar ``torch.long`` tensor).
    """
    # Positional access: a Dataset index is a row position, not an index label.
    tweet = str(self.data['tweet'].iloc[idx])
    label = int(self.data['label'].iloc[idx])

    # Tokenize the tweet. Padding every example to the same length lets the
    # default DataLoader collate function stack them into a batch.
    encoding = self.tokenizer(
      tweet,
      add_special_tokens=True,
      truncation=True,
      max_length=self.max_length,
      return_token_type_ids=False,
      padding='max_length',    # replaces the deprecated pad_to_max_length=True
      return_attention_mask=True,
      return_tensors='pt'
    )

    # return_tensors='pt' yields a leading batch dimension of 1; flatten drops it.
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }



In [5]:
# Set the random seed for reproducibility.
# Only PyTorch's global generator is seeded here; Python's `random` module and NumPy are not.
seed_val = 42
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7f8ecb866d70>

# Data Loading

In [6]:
# Identifier of the pretrained checkpoint; the tokenizer below is loaded from it.
PRETRAINED_MODEL = "castorini/afriberta_large"

# Location of the data inside the cloned repository. The split paths are derived
# from these two values, so switching subset folder only requires editing LANGUAGE.
DATA_ROOT = 'afrisent-semeval-2023/data'
LANGUAGE = 'pcm'

train_data_path = f'{DATA_ROOT}/{LANGUAGE}/train.tsv'
test_data_path = f'{DATA_ROOT}/{LANGUAGE}/test.tsv'
val_data_path = f'{DATA_ROOT}/{LANGUAGE}/dev.tsv'

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



In [7]:
# Build one TweetDataset per split (train, test, validation); all three share the same tokenizer.
train_data, test_data, val_data = (
    TweetDataset(split_path, tokenizer)
    for split_path in (train_data_path, test_data_path, val_data_path)
)

In [8]:
# define data loaders
# One batch size shared by every split, kept in a single place instead of three literals.
BATCH_SIZE = 16

# Only the training split is shuffled; validation and test keep file order.
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Defining the Model

In [None]:
# Load the pretrained checkpoint as a sequence classifier configured for two labels,
# then move its parameters onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=2,
)
model.to(device)

In [10]:
# set optimizer and loss function
# NOTE(review): the AdamW class exported by `transformers` is deprecated upstream in
# favour of `torch.optim.AdamW`; it is kept here because `correct_bias=False` has no
# direct equivalent in the torch implementation — confirm before migrating.
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)

# The training loop uses the loss returned by the model (`outputs.loss`); this
# criterion only backs the commented-out alternative `criterion(logits, labels)`.
# CrossEntropyLoss accepts (batch, 2) logits with integer class labels, which is what
# the dataset yields. BCEWithLogitsLoss expects float targets shaped like the logits
# and would raise on those inputs.
criterion = torch.nn.CrossEntropyLoss()



In [11]:
# train the model
epochs = 5


def _forward_batch(model, batch, device):
    """Move one batch to `device`, run the model on it, and return (loss, logits, labels)."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    # Passing `labels` makes the model compute and return its own loss.
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    return outputs.loss, outputs.logits, labels


def _train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over `dataloader`; return (mean batch loss, accuracy)."""
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch in dataloader:
        optimizer.zero_grad()
        loss, logits, labels = _forward_batch(model, batch, device)
        loss.backward()
        optimizer.step()

        # Accumulate plain Python numbers so no tensors are kept alive across batches.
        loss_sum += loss.item()
        preds = logits.argmax(dim=1)
        n_correct += (preds == labels).sum().item()
        n_seen += labels.size(0)

    return loss_sum / len(dataloader), n_correct / n_seen


def _evaluate(model, dataloader, device):
    """Score `model` on `dataloader` with gradient tracking disabled.

    Returns (mean batch loss, accuracy, predictions, true_labels), where the last
    two are flat lists of integer class ids in dataloader order.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            loss, logits, labels = _forward_batch(model, batch, device)

            loss_sum += loss.item()
            preds = logits.argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    return loss_sum / len(dataloader), n_correct / n_seen, predictions, true_labels


for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # Training pass
    train_loss, train_acc = _train_one_epoch(model, train_dataloader, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}")

    # Evaluation pass on the validation split
    eval_loss, eval_acc, predictions, true_labels = _evaluate(model, val_dataloader, device)

    # calculate evaluation metrics (class 1 is treated as the positive class by default)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
    print(f"Epoch {epoch+1}/{epochs}: Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, Eval Loss: {eval_loss:.3f}, Eval Acc: {eval_acc:.3f}")



Epoch 1/5
----------
Epoch 1/5, Train Loss: 0.629, Train Acc: 0.651
Precision: 0.744, Recall: 0.274, F1: 0.400
Epoch 1/5: Train Loss: 0.629, Train Acc: 0.651, Eval Loss: 0.581, Eval Acc: 0.700
Epoch 2/5
----------




Epoch 2/5, Train Loss: 0.474, Train Acc: 0.786
Precision: 0.696, Recall: 0.513, F1: 0.590
Epoch 2/5: Train Loss: 0.474, Train Acc: 0.786, Eval Loss: 0.580, Eval Acc: 0.740
Epoch 3/5
----------




Epoch 3/5, Train Loss: 0.283, Train Acc: 0.893
Precision: 0.687, Recall: 0.562, F1: 0.618
Epoch 3/5: Train Loss: 0.283, Train Acc: 0.893, Eval Loss: 0.688, Eval Acc: 0.746
Epoch 4/5
----------




Epoch 4/5, Train Loss: 0.156, Train Acc: 0.950
Precision: 0.651, Recall: 0.605, F1: 0.627
Epoch 4/5: Train Loss: 0.156, Train Acc: 0.950, Eval Loss: 0.744, Eval Acc: 0.737
Epoch 5/5
----------




Epoch 5/5, Train Loss: 0.101, Train Acc: 0.968
Precision: 0.709, Recall: 0.521, F1: 0.601
Epoch 5/5: Train Loss: 0.101, Train Acc: 0.968, Eval Loss: 0.966, Eval Acc: 0.747
