#download data set

In [11]:
!pip install transformers



In [12]:
import pandas as pd
import numpy as np

In [13]:
setswana_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/setswana_tweets.csv'
sesotho_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/sesotho_tweets.csv'

setswana_df = pd.read_csv(setswana_url)
sesotho_df = pd.read_csv(sesotho_url)

#tokenize dataset

In [14]:
setswana_df['Final_Label'].unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [15]:
sesotho_df.rename(columns={'Final_labels': 'Final_Label'}, inplace=True)

In [16]:
def label_to_int(x):
  if x == 'positive':
    return 1
  elif x == 'negative':
    return 0
  else:
    return 2


In [17]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name
0,@user gwa tshwana rena ba bang a re kreye selo...,negative,Sesotho
1,e bata goal spurs,neutral,Sesotho
2,@user @user ke nahana taba eno ea ho batla ho ...,negative,multi
3,@user lotho hle empa fela ke ipotela,positive,Sesotho
4,@user @user keu utloa hantle,positive,Sesotho


In [18]:
setswana_df['Final_Label'] = [label_to_int(x) for x in setswana_df['Final_Label']]
sesotho_df['Final_Label'] = [label_to_int(x) for x in sesotho_df['Final_Label']]

In [19]:
from transformers import XLMRobertaTokenizer

In [20]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [21]:
setswana_df['tokens'] = setswana_df['sentence'].apply(lambda x: tokenizer.tokenize(x))
sesotho_df['tokens'] = sesotho_df['sentence'].apply(lambda x: tokenizer.tokenize(x))

In [22]:
setswana_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,tokens
0,@user lol o dramatic stocko se teng mo lwena mos,1,Setswana,"[▁@, user, ▁lol, ▁o, ▁dramatic, ▁stock, o, ▁se..."
1,@user i m happy with my current piece job ausi,1,Setswana,"[▁@, user, ▁i, ▁m, ▁happy, ▁with, ▁my, ▁curren..."
2,o ntate wane a tlang le mane o dieta tsa hae ...,1,Setswana,"[▁o, ▁n, tate, ▁wa, ne, ▁a, ▁t, lang, ▁le, ▁ma..."
3,ka dikuku my love,1,Setswana,"[▁ka, ▁di, ku, ku, ▁my, ▁love]"
4,@user yeah i doubt much will be done ka kgang ...,0,Setswana,"[▁@, user, ▁yeah, ▁i, ▁doubt, ▁much, ▁will, ▁b..."


In [23]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,tokens
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,"[▁@, user, ▁gwa, ▁t, shwa, na, ▁rena, ▁ba, ▁ba..."
1,e bata goal spurs,2,Sesotho,"[▁e, ▁bata, ▁goal, ▁, spur, s]"
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,"[▁@, user, ▁@, user, ▁ke, ▁na, hana, ▁taba, ▁e..."
3,@user lotho hle empa fela ke ipotela,1,Sesotho,"[▁@, user, ▁lot, ho, ▁, hle, ▁emp, a, ▁fel, a,..."
4,@user @user keu utloa hantle,1,Sesotho,"[▁@, user, ▁@, user, ▁ke, u, ▁ut, lo, a, ▁han,..."


#process tokens

In [41]:
setswana_encoding = tokenizer(setswana_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt',max_length=64)
sesotho_encoding = tokenizer(sesotho_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

In [42]:
setswana_encoding

{'input_ids': tensor([[    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,    36,   653,  ...,     1,     1,     1],
        ...,
        [    0,  1777,   497,  ...,     1,     1,     1],
        [    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,    79,  9227,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [43]:
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch

input_ids = setswana_encoding['input_ids']
attention_mask = setswana_encoding['attention_mask']
labels = torch.tensor(setswana_df['Final_Label'].tolist())



In [44]:
from sklearn.model_selection import train_test_split

setswana_dataset = TensorDataset(input_ids, labels)
train_size = int(0.8 * len(setswana_dataset))
val_size = len(setswana_dataset) - train_size

train_ds, val_ds = random_split(setswana_dataset, [train_size, val_size])

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32)

#Create neural model

In [45]:
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(TweetClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim * 64, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)              # (batch_size, seq_len, embed_dim)
        flat = embedded.view(x.size(0), -1)       # Flatten: (batch_size, seq_len * embed_dim)
        out = F.relu(self.fc1(flat))              # (batch_size, hidden_dim)
        return self.fc2(out)                      # (batch_size, output_dim)


In [46]:
model = TweetClassifier(vocab_size=tokenizer.vocab_size, embed_dim=64, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [47]:
for epoch in range(5):
    model.train()
    total_loss = 0

    for xb, yb, in train_dl:
        preds = model(xb)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 79.4898
Epoch 2, Loss: 68.7145
Epoch 3, Loss: 63.5615
Epoch 4, Loss: 56.1852
Epoch 5, Loss: 48.3674


#evaluate model

In [48]:
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for xb, yb in val_dl:
        preds = model(xb)
        predicted = torch.argmax(preds, dim=1)
        correct += (predicted == yb).sum().item()
        total += yb.size(0)

print(f"Validation Accuracy: {correct / total:.2%}")


Validation Accuracy: 46.00%
