#download data set

In [93]:
!pip install transformers



In [94]:
import pandas as pd
import numpy as np

In [95]:
setswana_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/setswana_tweets.csv'
sesotho_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/sesotho_tweets.csv'

setswana_df = pd.read_csv(setswana_url)
sesotho_df = pd.read_csv(sesotho_url)

In [96]:
setswana_df.shape

(3000, 3)

In [97]:
setswana_df.head()

Unnamed: 0,sentence,Final_Label,predict_name
0,@user lol o dramatic stocko se teng mo lwena mos,positive,Setswana
1,@user i m happy with my current piece job ausi,positive,Setswana
2,o ntate wane a tlang le mane o dieta tsa hae ...,positive,Setswana
3,ka dikuku my love,positive,Setswana
4,@user yeah i doubt much will be done ka kgang ...,negative,Setswana


In [98]:
setswana_df.shape

(3000, 3)

In [99]:
sesotho_df.head()

Unnamed: 0,sentence,Final_labels,predict_name
0,@user gwa tshwana rena ba bang a re kreye selo...,negative,Sesotho
1,e bata goal spurs,neutral,Sesotho
2,@user @user ke nahana taba eno ea ho batla ho ...,negative,multi
3,@user lotho hle empa fela ke ipotela,positive,Sesotho
4,@user @user keu utloa hantle,positive,Sesotho


#tokenize dataset

In [100]:
setswana_df['Final_Label'].unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [101]:
sesotho_df.head()

Unnamed: 0,sentence,Final_labels,predict_name
0,@user gwa tshwana rena ba bang a re kreye selo...,negative,Sesotho
1,e bata goal spurs,neutral,Sesotho
2,@user @user ke nahana taba eno ea ho batla ho ...,negative,multi
3,@user lotho hle empa fela ke ipotela,positive,Sesotho
4,@user @user keu utloa hantle,positive,Sesotho


In [102]:
sesotho_df.rename(columns={'Final_labels': 'Final_Label'}, inplace=True)

In [103]:
def label_to_int(x):
  if x == 'positive':
    return 1
  elif x == 'negative':
    return 0
  else:
    return 2

In [104]:
label_map = pd.DataFrame({
    'label': ['negative', 'positive', 'neutral'],
    'key': [0, 1, 2]
})

In [105]:
value_counts = setswana_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['label', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='label', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)

In [106]:
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1375
1,positive,1,829
2,neutral,2,796


In [107]:
value_counts = sesotho_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['label', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='label', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)

In [108]:
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1464
1,positive,1,953
2,neutral,2,583


In [109]:
setswana_df['Final_Label'] = [label_to_int(x) for x in setswana_df['Final_Label']]
sesotho_df['Final_Label'] = [label_to_int(x) for x in sesotho_df['Final_Label']]

#tokenize using subword tokenization

In [110]:
from transformers import XLMRobertaTokenizer

In [111]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

In [112]:
setswana_df['tokens'] = setswana_df['sentence'].apply(lambda x: tokenizer.tokenize(x))
sesotho_df['tokens'] = sesotho_df['sentence'].apply(lambda x: tokenizer.tokenize(x))

In [113]:
setswana_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,tokens
0,@user lol o dramatic stocko se teng mo lwena mos,1,Setswana,"[▁@, user, ▁lol, ▁o, ▁dramatic, ▁stock, o, ▁se..."
1,@user i m happy with my current piece job ausi,1,Setswana,"[▁@, user, ▁i, ▁m, ▁happy, ▁with, ▁my, ▁curren..."
2,o ntate wane a tlang le mane o dieta tsa hae ...,1,Setswana,"[▁o, ▁n, tate, ▁wa, ne, ▁a, ▁t, lang, ▁le, ▁ma..."
3,ka dikuku my love,1,Setswana,"[▁ka, ▁di, ku, ku, ▁my, ▁love]"
4,@user yeah i doubt much will be done ka kgang ...,0,Setswana,"[▁@, user, ▁yeah, ▁i, ▁doubt, ▁much, ▁will, ▁b..."


In [114]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,tokens
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,"[▁@, user, ▁gwa, ▁t, shwa, na, ▁rena, ▁ba, ▁ba..."
1,e bata goal spurs,2,Sesotho,"[▁e, ▁bata, ▁goal, ▁, spur, s]"
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,"[▁@, user, ▁@, user, ▁ke, ▁na, hana, ▁taba, ▁e..."
3,@user lotho hle empa fela ke ipotela,1,Sesotho,"[▁@, user, ▁lot, ho, ▁, hle, ▁emp, a, ▁fel, a,..."
4,@user @user keu utloa hantle,1,Sesotho,"[▁@, user, ▁@, user, ▁ke, u, ▁ut, lo, a, ▁han,..."


#process tokens

In [115]:
setswana_encoding = tokenizer(setswana_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt',max_length=64)
sesotho_encoding = tokenizer(sesotho_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

In [116]:
setswana_encoding

{'input_ids': tensor([[    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,    36,   653,  ...,     1,     1,     1],
        ...,
        [    0,  1777,   497,  ...,     1,     1,     1],
        [    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,    79,  9227,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [117]:
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
import torch
from sklearn.model_selection import train_test_split



In [118]:
def train_val_dataloader(encodings,data_labels):
  input_ids = encodings
  labels = data_labels

  dataset = TensorDataset(input_ids, labels)
  train_size = int(0.8 * len(dataset))
  val_size = len(dataset) - train_size

  train_ds, val_ds = random_split(dataset, [train_size, val_size])

  train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
  val_dl = DataLoader(val_ds, batch_size=32)

  return train_dl, val_dl

#Create neural model

In [119]:
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(TweetClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim * 64, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)              # (batch_size, seq_len, embed_dim)
        flat = embedded.view(x.size(0), -1)       # Flatten: (batch_size, seq_len * embed_dim)
        out = F.relu(self.fc1(flat))              # (batch_size, hidden_dim)
        return self.fc2(out)                      # (batch_size, output_dim)


In [120]:
model = TweetClassifier(vocab_size=tokenizer.vocab_size, embed_dim=64, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [121]:
train_dl, val_dl = train_val_dataloader(setswana_encoding['input_ids'], torch.tensor(setswana_df['Final_Label'].tolist()))

#evaluate model

In [122]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [123]:
def train_eval_model(model, epochs,train_dl,val_dl):
    for epoch in range(epochs):
      model.train()
      total_loss = 0

      for xb, yb, in train_dl:
          preds = model(xb)
          loss = loss_fn(preds, yb)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          total_loss += loss.item()

      print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    model.eval()
    correct, total = 0, 0

    predictions = []
    true_labels = []

    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb)
            predicted = torch.argmax(preds, dim=1)
            correct += (predicted == yb).sum().item()
            total += yb.size(0)

            predictions.extend(predicted)
            true_labels.extend(yb)


    print(f"Validation Accuracy: {correct / total:.2%}")

    return predictions, true_labels


In [124]:
predictions, true_labels = train_eval_model(model,1,train_dl,val_dl)

Epoch 1, Loss: 85.5257
Validation Accuracy: 47.00%


In [125]:
predictions = np.array(predictions)
true_labels = np.array(true_labels)

In [126]:
accuracy = accuracy_score(true_labels, predictions)
micro_precision = precision_score(true_labels, predictions, average='micro')
micro_recall = recall_score(true_labels, predictions, average='micro')
micro_f1 = f1_score(true_labels, predictions, average='micro')

print(f"Accuracy: {accuracy:.4%}")
print(f"Micro-average Precision: {micro_precision:.4%}")
print(f"Micro-average Recall: {micro_recall:.4%}")
print(f"Micro-average F1-score: {micro_f1:.4%}")

Accuracy: 47.0000%
Micro-average Precision: 47.0000%
Micro-average Recall: 47.0000%
Micro-average F1-score: 47.0000%


In [127]:

macro_precision = precision_score(true_labels, predictions, average='macro', zero_division=0)
macro_recall = recall_score(true_labels, predictions, average='macro', zero_division=0)
macro_f1 = f1_score(true_labels, predictions, average='macro', zero_division=0)

print(f"Accuracy: {accuracy:.4%}")
print(f"Macro-average Precision: {macro_precision:.4%}")
print(f"Macro-average Recall: {macro_recall:.4%}")
print(f"Macro-average F1-score: {macro_f1:.4%}")

Accuracy: 47.0000%
Macro-average Precision: 29.4907%
Macro-average Recall: 37.6071%
Macro-average F1-score: 30.4885%


In [128]:
weighted_precision = precision_score(true_labels, predictions, average='weighted', zero_division=0)
weighted_recall = recall_score(true_labels, predictions, average='weighted', zero_division=0)
weighted_f1 = f1_score(true_labels, predictions, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4%}")
print(f"Weighted Precision: {weighted_precision:.4%}")
print(f"Weighted Recall: {weighted_recall:.4%}")
print(f"Weighted F1-score: {weighted_f1:.4%}")

Accuracy: 47.0000%
Weighted Precision: 32.5307%
Weighted Recall: 47.0000%
Weighted F1-score: 36.1053%


#Word tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')



tokenized = [word_tokenize(s.lower()) for s in setswana_df['sentence'].tolist()]

# Build a vocabulary
vocab = {"<pad>": 0, "<unk>": 1}
for sent in tokenized:
    for word in sent:
        if word not in vocab:
            vocab[word] = len(vocab)


In [None]:
SEQ_LEN = 64

def encode(sent):
    tokens = word_tokenize(sent.lower())
    token_ids = [vocab.get(tok, vocab["<unk>"]) for tok in tokens]
    if len(token_ids) < SEQ_LEN:
        token_ids += [vocab["<pad>"]] * (SEQ_LEN - len(token_ids))
    else:
        token_ids = token_ids[:SEQ_LEN]
    return token_ids



In [None]:
class TweetDataset():
    def __init__(self, texts, labels):
        self.texts = texts
        self.encoded_texts = torch.tensor([encode(t) for t in texts])
        self.labels = labels

In [None]:
dataset = TweetDataset(setswana_df['sentence'].tolist(), setswana_df['Final_Label'].tolist())

In [None]:
dataset.texts[:5]

In [None]:
dataset.encoded_texts[:5]

In [None]:
model = TweetClassifier(vocab_size=len(vocab), embed_dim=64, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
train_dl, val_dl = train_val_dataloader(dataset.encoded_texts, torch.tensor(setswana_df['Final_Label'].tolist()))

In [None]:
train_eval_model(model,10,train_dl,val_dl)

# STF-idf

In [163]:
sesotho_df

Unnamed: 0,sentence,Final_Label,predict_name,tokens
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,"[▁@, user, ▁gwa, ▁t, shwa, na, ▁rena, ▁ba, ▁ba..."
1,e bata goal spurs,2,Sesotho,"[▁e, ▁bata, ▁goal, ▁, spur, s]"
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,"[▁@, user, ▁@, user, ▁ke, ▁na, hana, ▁taba, ▁e..."
3,@user lotho hle empa fela ke ipotela,1,Sesotho,"[▁@, user, ▁lot, ho, ▁, hle, ▁emp, a, ▁fel, a,..."
4,@user @user keu utloa hantle,1,Sesotho,"[▁@, user, ▁@, user, ▁ke, u, ▁ut, lo, a, ▁han,..."
...,...,...,...,...
2995,@user @user ntho diqala jwalo rumoursthe knes ...,0,Sesotho,"[▁@, user, ▁@, user, ▁n, tho, ▁di, qala, ▁j, w..."
2996,@user iΓö£┬╗Γö¼Γò£m on leave ntse ke sheba hen...,0,Sesotho,"[▁@, user, ▁i, Γ, ö, £, ┬, ╗, Γ, ö, 1⁄4, Γ, ò,..."
2997,he must come to botswana a re thuse that bogus...,0,multi,"[▁he, ▁must, ▁come, ▁to, ▁bot, s, wana, ▁a, ▁r..."
2998,@user wa bona ea hlokomela something fishy moh...,0,Sesotho,"[▁@, user, ▁wa, ▁bona, ▁ea, ▁h, loko, mela, ▁s..."


In [164]:
from collections import defaultdict
import math

def char_ngrams(text, n=3):
    # Basic character n-gram generation
    text = '#' * (n - 1) + text + '#' * (n - 1) # Padding for start/end n-grams
    return [text[i:i+n] for i in range(len(text) - n + 1)]

documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Example with character 3-grams
subword_documents = [char_ngrams(doc.lower(), n=3) for doc in documents]
print(subword_documents)

[['##t', '#th', 'thi', 'his', 'is ', 's i', ' is', 'is ', 's t', ' th', 'the', 'he ', 'e f', ' fi', 'fir', 'irs', 'rst', 'st ', 't d', ' do', 'doc', 'ocu', 'cum', 'ume', 'men', 'ent', 'nt.', 't.#', '.##'], ['##t', '#th', 'thi', 'his', 'is ', 's d', ' do', 'doc', 'ocu', 'cum', 'ume', 'men', 'ent', 'nt ', 't i', ' is', 'is ', 's t', ' th', 'the', 'he ', 'e s', ' se', 'sec', 'eco', 'con', 'ond', 'nd ', 'd d', ' do', 'doc', 'ocu', 'cum', 'ume', 'men', 'ent', 'nt.', 't.#', '.##'], ['##a', '#an', 'and', 'nd ', 'd t', ' th', 'thi', 'his', 'is ', 's i', ' is', 'is ', 's t', ' th', 'the', 'he ', 'e t', ' th', 'thi', 'hir', 'ird', 'rd ', 'd o', ' on', 'one', 'ne.', 'e.#', '.##'], ['##i', '#is', 'is ', 's t', ' th', 'thi', 'his', 'is ', 's t', ' th', 'the', 'he ', 'e f', ' fi', 'fir', 'irs', 'rst', 'st ', 't d', ' do', 'doc', 'ocu', 'cum', 'ume', 'men', 'ent', 'nt?', 't?#', '?##']]


In [165]:
subwords_tokenised_documnets = [" ".join(s) for s in sesotho_df['tokens']]
subwords_tokenised_documnets[0:3]

['▁@ user ▁gwa ▁t shwa na ▁rena ▁ba ▁bang ▁a ▁re ▁kre ye ▁se lo ▁mos',
 '▁e ▁bata ▁goal ▁ spur s',
 '▁@ user ▁@ user ▁ke ▁na hana ▁taba ▁en o ▁ea ▁ho ▁bat la ▁ho ▁khe tha ▁ho bane ▁re ▁she bile ▁our ▁own ▁benefits ▁re ▁le ▁bat ho ▁ke ▁e ona ▁e ▁sent seng ▁polo tik i ▁ea ▁les ot ho ▁so ▁we ▁went ▁behind ▁as sho les ▁for ▁years ▁co z ▁re ▁she bile ▁mele mo ▁e ▁direct ▁rele ▁bat ho ▁a ▁ee a ▁sebe let sa ▁n']

In [166]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

# Using character n-grams for TfidfVectorizer directly
stfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2) # Example char 2-4-grams, min_df to filter rare ones

stf_idf_matrix = stfidf_vectorizer.fit_transform(subwords_tokenised_documnets)
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())



# Convert to PyTorch tensor
stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)
print(stf_idf_tensor.shape)
print(stf_idf_vocab_size)


torch.Size([3000, 8887])
8887


In [167]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(setswana_df['Final_Label'].tolist()))

In [168]:
class STFIDFClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(STFIDFClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, output_dim)


    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = F.relu(self.fc3(out))             # (batch_size, hidden_dim)
        return self.fc4(out)                      # (batch_size, output_dim)


In [169]:
model = STFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [170]:
predictions, true_labels = train_eval_model(model,20,train_dl,val_dl)

Epoch 1, Loss: 80.4579
Epoch 2, Loss: 74.9905
Epoch 3, Loss: 60.3311
Epoch 4, Loss: 37.3955
Epoch 5, Loss: 20.8297
Validation Accuracy: 33.33%


In [171]:
predictions = np.array(predictions)
true_labels = np.array(true_labels)

In [172]:
accuracy = accuracy_score(true_labels, predictions)
micro_precision = precision_score(true_labels, predictions, average='micro')
micro_recall = recall_score(true_labels, predictions, average='micro')
micro_f1 = f1_score(true_labels, predictions, average='micro')

print(f"Accuracy: {accuracy:.4%}")
print(f"Micro-average Precision: {micro_precision:.4%}")
print(f"Micro-average Recall: {micro_recall:.4%}")
print(f"Micro-average F1-score: {micro_f1:.4%}")

Accuracy: 33.3333%
Micro-average Precision: 33.3333%
Micro-average Recall: 33.3333%
Micro-average F1-score: 33.3333%


In [173]:
macro_precision = precision_score(true_labels, predictions, average='macro', zero_division=0)
macro_recall = recall_score(true_labels, predictions, average='macro', zero_division=0)
macro_f1 = f1_score(true_labels, predictions, average='macro', zero_division=0)

print(f"Accuracy: {accuracy:.4%}")
print(f"Macro-average Precision: {macro_precision:.4%}")
print(f"Macro-average Recall: {macro_recall:.4%}")
print(f"Macro-average F1-score: {macro_f1:.4%}")

Accuracy: 33.3333%
Macro-average Precision: 30.7095%
Macro-average Recall: 30.7401%
Macro-average F1-score: 30.6349%
