# Download the dataset

In [49]:
!pip install transformers



In [50]:
import pandas as pd
import numpy as np

In [51]:
# Setswana tweets: raw CSV from the SAfriSenti corpus repository.
setswana_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/setswana_tweets.csv'
setswana_df = pd.read_csv(setswana_url)

# Sesotho tweets: same repository, separate file.
sesotho_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/sesotho_tweets.csv'
sesotho_df = pd.read_csv(sesotho_url)

In [52]:
setswana_df.shape

(3000, 3)

In [53]:
setswana_df.head()

Unnamed: 0,sentence,Final_Label,predict_name
0,@user lol o dramatic stocko se teng mo lwena mos,positive,Setswana
1,@user i m happy with my current piece job ausi,positive,Setswana
2,o ntate wane a tlang le mane o dieta tsa hae ...,positive,Setswana
3,ka dikuku my love,positive,Setswana
4,@user yeah i doubt much will be done ka kgang ...,negative,Setswana


In [54]:
# Shape of the Sesotho frame (the Setswana shape is already displayed earlier).
sesotho_df.shape

(3000, 3)

In [55]:
sesotho_df.head()

Unnamed: 0,sentence,Final_labels,predict_name
0,@user gwa tshwana rena ba bang a re kreye selo...,negative,Sesotho
1,e bata goal spurs,neutral,Sesotho
2,@user @user ke nahana taba eno ea ho batla ho ...,negative,multi
3,@user lotho hle empa fela ke ipotela,positive,Sesotho
4,@user @user keu utloa hantle,positive,Sesotho


# Tokenize the dataset

In [56]:
setswana_df['Final_Label'].unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [57]:
sesotho_df.head()

Unnamed: 0,sentence,Final_labels,predict_name
0,@user gwa tshwana rena ba bang a re kreye selo...,negative,Sesotho
1,e bata goal spurs,neutral,Sesotho
2,@user @user ke nahana taba eno ea ho batla ho ...,negative,multi
3,@user lotho hle empa fela ke ipotela,positive,Sesotho
4,@user @user keu utloa hantle,positive,Sesotho


In [58]:
# The Sesotho CSV names its label column 'Final_labels'; align it with the
# Setswana frame's 'Final_Label' so both frames can share the same code.
# Rebinding the result (instead of inplace=True) keeps the call chainable.
sesotho_df = sesotho_df.rename(columns={'Final_labels': 'Final_Label'})

In [59]:
def label_to_int(x):
  """Map a sentiment label to its integer class id.

  'positive' -> 1 and 'negative' -> 0; every other value (which includes
  'neutral') -> 2.
  """
  return 1 if x == 'positive' else (0 if x == 'negative' else 2)

In [60]:
# Lookup table pairing each sentiment label with its integer key; used to
# present the per-class counts in key order.
label_map = pd.DataFrame(
    [('negative', 0), ('positive', 1), ('neutral', 2)],
    columns=['label', 'key'],
)

In [61]:
# Count how many Setswana tweets carry each label, as a two-column frame.
value_counts = (
    setswana_df['Final_Label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)

# Left-join onto label_map so every known label appears (missing ones get a
# count of 0), then restore an integer dtype after the NaN fill.
value_counts_keys = (
    label_map
    .merge(value_counts, on='label', how='left')
    .fillna(0)
    .assign(count=lambda merged: merged['count'].astype(int))
)

In [62]:
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1375
1,positive,1,829
2,neutral,2,796


In [63]:
# Count how many Sesotho tweets carry each label, as a two-column frame.
value_counts = (
    sesotho_df['Final_Label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)

# Left-join onto label_map so every known label appears (missing ones get a
# count of 0), then restore an integer dtype after the NaN fill.
value_counts_keys = (
    label_map
    .merge(value_counts, on='label', how='left')
    .fillna(0)
    .assign(count=lambda merged: merged['count'].astype(int))
)

In [64]:
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1464
1,positive,1,953
2,neutral,2,583


In [65]:
# Encode the string labels as integer class ids.
# The dtype guard makes this cell idempotent: label_to_int maps every value
# other than 'positive'/'negative' to 2, so re-running the cell on labels that
# are already integers would silently turn every label into 2.
for _df in (setswana_df, sesotho_df):
    if not pd.api.types.is_numeric_dtype(_df['Final_Label']):
        _df['Final_Label'] = [label_to_int(x) for x in _df['Final_Label']]

# Tokenize using subword tokenization

In [66]:
from transformers import XLMRobertaTokenizer

In [67]:
# Load the tokenizer for the 'xlm-roberta-base' checkpoint; it is reused below
# both to inspect subword pieces and to batch-encode the sentences.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

In [68]:
# Store the subword pieces of each sentence in a 'tokens' column of both
# frames so they can be inspected next to the raw text.
for frame in (setswana_df, sesotho_df):
    frame['tokens'] = frame['sentence'].apply(tokenizer.tokenize)

In [69]:
setswana_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,tokens
0,@user lol o dramatic stocko se teng mo lwena mos,1,Setswana,"[▁@, user, ▁lol, ▁o, ▁dramatic, ▁stock, o, ▁se..."
1,@user i m happy with my current piece job ausi,1,Setswana,"[▁@, user, ▁i, ▁m, ▁happy, ▁with, ▁my, ▁curren..."
2,o ntate wane a tlang le mane o dieta tsa hae ...,1,Setswana,"[▁o, ▁n, tate, ▁wa, ne, ▁a, ▁t, lang, ▁le, ▁ma..."
3,ka dikuku my love,1,Setswana,"[▁ka, ▁di, ku, ku, ▁my, ▁love]"
4,@user yeah i doubt much will be done ka kgang ...,0,Setswana,"[▁@, user, ▁yeah, ▁i, ▁doubt, ▁much, ▁will, ▁b..."


In [70]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,tokens
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,"[▁@, user, ▁gwa, ▁t, shwa, na, ▁rena, ▁ba, ▁ba..."
1,e bata goal spurs,2,Sesotho,"[▁e, ▁bata, ▁goal, ▁, spur, s]"
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,"[▁@, user, ▁@, user, ▁ke, ▁na, hana, ▁taba, ▁e..."
3,@user lotho hle empa fela ke ipotela,1,Sesotho,"[▁@, user, ▁lot, ho, ▁, hle, ▁emp, a, ▁fel, a,..."
4,@user @user keu utloa hantle,1,Sesotho,"[▁@, user, ▁@, user, ▁ke, u, ▁ut, lo, a, ▁han,..."


# Process tokens

In [71]:
# Encode both corpora with identical settings. TweetClassifier flattens the
# embedded sequence into a Linear layer sized for 64 positions, so every row
# must be exactly 64 ids wide:
#   * padding='max_length' pads short batches up to 64 (padding=True only pads
#     to the longest sentence in the batch);
#   * max_length=64 is now applied to the Sesotho call too, which previously
#     omitted it and so was not tied to the width the model expects.
encode_kwargs = dict(padding='max_length', truncation=True, max_length=64, return_tensors='pt')
setswana_encoding = tokenizer(setswana_df['sentence'].tolist(), **encode_kwargs)
sesotho_encoding = tokenizer(sesotho_df['sentence'].tolist(), **encode_kwargs)

In [72]:
setswana_encoding

{'input_ids': tensor([[    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,    36,   653,  ...,     1,     1,     1],
        ...,
        [    0,  1777,   497,  ...,     1,     1,     1],
        [    0,  1374, 65918,  ...,     1,     1,     1],
        [    0,    79,  9227,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [73]:
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
import torch
from sklearn.model_selection import train_test_split

# input_ids = setswana_encoding['input_ids']
# attention_mask = setswana_encoding['attention_mask']
# labels = torch.tensor(setswana_df['Final_Label'].tolist())

# setswana_dataset = TensorDataset(input_ids, labels)
# train_size = int(0.8 * len(setswana_dataset))
# val_size = len(setswana_dataset) - train_size

# train_ds, val_ds = random_split(setswana_dataset, [train_size, val_size])

# train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
# val_dl = DataLoader(val_ds, batch_size=32)



In [74]:
def train_val_dataloader(encodings, data_labels, batch_size=32, train_fraction=0.8, seed=None):
  """Split (inputs, labels) tensors into training and validation DataLoaders.

  Only the two tensors passed in are bundled; anything else produced by the
  tokenizer (e.g. an attention mask) is not part of the resulting batches.

  Args:
    encodings: tensor of model inputs, one row per example.
    data_labels: tensor of labels with the same first dimension.
    batch_size: batch size of both loaders (default 32).
    train_fraction: share of examples assigned to training, strictly
      between 0 and 1 (default 0.8); the remainder is used for validation.
    seed: optional int. When given, the random split and the training
      shuffle are driven by a dedicated generator seeded with it, so the
      split is reproducible. When None, the global RNG is used.

  Returns:
    (train_dl, val_dl): the training loader (shuffled) and the validation
    loader (not shuffled).

  Raises:
    ValueError: if train_fraction is not strictly between 0 and 1.
  """
  if not 0.0 < train_fraction < 1.0:
    raise ValueError(f"train_fraction must be in (0, 1), got {train_fraction!r}")

  dataset = TensorDataset(encodings, data_labels)
  train_size = int(train_fraction * len(dataset))
  val_size = len(dataset) - train_size

  # Only pass a generator when a seed was requested, so the unseeded call
  # behaves exactly as before.
  rng_kwargs = {}
  if seed is not None:
    rng_kwargs['generator'] = torch.Generator().manual_seed(seed)

  train_ds, val_ds = random_split(dataset, [train_size, val_size], **rng_kwargs)

  train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, **rng_kwargs)
  val_dl = DataLoader(val_ds, batch_size=batch_size)

  return train_dl, val_dl

# Create the neural model

In [75]:
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.Module):
    """Feed-forward tweet classifier over a fixed-length sequence of token ids.

    Each id is embedded, the embeddings of all positions are concatenated
    into one vector, and two linear layers (with a ReLU in between) map that
    vector to class logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        hidden_dim: width of the hidden linear layer.
        output_dim: number of output classes (logits).
        seq_len: number of token positions per example; fixes the input
            width of the first linear layer (default 64, the previously
            hard-coded value).
        padding_idx: id of the padding token, whose embedding is kept at
            zero (default 0, the previously hard-coded value). Pass the
            tokenizer's own padding id when it differs; the XLM-R encodings
            shown in this notebook pad with id 1.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, seq_len=64, padding_idx=0):
        super().__init__()
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.fc1 = nn.Linear(embed_dim * seq_len, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape (batch_size, output_dim).

        Raises:
            ValueError: if the flattened embedding width does not match the
                input width of the first linear layer, i.e. the input was not
                encoded to ``seq_len`` positions.
        """
        embedded = self.embedding(x)              # (batch_size, seq_len, embed_dim)
        flat = embedded.view(x.size(0), -1)       # Flatten: (batch_size, seq_len * embed_dim)
        if flat.size(1) != self.fc1.in_features:
            # Fail with an actionable message instead of an opaque matmul error.
            raise ValueError(
                f"expected inputs with {self.seq_len} token positions per example, "
                f"got input of shape {tuple(x.shape)}"
            )
        out = F.relu(self.fc1(flat))              # (batch_size, hidden_dim)
        return self.fc2(out)                      # (batch_size, output_dim)


In [76]:
# Classifier over XLM-R subword ids: one embedding row per tokenizer id and
# three output logits (negative / positive / neutral).
# NOTE(review): TweetClassifier fixes padding_idx=0, while the encoded
# input_ids displayed earlier are padded with id 1 -- confirm this is intended.
model = TweetClassifier(
    vocab_size=tokenizer.vocab_size,
    embed_dim=64,
    hidden_dim=128,
    output_dim=3,
)

# Multi-class cross-entropy on the logits, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [77]:
train_dl, val_dl = train_val_dataloader(setswana_encoding['input_ids'], torch.tensor(setswana_df['Final_Label'].tolist()))

In [78]:
# for epoch in range(5):
#     model.train()
#     total_loss = 0

#     for xb, yb, in train_dl:
#         preds = model(xb)
#         loss = loss_fn(preds, yb)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


# Evaluate the model

In [79]:
# model.eval()
# correct, total = 0, 0

# with torch.no_grad():
#     for xb, yb in val_dl:
#         preds = model(xb)
#         predicted = torch.argmax(preds, dim=1)
#         correct += (predicted == yb).sum().item()
#         total += yb.size(0)

# print(f"Validation Accuracy: {correct / total:.2%}")


In [80]:
def train_eval_model(model, epochs, train_dl, val_dl, loss_fn=None, optimizer=None):
    """Train ``model`` for ``epochs`` epochs, then report validation accuracy.

    Args:
        model: the ``nn.Module`` to train; called as ``model(xb)``.
        epochs: number of passes over ``train_dl``.
        train_dl: iterable of ``(inputs, labels)`` training batches.
        val_dl: iterable of ``(inputs, labels)`` validation batches.
        loss_fn: loss callable ``loss_fn(preds, labels)``. Defaults to a new
            ``nn.CrossEntropyLoss()``.
        optimizer: optimizer over ``model``'s parameters. Defaults to a new
            ``Adam(model.parameters(), lr=1e-3)``. Building it here, from the
            model actually passed in, avoids depending on a module-level
            optimizer that may still be bound to a previously created model.
            Pass an existing optimizer to keep its state across calls.

    Returns:
        Validation accuracy as a float in [0, 1], or NaN when ``val_dl``
        yields no examples.

    Side effects:
        Prints the summed batch loss after every epoch and the final
        validation accuracy.
    """
    if loss_fn is None:
        loss_fn = nn.CrossEntropyLoss()
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0  # sum of per-batch losses, not a mean

        for xb, yb in train_dl:
            preds = model(xb)
            loss = loss_fn(preds, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb)
            predicted = torch.argmax(preds, dim=1)
            correct += (predicted == yb).sum().item()
            total += yb.size(0)

    # Guard against an empty validation loader instead of dividing by zero.
    accuracy = correct / total if total else float("nan")
    print(f"Validation Accuracy: {accuracy:.2%}")
    return accuracy


In [81]:
train_eval_model(model,10,train_dl,val_dl)

Epoch 1, Loss: 81.6802
Epoch 2, Loss: 71.0243
Epoch 3, Loss: 64.9169
Epoch 4, Loss: 56.8745
Epoch 5, Loss: 48.8206
Epoch 6, Loss: 41.5880
Epoch 7, Loss: 35.9109
Epoch 8, Loss: 30.8065
Epoch 9, Loss: 24.9714
Epoch 10, Loss: 21.3469
Validation Accuracy: 44.17%


# Word tokenization

In [82]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')



# Lower-case and word-tokenize every Setswana sentence.
tokenized = [word_tokenize(text.lower()) for text in setswana_df['sentence'].tolist()]

# Build a vocabulary: ids 0 and 1 are reserved for padding and unknown words;
# each new word receives the next free id in order of first appearance.
vocab = {"<pad>": 0, "<unk>": 1}
for sent in tokenized:
    for word in sent:
        # setdefault leaves the id of an already-seen word untouched.
        vocab.setdefault(word, len(vocab))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [83]:
SEQ_LEN = 64

def encode(sent):
    """Convert a sentence into exactly SEQ_LEN vocabulary ids.

    The sentence is lower-cased and word-tokenized; words missing from
    ``vocab`` map to the "<unk>" id. Longer sequences are cut to SEQ_LEN and
    shorter ones are right-padded with the "<pad>" id.
    """
    ids = [vocab.get(tok, vocab["<unk>"]) for tok in word_tokenize(sent.lower())]
    if len(ids) >= SEQ_LEN:
        return ids[:SEQ_LEN]
    missing = SEQ_LEN - len(ids)
    return ids + [vocab["<pad>"]] * missing



In [84]:
class TweetDataset(Dataset):
    """Tweets with their word-level id encodings and labels.

    Implements the map-style dataset protocol (``__len__`` / ``__getitem__``)
    so instances can be handed directly to ``DataLoader`` or ``random_split``.

    Attributes:
        texts: the raw sentences, as passed in.
        encoded_texts: tensor with one row of ids per sentence, produced by
            ``encode``.
        labels: the labels, as passed in (not converted).

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.encoded_texts = torch.tensor([encode(t) for t in texts])
        self.labels = labels

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(encoded_text, label)`` for the example at ``idx``."""
        return self.encoded_texts[idx], self.labels[idx]

In [85]:
dataset = TweetDataset(setswana_df['sentence'].tolist(), setswana_df['Final_Label'].tolist())

In [86]:
dataset.texts[:5]

['@user lol o dramatic stocko se teng mo lwena mos ',
 '@user i m happy with my current piece job ausi ',
 ' o ntate wane a tlang le mane o dieta tsa hae some people don t grow ',
 'ka dikuku my love ',
 '@user yeah i doubt much will be done ka kgang ya teng']

In [87]:
dataset.encoded_texts[:5]

tensor([[ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  3, 13, 14, 15, 16, 17, 18, 19, 20, 21,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5, 22, 23, 24, 25, 26, 27,  5, 28, 29, 30, 31, 32, 33, 34, 35,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [36, 37, 17, 38,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,

In [88]:
# input_ids = dataset.encoded_texts
# labels = torch.tensor(dataset.labels)

In [89]:

# setswana_dataset = TensorDataset(input_ids, labels)
# train_size = int(0.8 * len(setswana_dataset))
# val_size = len(setswana_dataset) - train_size

# train_ds, val_ds = random_split(setswana_dataset, [train_size, val_size])

# train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
# val_dl = DataLoader(val_ds, batch_size=32)

In [90]:
# Fresh classifier for the word-level pipeline: the embedding table is sized
# to the vocabulary built from the word-tokenized sentences.
model = TweetClassifier(
    vocab_size=len(vocab),
    embed_dim=64,
    hidden_dim=128,
    output_dim=3,
)

# New loss and optimizer bound to the parameters of this new model.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [91]:
train_dl, val_dl = train_val_dataloader(dataset.encoded_texts, torch.tensor(setswana_df['Final_Label'].tolist()))

In [92]:
# for epoch in range(5):
#     model.train()
#     total_loss = 0

#     for xb, yb, in train_dl:
#         preds = model(xb)
#         loss = loss_fn(preds, yb)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

In [93]:
# model.eval()
# correct, total = 0, 0

# with torch.no_grad():
#     for xb, yb in val_dl:
#         preds = model(xb)
#         predicted = torch.argmax(preds, dim=1)
#         correct += (predicted == yb).sum().item()
#         total += yb.size(0)

# print(f"Validation Accuracy: {correct / total:.2%}")

In [94]:
train_eval_model(model,10,train_dl,val_dl)

Epoch 1, Loss: 78.9747
Epoch 2, Loss: 55.2313
Epoch 3, Loss: 36.7216
Epoch 4, Loss: 21.2692
Epoch 5, Loss: 12.1458
Epoch 6, Loss: 6.9473
Epoch 7, Loss: 4.2246
Epoch 8, Loss: 2.6303
Epoch 9, Loss: 1.7799
Epoch 10, Loss: 1.3369
Validation Accuracy: 46.17%


In [95]:
# class TweetClassifierLSTM(nn.Module):
#     def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
#         super(TweetClassifier, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
#         self.fc1 = nn.Linear(embed_dim * 64, hidden_dim)
#         self.fc2 = nn.Linear(hidden_dim, output_dim)

#     def forward(self, x):
#         embedded = self.embedding(x)              # (batch_size, seq_len, embed_dim)
#         flat = embedded.view(x.size(0), -1)       # Flatten: (batch_size, seq_len * embed_dim)
#         out = F.relu(self.fc1(flat))              # (batch_size, hidden_dim)
#         return self.fc2(out)                      # (batch_size, output_dim)

In [None]:
# Classify tokens into their respective classes using model2

