# Download dataset

In [None]:
# Install the Hugging Face `transformers` package, which provides the
# XLMRobertaTokenizer imported later in this notebook.
!pip install transformers

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Raw GitHub locations of the SAfriSenti tweet CSVs (one file per language).
setswana_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/setswana_tweets.csv'
sesotho_url = 'https://raw.githubusercontent.com/NLPforLRLsProjects/SAfriSenti-Corpus/refs/heads/main/sesotho_tweets.csv'

# Fetch and parse each CSV into its own DataFrame, Setswana first.
setswana_df, sesotho_df = (
    pd.read_csv(csv_url) for csv_url in (setswana_url, sesotho_url)
)

# Tokenize dataset

In [None]:
# Show the distinct values found in the Setswana `Final_Label` column.
pd.unique(setswana_df['Final_Label'])

In [None]:
# The Sesotho CSV names its label column `Final_labels`; rename it in place so
# both frames expose the same `Final_Label` column.
label_column_fix = {'Final_labels': 'Final_Label'}
sesotho_df.rename(columns=label_column_fix, inplace=True)

In [None]:
def label_to_int(x):
    """Map a sentiment label to its integer class id.

    'positive' -> 1, 'negative' -> 0, and every other value -> 2.
    """
    # Walk the known labels in the same order the comparisons were written;
    # anything that matches neither falls through to the catch-all class.
    for known_label, class_id in (('positive', 1), ('negative', 0)):
        if x == known_label:
            return class_id
    return 2


In [None]:
# Preview the first five Sesotho rows.
sesotho_df.head(n=5)

In [None]:
# Replace the string labels in both frames with their integer class ids.
setswana_df['Final_Label'] = setswana_df['Final_Label'].map(label_to_int)
sesotho_df['Final_Label'] = sesotho_df['Final_Label'].map(label_to_int)

In [None]:
from transformers import XLMRobertaTokenizer

In [None]:
# Load the tokenizer that ships with the `xlm-roberta-base` checkpoint.
pretrained_name = 'xlm-roberta-base'
tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_name)

In [None]:
# Store each tweet's token list in a new `tokens` column.
setswana_df['tokens'] = setswana_df['sentence'].apply(tokenizer.tokenize)
sesotho_df['tokens'] = sesotho_df['sentence'].apply(tokenizer.tokenize)

In [None]:
# Preview the first five Setswana rows, now including the `tokens` column.
setswana_df.head(n=5)

In [None]:
# Preview the first five Sesotho rows, now including the `tokens` column.
sesotho_df.head(n=5)

# Process tokens

In [None]:
# Encode every tweet into a fixed window of 64 token ids.
#
# padding='max_length' is used instead of padding=True: padding=True only pads
# to the longest tweet actually present, so the tensor width is not guaranteed
# to be 64, while TweetClassifier's first linear layer is sized for exactly
# 64 positions. truncation=True cuts anything longer than max_length.
#
# Both languages share the same settings so the two encodings have the same
# shape (the Sesotho call previously had no max_length at all).
encode_kwargs = dict(
    padding='max_length',
    truncation=True,
    max_length=64,
    return_tensors='pt',
)
setswana_encoding = tokenizer(setswana_df['sentence'].tolist(), **encode_kwargs)
sesotho_encoding = tokenizer(sesotho_df['sentence'].tolist(), **encode_kwargs)

In [None]:
# Display the Setswana encoding produced by the tokenizer call in the previous cell.
setswana_encoding

In [None]:
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch

# Pull the token-id and attention-mask tensors out of the Setswana encoding.
# (Only input_ids and labels are fed into the dataset built in the next cell.)
input_ids, attention_mask = (
    setswana_encoding[field] for field in ('input_ids', 'attention_mask')
)

# Integer class ids for every Setswana tweet, as a tensor.
labels = torch.tensor(setswana_df['Final_Label'].tolist())



In [None]:
from sklearn.model_selection import train_test_split

# Pair each row of token ids with its label.
setswana_dataset = TensorDataset(input_ids, labels)

# 80/20 random train/validation split; the validation set takes whatever is
# left after rounding the training size down.
n_examples = len(setswana_dataset)
train_size = int(0.8 * n_examples)
val_size = n_examples - train_size
train_ds, val_ds = random_split(setswana_dataset, lengths=[train_size, val_size])

# Batches of 32; only the training loader reshuffles each epoch.
train_dl = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(dataset=val_ds, batch_size=32)

# Create neural model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.Module):
    """Feed-forward classifier over a fixed-length window of token ids.

    Each token id is embedded, the embeddings of all positions are
    concatenated into one flat vector, and two linear layers (with a ReLU in
    between) turn that vector into class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Width of the hidden linear layer.
        output_dim: Number of logits produced per example.
        seq_len: Number of token positions the first linear layer is sized
            for. Defaults to 64, the value that used to be hard-coded.
            Inputs of any other width are padded or truncated in ``forward``.
        padding_idx: Token id treated as padding by the embedding layer (its
            vector stays zero). Defaults to 0 for backward compatibility.
            NOTE(review): this should match the tokenizer's pad id
            (``tokenizer.pad_token_id``) -- confirm at the call site.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim,
                 seq_len=64, padding_idx=0):
        super().__init__()
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.fc1 = nn.Linear(embed_dim * seq_len, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def _fit_to_window(self, x):
        """Right-pad or truncate ``x`` so its second dimension equals ``seq_len``."""
        width = x.size(1)
        if width > self.seq_len:
            return x[:, :self.seq_len]
        if width < self.seq_len:
            # Pad with the embedding's padding id so the extra positions
            # contribute zero vectors; fall back to 0 when none is configured.
            pad_id = self.embedding.padding_idx
            if pad_id is None:
                pad_id = 0
            return F.pad(x, (0, self.seq_len - width), value=pad_id)
        return x

    def forward(self, x):
        """Return logits of shape (batch_size, output_dim) for token ids ``x``.

        ``x`` is a 2-D integer tensor (batch_size, any_width); it is first
        fitted to ``seq_len`` columns so the flattened embedding always
        matches the input size of ``fc1``.
        """
        x = self._fit_to_window(x)
        embedded = self.embedding(x)                  # (batch_size, seq_len, embed_dim)
        flat = embedded.reshape(x.size(0), -1)        # (batch_size, seq_len * embed_dim)
        out = F.relu(self.fc1(flat))                  # (batch_size, hidden_dim)
        return self.fc2(out)                          # (batch_size, output_dim)


In [None]:
# Hyperparameters for the classifier: one embedding row per tokenizer vocabulary
# entry, 64-dim embeddings, a 128-unit hidden layer and three output classes.
model_config = dict(
    vocab_size=tokenizer.vocab_size,
    embed_dim=64,
    hidden_dim=128,
    output_dim=3,
)
model = TweetClassifier(**model_config)

# Cross-entropy over the class logits, optimized with Adam at lr 1e-3.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train for five epochs, reporting the sum of the per-batch losses after each.
for epoch in range(5):
    model.train()
    total_loss = 0

    for xb, yb in train_dl:
        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


# Evaluate model

In [None]:
# Measure accuracy on the validation batches with gradients disabled.
model.eval()
correct = total = 0

with torch.no_grad():
    for xb, yb in val_dl:
        # The predicted class is the index of the largest logit in each row.
        predicted = model(xb).argmax(dim=1)
        correct += int((predicted == yb).sum())
        total += len(yb)

print(f"Validation Accuracy: {correct / total:.2%}")
