# Imports

In [2]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from transformers import XLMRobertaTokenizer, AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments

import os

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
from torch import nn
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from google.colab import drive
import matplotlib.pyplot as plt



In [3]:
# Check if  is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is not available. Using CPU.


#Data loading and processing

In [4]:
!pip install transformers



In [5]:
sesotho_tweets_url = 'https://raw.githubusercontent.com/Khotso-Bore/neural-languange-model/refs/heads/main/sesotho_tweets.csv'
sesotho_headlines_url = 'https://raw.githubusercontent.com/Khotso-Bore/neural-languange-model/refs/heads/main/Transformed_NewsSA_Dataset.csv'

sesotho_tweets_df = pd.read_csv(sesotho_tweets_url)
sesotho_headlines_df = pd.read_csv(sesotho_headlines_url)

In [6]:
def clean_text(text):
    text = text.lower() #make lowercase
    text = re.sub(r'@\w+|https?\S+|www\S+|#|\d+', '', text) #remove mention, hashtags
    text = re.sub(r'\d+', '', text) #remove numbers
    text = re.sub(r'https\S+|www\S+', '', text) #remove urls
    text = re.sub(r'[^\w\s]', '', text) #remove special charactewrs
    text = text.encode('ascii', 'ignore').decode('ascii') #remove emojis
    text = re.sub(r'\s+', ' ', text).strip() #remove extra whitespace

    return text

In [7]:
sesotho_tweets_df['cleaned_sentence'] = sesotho_tweets_df['sentence'].apply(clean_text).astype(str)
sesotho_headlines_df['cleaned_sentence'] = sesotho_headlines_df['sentence'].apply(clean_text).astype(str)

## prepare dataset

In [8]:
sesotho_tweets_df.head()

Unnamed: 0,sentence,Final_labels,predict_name,cleaned_sentence
0,@user gwa tshwana rena ba bang a re kreye selo...,negative,Sesotho,gwa tshwana rena ba bang a re kreye selo mos
1,e bata goal spurs,neutral,Sesotho,e bata goal spurs
2,@user @user ke nahana taba eno ea ho batla ho ...,negative,multi,ke nahana taba eno ea ho batla ho khetha hoban...
3,@user lotho hle empa fela ke ipotela,positive,Sesotho,lotho hle empa fela ke ipotela
4,@user @user keu utloa hantle,positive,Sesotho,keu utloa hantle


In [9]:
sesotho_headlines_df.head()

Unnamed: 0,sentence,label,cleaned_sentence
0,BASUOE BA QOSUOE KA PELAELO EA HO BOLAEA MOSHE...,-1,basuoe ba qosuoe ka pelaelo ea ho bolaea moshe...
1,TSEBA MARENA A SEHLOOHO A NAHA,0,tseba marena a sehlooho a naha
2,LINTLHA-KHOLO MABAPI LE NTLO E OETSENG BATHO,-1,lintlhakholo mabapi le ntlo e oetseng batho
3,MOTHO O KHAOTSOE BOTONA LE MENOANA A NTSE A PHELA,-1,motho o khaotsoe botona le menoana a ntse a phela
4,LITABA TSE BOHLOKO HO MAQHEKU LE MAQHEKOANA NA...,-1,litaba tse bohloko ho maqheku le maqhekoana na...


In [10]:
sesotho_headlines_df['label'] = sesotho_headlines_df['label'].str.replace(r'[a-zA-Z]', '', regex=True)
sesotho_headlines_df['label'] = sesotho_headlines_df['label'].astype(int)
sesotho_headlines_df['label'] = np.where(sesotho_headlines_df['label'] == 0, 2, sesotho_headlines_df['label'])
sesotho_headlines_df['label'] = np.where(sesotho_headlines_df['label'] == -1, 0, sesotho_headlines_df['label'])
sesotho_headlines_df.head()

Unnamed: 0,sentence,label,cleaned_sentence
0,BASUOE BA QOSUOE KA PELAELO EA HO BOLAEA MOSHE...,0,basuoe ba qosuoe ka pelaelo ea ho bolaea moshe...
1,TSEBA MARENA A SEHLOOHO A NAHA,2,tseba marena a sehlooho a naha
2,LINTLHA-KHOLO MABAPI LE NTLO E OETSENG BATHO,0,lintlhakholo mabapi le ntlo e oetseng batho
3,MOTHO O KHAOTSOE BOTONA LE MENOANA A NTSE A PHELA,0,motho o khaotsoe botona le menoana a ntse a phela
4,LITABA TSE BOHLOKO HO MAQHEKU LE MAQHEKOANA NA...,0,litaba tse bohloko ho maqheku le maqhekoana na...


In [11]:
sesotho_headlines_df.rename(columns={'label': 'Final_Label'}, inplace=True)
sesotho_tweets_df.rename(columns={'Final_labels': 'Final_Label'}, inplace=True)

In [12]:
def label_to_int(x):
  if x == 'positive':
    return 1
  elif x == 'negative':
    return 0
  else:
    return 2

In [13]:
label_map = pd.DataFrame({
    'label': ['negative', 'positive', 'neutral'],
    'key': [0, 1, 2]
})

In [14]:
value_counts = sesotho_tweets_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['label', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='label', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)

In [15]:
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1464
1,positive,1,953
2,neutral,2,583


In [16]:
value_counts = sesotho_headlines_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1552
1,positive,1,551
2,neutral,2,106


In [17]:

sesotho_tweets_df['Final_Label'] = sesotho_tweets_df['Final_Label'].apply(label_to_int).astype(int)

In [18]:
value_counts = sesotho_tweets_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1464
1,positive,1,953
2,neutral,2,583


In [19]:
sesotho_tweets_df.dtypes

Unnamed: 0,0
sentence,object
Final_Label,int64
predict_name,object
cleaned_sentence,object


In [20]:
value_counts = sesotho_headlines_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1552
1,positive,1,551
2,neutral,2,106


In [21]:
sesotho_headlines_df.dtypes

Unnamed: 0,0
sentence,object
Final_Label,int64
cleaned_sentence,object


In [22]:
sesotho_df = pd.concat([sesotho_tweets_df, sesotho_headlines_df],axis=0,ignore_index=True)
sesotho_df['Final_Label'] = sesotho_df['Final_Label'].astype(int)

In [23]:
sesotho_df.columns

Index(['sentence', 'Final_Label', 'predict_name', 'cleaned_sentence'], dtype='object')

In [24]:
sesotho_df.shape

(5209, 4)

In [25]:
sesotho_df['Final_Label'].value_counts()

Unnamed: 0_level_0,count
Final_Label,Unnamed: 1_level_1
0,3016
1,1504
2,689


In [26]:
sesotho_df['cleaned_sentence'].shape

(5209,)

In [27]:
value_counts = sesotho_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,3016
1,positive,1,1504
2,neutral,2,689


In [28]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,cleaned_sentence
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,gwa tshwana rena ba bang a re kreye selo mos
1,e bata goal spurs,2,Sesotho,e bata goal spurs
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,ke nahana taba eno ea ho batla ho khetha hoban...
3,@user lotho hle empa fela ke ipotela,1,Sesotho,lotho hle empa fela ke ipotela
4,@user @user keu utloa hantle,1,Sesotho,keu utloa hantle


## tokenize using subword tokenization

In [29]:
bpe_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
word_piece_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
afro_xmlr_tokenizer = AutoTokenizer.from_pretrained("Davlan/afro-xlmr-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [30]:
#setswana_df['tokens_bpe'] = setswana_df['sentence'].apply(lambda x: tokenizer.tokenize(x))
sesotho_df['bpe_tokens'] = sesotho_df['cleaned_sentence'].apply(lambda x: bpe_tokenizer.tokenize(x))
sesotho_df['word_piece_tokens'] = sesotho_df['cleaned_sentence'].apply(lambda x: word_piece_tokenizer.tokenize(x))
sesotho_df['afro_xmlr_tokens'] = sesotho_df['cleaned_sentence'].apply(lambda x: word_piece_tokenizer.tokenize(x))

In [31]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,cleaned_sentence,bpe_tokens,word_piece_tokens,afro_xmlr_tokens
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,gwa tshwana rena ba bang a re kreye selo mos,"[▁gwa, ▁t, shwa, na, ▁rena, ▁ba, ▁bang, ▁a, ▁r...","[g, ##wa, ts, ##hwa, ##na, ren, ##a, ba, bang,...","[g, ##wa, ts, ##hwa, ##na, ren, ##a, ba, bang,..."
1,e bata goal spurs,2,Sesotho,e bata goal spurs,"[▁e, ▁bata, ▁goal, ▁, spur, s]","[e, bat, ##a, goal, spurs]","[e, bat, ##a, goal, spurs]"
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,ke nahana taba eno ea ho batla ho khetha hoban...,"[▁ke, ▁na, hana, ▁taba, ▁en, o, ▁ea, ▁ho, ▁bat...","[ke, nah, ##ana, tab, ##a, en, ##o, ea, ho, ba...","[ke, nah, ##ana, tab, ##a, en, ##o, ea, ho, ba..."
3,@user lotho hle empa fela ke ipotela,1,Sesotho,lotho hle empa fela ke ipotela,"[▁lot, ho, ▁, hle, ▁emp, a, ▁fel, a, ▁ke, ▁i, ...","[lot, ##ho, h, ##le, em, ##pa, fe, ##la, ke, i...","[lot, ##ho, h, ##le, em, ##pa, fe, ##la, ke, i..."
4,@user @user keu utloa hantle,1,Sesotho,keu utloa hantle,"[▁ke, u, ▁ut, lo, a, ▁han, tle]","[ke, ##u, ut, ##lo, ##a, han, ##tle]","[ke, ##u, ut, ##lo, ##a, han, ##tle]"


## process tokens

In [32]:

sesotho_bpe_encoding = bpe_tokenizer(sesotho_df['cleaned_sentence'].tolist(), padding='max_length', truncation=True, return_tensors='pt',max_length=128)
sesotho_word_piece_encoding = word_piece_tokenizer(sesotho_df['cleaned_sentence'].tolist(), padding='max_length', truncation=True, return_tensors='pt',max_length=128)
sesotho_afro_xmlr_encoding = afro_xmlr_tokenizer(sesotho_df['cleaned_sentence'].tolist(), padding=True, truncation=True, return_tensors='pt',max_length=128)

In [33]:
sesotho_bpe_encoding

{'input_ids': tensor([[    0, 39305,   808,  ...,     1,     1,     1],
        [    0,    28,  8336,  ...,     1,     1,     1],
        [    0,   311,    24,  ...,     1,     1,     1],
        ...,
        [    0, 97549,    39,  ...,     1,     1,     1],
        [    0, 30078,  2590,  ...,     1,     1,     1],
        [    0, 22711,    28,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [34]:
sesotho_word_piece_encoding['input_ids'].shape

torch.Size([5209, 128])

# Fine Tuning

In [35]:


train_texts, test_texts, train_labels, test_labels = train_test_split(
    sesotho_df['cleaned_sentence'], sesotho_df['Final_Label'], test_size=0.2, random_state=42
)

train_encodings = afro_xmlr_tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = afro_xmlr_tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

In [36]:

class SesothoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SesothoDataset(train_encodings, train_labels)
test_dataset = SesothoDataset(test_encodings, test_labels)



In [37]:


class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1, 2]),
    y=sesotho_df['Final_Label'].values  # Make sure this is a 1D NumPy array
)

print("\nClass Weights (for balancing):")
print(f"Positive (0): {class_weights[0]:.2f}")
print(f"Negative (1): {class_weights[1]:.2f}")
print(f"Neutral (2): {class_weights[2]:.2f}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = torch.tensor(class_weights, dtype=torch.float32).to(device)  # Convert to tensor

# 2. Create a custom Trainer class
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.CrossEntropyLoss(weight=weights)  # Apply your weights here
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


Class Weights (for balancing):
Positive (0): 0.58
Negative (1): 1.15
Neutral (2): 2.52


In [38]:

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [39]:

os.environ["WANDB_DISABLED"] = "true"  # Blocks wandb at system level

model = AutoModelForSequenceClassification.from_pretrained(
    "Davlan/afro-xlmr-large", num_labels=3
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=0,
    per_device_train_batch_size=4,  # Reduced from 8
    learning_rate=5e-6,  # Lowered from 2e-5
    warmup_steps=500,
    weight_decay=0.01,
    save_strategy= 'epoch',
    eval_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end= True,
    metric_for_best_model='accuracy',
    report_to="none"

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=0, training_loss=0.0, metrics={'train_runtime': 0.0022, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'total_flos': 0, 'train_loss': 0.0, 'epoch': 0})

In [40]:
model.save_pretrained("./sesotho_sentiment_model")
afro_xmlr_tokenizer.save_pretrained("./sesotho_sentiment_model")

('./sesotho_sentiment_model/tokenizer_config.json',
 './sesotho_sentiment_model/special_tokens_map.json',
 './sesotho_sentiment_model/sentencepiece.bpe.model',
 './sesotho_sentiment_model/added_tokens.json',
 './sesotho_sentiment_model/tokenizer.json')

In [41]:
# Get predictions
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# True labels
true_labels = test_labels.tolist()

KeyboardInterrupt: 

In [None]:
# Calculate prediction probabilities and identify uncertain samples
probs = torch.nn.functional.softmax(torch.from_numpy(predictions.predictions), dim=-1)
probs_np = probs.numpy()  # Convert once for clarity
uncertain_samples = np.where(np.max(probs_np, axis=1) < 0.7)[0]  # Confidence <70%
label_map = {'positive': 0, 'negative': 1, 'neutral': 2}

# Inspect uncertain samples
print(f"Found {len(uncertain_samples)} low-confidence predictions:")
for idx in uncertain_samples[:5]:  # Print first 5 uncertain examples
    print(f"\nText: {test_texts.iloc[idx]}")
    print(f"True Label: {list(label_map.keys())[true_labels[idx]]}")
    print(f"Predicted: {list(label_map.keys())[preds[idx]]} (Confidence: {np.max(probs_np[idx]):.2f})")

In [None]:
# Classification Report (includes F1, precision, recall per class + accuracy)
print("Classification Report:")
print(classification_report(true_labels, preds, target_names=list(label_map.keys())))

# Individual Metrics
print(f"Accuracy: {accuracy_score(true_labels, preds):.4f}")
print(f"Weighted F1: {f1_score(true_labels, preds, average='weighted'):.4f}")
print(f"Macro F1: {f1_score(true_labels, preds, average='macro'):.4f}")

# Confusion Matrix
cm = confusion_matrix(true_labels, preds)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_map.keys(),
            yticklabels=label_map.keys())
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
plt.hist(np.max(probs.numpy(), axis=1), bins=20)
plt.xlabel("Confidence Score")
plt.ylabel("Count")
plt.title("Model Confidence Distribution")
plt.show()

In [None]:

# Binarize labels for ROC AUC (only for binary/multiclass)
if len(label_map) == 2:  # Binary
    roc_auc = roc_auc_score(true_labels, predictions.predictions[:, 1])
    print(f"ROC AUC: {roc_auc:.4f}")
else:  # Multiclass (One-vs-Rest)
    y_true_bin = label_binarize(true_labels, classes=[0, 1, 2])
    roc_auc = roc_auc_score(y_true_bin, predictions.predictions, multi_class='ovr')
    print(f"ROC AUC (One-vs-Rest): {roc_auc:.4f}")

In [None]:
# Plot ROC curves for each class (multiclass)
if len(label_map) > 2:
    y_true_bin = label_binarize(true_labels, classes=[0, 1, 2])
    fpr, tpr, roc_auc = {}, {}, {}

    for i, class_name in enumerate(label_map.keys()):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], predictions.predictions[:, i])
        roc_auc[i] = roc_auc_score(y_true_bin[:, i], predictions.predictions[:, i])

    plt.figure(figsize=(8, 6))
    for i in range(len(label_map)):
        plt.plot(fpr[i], tpr[i], label=f'{list(label_map.keys())[i]} (AUC = {roc_auc[i]:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multiclass ROC Curves')
    plt.legend()
    plt.show()

In [None]:

drive.mount('/content/drive')

model.save_pretrained("/content/drive/MyDrive/AfroXLMR-Sesotho")
afro_xmlr_tokenizer.save_pretrained("/content/drive/MyDrive/AfroXLMR-Sesotho")

# TF - IDF logistic regession model

In [None]:
X = sesotho_df['cleaned_sentence']
y = sesotho_df['Final_Label']

# Split data (stratify to preserve class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.9,
    min_df=5,
    ngram_range=(1, 2),   # unigrams + bigrams
    max_features=5000
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train_tfidf, y_train)

In [None]:
y_pred = clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred, digits=4))

In [None]:


# Vectorize text
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# Train model
tfidf_model = LogisticRegression()
tfidf_model.fit(X_train_tfidf, y_train)

# Make predictions
tfidf_preds = tfidf_model.predict(X_test_tfidf)

In [None]:


# F1 scores from classification reports
tfidf_f1 = classification_report(y_test, tfidf_preds, output_dict=True)
transformer_f1 = classification_report(y_test, preds, output_dict=True)

# Get weighted average F1 for each
f1_tfidf = tfidf_f1["weighted avg"]["f1-score"]
f1_transformer = transformer_f1["weighted avg"]["f1-score"]

In [None]:
# Labels and scores
models = ['TF-IDF + Logistic Regression', 'AfroXLMR Transformer']
scores = [f1_tfidf, f1_transformer]

# Bar chart
plt.figure(figsize=(8, 5))
bars = plt.bar(models, scores, color=['skyblue', 'lightgreen'])
plt.ylim(0, 1)
plt.ylabel('Weighted F1 Score')
plt.title('Model Comparison: Transformer vs TF-IDF + Logistic Regression')

# Annotate bars
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f"{yval:.4f}", ha='center', va='bottom')

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

#Neural models

In [None]:
class TweetClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(TweetClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim * 128, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)              # (batch_size, seq_len, embed_dim)
        flat = embedded.view(x.size(0), -1)       # Flatten: (batch_size, seq_len * embed_dim)
        out = F.relu(self.fc1(flat))              # (batch_size, hidden_dim)
        return self.fc2(out)                      # (batch_size, output_dim)


In [None]:
def train_val_dataloader(encodings,data_labels):
  input_ids = encodings
  labels = data_labels

  dataset = TensorDataset(input_ids, labels)
  train_size = int(0.8 * len(dataset))
  val_size = len(dataset) - train_size

  train_ds, val_ds = random_split(dataset, [train_size, val_size])

  train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
  val_dl = DataLoader(val_ds, batch_size=32)

  return train_dl, val_dl

In [None]:
def train_eval_model(model, epochs,train_dl,val_dl):
    for epoch in range(epochs):
      model.train()
      total_loss = 0

      for xb, yb, in train_dl:
          preds = model(xb)
          loss = loss_fn(preds, yb)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          total_loss += loss.item()

      print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    model.eval()
    correct, total = 0, 0

    predictions = []
    true_labels = []

    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb)
            predicted = torch.argmax(preds, dim=1)
            correct += (predicted == yb).sum().item()
            total += yb.size(0)

            predictions.extend(predicted)
            true_labels.extend(yb)


    print(f"Validation Accuracy: {correct / total:.2%}")

    return predictions, true_labels


In [None]:
def metrics(predictions, true_labels):

  class_names = ['negative','positive','neutral']
  print("\n--- Full Classification Report ---")
  print(classification_report(true_labels, predictions, target_names=class_names, zero_division=0))
  return classification_report(true_labels, predictions, target_names=class_names, zero_division=0)

## BPE model

In [None]:
model = TweetClassifier(vocab_size=bpe_tokenizer.vocab_size, embed_dim=128, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
train_dl, val_dl = train_val_dataloader(sesotho_bpe_encoding['input_ids'], torch.tensor(sesotho_df['Final_Label'].tolist()))
train_dl, val_dl = train_val_dataloader(sesotho_word_piece_encoding['input_ids'], torch.tensor(sesotho_df['Final_Label'].tolist()))

### evaluate model

In [None]:
predictions, true_labels = train_eval_model(model,0,train_dl,val_dl)

In [None]:
bpe_metrics = metrics(np.array(predictions),np.array(true_labels))

# STF-idf

In [None]:
class STFIDFClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(STFIDFClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)


    def forward(self, x):
        out = F.relu(self.fc1(x))
        return self.fc2(out)

In [None]:
sesotho_df

## N-gram

In [None]:

stfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), min_df=2)
stf_idf_matrix = stfidf_vectorizer.fit_transform(sesotho_df['sentence'])
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())

stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)
print(stf_idf_tensor.shape)
print(stf_idf_vocab_size)

In [None]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [None]:
model = STFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
predictions, true_labels = train_eval_model(model,10,train_dl,val_dl)

In [None]:
stf_idf_ngram_metrics = metrics(np.array(predictions),np.array(true_labels))

## BPE

In [None]:
subword_bpe_tokenised_documnets = [" ".join(s) for s in sesotho_df['bpe_tokens']]
subword_bpe_tokenised_documnets[0:3]

In [None]:
stfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)
stf_idf_matrix = stfidf_vectorizer.fit_transform(subword_bpe_tokenised_documnets)
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())

stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)
print(stf_idf_tensor.shape)
print(stf_idf_vocab_size)


In [None]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [None]:
model = STFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=64, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
predictions, true_labels = train_eval_model(model,10,train_dl,val_dl)

In [None]:
stf_idf_bpe_metrics = metrics(np.array(predictions),np.array(true_labels))

## Word piece

In [None]:
subword_word_piece_tokenised_documnets = [" ".join(s) for s in sesotho_df['word_piece_tokens']]
subword_word_piece_tokenised_documnets[0:3]

In [None]:
stfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)
stf_idf_matrix = stfidf_vectorizer.fit_transform(subword_word_piece_tokenised_documnets)
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())

stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)
print(stf_idf_tensor.shape)
print(stf_idf_vocab_size)


In [None]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [None]:
model = STFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=64, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
predictions, true_labels = train_eval_model(model,10,train_dl,val_dl)

In [None]:
stf_idf_word_piece_metrics = metrics(np.array(predictions),np.array(true_labels))

# Large Classifier

In [None]:
class LargeSTFIDFClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LargeSTFIDFClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        self.fc5 = nn.Linear(hidden_dim, hidden_dim)
        self.fc6 = nn.Linear(hidden_dim, hidden_dim)
        self.fc7 = nn.Linear(hidden_dim, output_dim)


    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = F.relu(self.fc3(out))
        out = F.relu(self.fc4(out))
        out = F.relu(self.fc5(out))
        out = F.relu(self.fc6(out))
        return self.fc7(out)

In [None]:
subword_bpe_tokenised_documnets = [" ".join(s) for s in sesotho_df['bpe_tokens']]

In [None]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))
stfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)
stf_idf_matrix = stfidf_vectorizer.fit_transform(subword_bpe_tokenised_documnets)
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())


# Convert to PyTorch tensor
stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)

In [None]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [None]:
dims = [128,256]
epochs = [10,15,20]
lr = [1e-3, 1e-4, 3e-4]

for dim in dims:
  for epoch in epochs:
    for l_r in lr:

      print(f"\ndim: {dim}, epochs: {epoch}, lr: {l_r}")

      model = LargeSTFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=dim, output_dim=3)
      loss_fn = nn.CrossEntropyLoss()
      optimizer = torch.optim.Adam(model.parameters(), lr=l_r)

      predictions, true_labels = train_eval_model(model,epoch,train_dl,val_dl)
      metrics(np.array(predictions),np.array(true_labels))


dim: 128, epochs: 10, lr: 0.001
Epoch 1, Loss: 118.0134
Epoch 2, Loss: 82.7359
Epoch 3, Loss: 55.5098
Epoch 4, Loss: 39.4226
Epoch 5, Loss: 23.6927
Epoch 6, Loss: 13.2957
Epoch 7, Loss: 9.5988
Epoch 8, Loss: 6.5736
Epoch 9, Loss: 4.7462
Epoch 10, Loss: 4.1134
Validation Accuracy: 58.73%

--- Full Classification Report ---
              precision    recall  f1-score   support

    negative       0.70      0.72      0.71       591
    positive       0.45      0.40      0.42       304
     neutral       0.41      0.44      0.42       147

    accuracy                           0.59      1042
   macro avg       0.52      0.52      0.52      1042
weighted avg       0.58      0.59      0.58      1042


dim: 128, epochs: 10, lr: 0.0001
Epoch 1, Loss: 137.5965
Epoch 2, Loss: 118.5420
Epoch 3, Loss: 101.5973
Epoch 4, Loss: 78.7057
Epoch 5, Loss: 56.2851
Epoch 6, Loss: 39.9482
Epoch 7, Loss: 27.1917
Epoch 8, Loss: 18.5134
Epoch 9, Loss: 13.3887
Epoch 10, Loss: 11.2128
Validation Accuracy: 59.31