#download data set

In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
sesotho_tweets_url = 'https://raw.githubusercontent.com/Khotso-Bore/neural-languange-model/refs/heads/main/sesotho_tweets.csv'
sesotho_headlines_url = 'https://raw.githubusercontent.com/Khotso-Bore/neural-languange-model/refs/heads/main/Transformed_NewsSA_Dataset.csv'

sesotho_tweets_df = pd.read_csv(sesotho_tweets_url)
sesotho_headlines_df = pd.read_csv(sesotho_headlines_url)

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [5]:
sesotho_tweets_df['cleaned_sentence'] = sesotho_tweets_df['sentence'].apply(clean_text).astype(str)
sesotho_headlines_df['cleaned_sentence'] = sesotho_headlines_df['sentence'].apply(clean_text).astype(str)

#tokenize dataset

In [6]:
sesotho_tweets_df.head()

Unnamed: 0,sentence,Final_labels,predict_name,cleaned_sentence
0,@user gwa tshwana rena ba bang a re kreye selo...,negative,Sesotho,user gwa tshwana rena ba bang a re kreye selo mos
1,e bata goal spurs,neutral,Sesotho,e bata goal spurs
2,@user @user ke nahana taba eno ea ho batla ho ...,negative,multi,user user ke nahana taba eno ea ho batla ho kh...
3,@user lotho hle empa fela ke ipotela,positive,Sesotho,user lotho hle empa fela ke ipotela
4,@user @user keu utloa hantle,positive,Sesotho,user user keu utloa hantle


In [7]:
sesotho_headlines_df.head()

Unnamed: 0,sentence,label,cleaned_sentence
0,BASUOE BA QOSUOE KA PELAELO EA HO BOLAEA MOSHE...,-1,basuoe ba qosuoe ka pelaelo ea ho bolaea moshe...
1,TSEBA MARENA A SEHLOOHO A NAHA,0,tseba marena a sehlooho a naha
2,LINTLHA-KHOLO MABAPI LE NTLO E OETSENG BATHO,-1,lintlhakholo mabapi le ntlo e oetseng batho
3,MOTHO O KHAOTSOE BOTONA LE MENOANA A NTSE A PHELA,-1,motho o khaotsoe botona le menoana a ntse a phela
4,LITABA TSE BOHLOKO HO MAQHEKU LE MAQHEKOANA NA...,-1,litaba tse bohloko ho maqheku le maqhekoana na...


In [8]:
sesotho_headlines_df['label'] = sesotho_headlines_df['label'].str.replace(r'[a-zA-Z]', '', regex=True)
sesotho_headlines_df['label'] = sesotho_headlines_df['label'].astype(int)
sesotho_headlines_df['label'] = np.where(sesotho_headlines_df['label'] == 0, 2, sesotho_headlines_df['label'])
sesotho_headlines_df['label'] = np.where(sesotho_headlines_df['label'] == -1, 0, sesotho_headlines_df['label'])
sesotho_headlines_df.head()

Unnamed: 0,sentence,label,cleaned_sentence
0,BASUOE BA QOSUOE KA PELAELO EA HO BOLAEA MOSHE...,0,basuoe ba qosuoe ka pelaelo ea ho bolaea moshe...
1,TSEBA MARENA A SEHLOOHO A NAHA,2,tseba marena a sehlooho a naha
2,LINTLHA-KHOLO MABAPI LE NTLO E OETSENG BATHO,0,lintlhakholo mabapi le ntlo e oetseng batho
3,MOTHO O KHAOTSOE BOTONA LE MENOANA A NTSE A PHELA,0,motho o khaotsoe botona le menoana a ntse a phela
4,LITABA TSE BOHLOKO HO MAQHEKU LE MAQHEKOANA NA...,0,litaba tse bohloko ho maqheku le maqhekoana na...


In [9]:
sesotho_headlines_df.rename(columns={'label': 'Final_Label'}, inplace=True)
sesotho_tweets_df.rename(columns={'Final_labels': 'Final_Label'}, inplace=True)

In [10]:
def label_to_int(x):
  if x == 'positive':
    return 1
  elif x == 'negative':
    return 0
  else:
    return 2

In [11]:
label_map = pd.DataFrame({
    'label': ['negative', 'positive', 'neutral'],
    'key': [0, 1, 2]
})

In [12]:
value_counts = sesotho_tweets_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['label', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='label', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)

In [13]:
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1464
1,positive,1,953
2,neutral,2,583


In [14]:
value_counts = sesotho_headlines_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1552
1,positive,1,551
2,neutral,2,106


In [15]:

sesotho_tweets_df['Final_Label'] = sesotho_tweets_df['Final_Label'].apply(label_to_int).astype(int)

In [16]:
value_counts = sesotho_tweets_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1464
1,positive,1,953
2,neutral,2,583


In [17]:
sesotho_tweets_df.dtypes

Unnamed: 0,0
sentence,object
Final_Label,int64
predict_name,object
cleaned_sentence,object


In [18]:
value_counts = sesotho_headlines_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,1552
1,positive,1,551
2,neutral,2,106


In [19]:
sesotho_headlines_df.dtypes

Unnamed: 0,0
sentence,object
Final_Label,int64
cleaned_sentence,object


In [20]:
sesotho_df = pd.concat([sesotho_tweets_df, sesotho_headlines_df],axis=0,ignore_index=True)
sesotho_df['Final_Label'] = sesotho_df['Final_Label'].astype(int)

In [21]:
sesotho_df.columns

Index(['sentence', 'Final_Label', 'predict_name', 'cleaned_sentence'], dtype='object')

In [22]:
sesotho_df.shape

(5209, 4)

In [23]:
sesotho_df['Final_Label'].value_counts()

Unnamed: 0_level_0,count
Final_Label,Unnamed: 1_level_1
0,3016
1,1504
2,689


In [24]:
sesotho_df['cleaned_sentence'].shape

(5209,)

In [25]:
value_counts = sesotho_df['Final_Label'].value_counts().reset_index()
value_counts.columns = ['key', 'count']
value_counts_keys = pd.merge(label_map, value_counts, on='key', how='left').fillna(0)
value_counts_keys['count'] = value_counts_keys['count'].astype(int)
value_counts_keys

Unnamed: 0,label,key,count
0,negative,0,3016
1,positive,1,1504
2,neutral,2,689


In [26]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,cleaned_sentence
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,user gwa tshwana rena ba bang a re kreye selo mos
1,e bata goal spurs,2,Sesotho,e bata goal spurs
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,user user ke nahana taba eno ea ho batla ho kh...
3,@user lotho hle empa fela ke ipotela,1,Sesotho,user lotho hle empa fela ke ipotela
4,@user @user keu utloa hantle,1,Sesotho,user user keu utloa hantle


#tokenize using subword tokenization

In [27]:
from transformers import XLMRobertaTokenizer, AutoTokenizer

In [28]:
bpe_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
word_piece_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [29]:
#setswana_df['tokens_bpe'] = setswana_df['sentence'].apply(lambda x: tokenizer.tokenize(x))
sesotho_df['bpe_tokens'] = sesotho_df['cleaned_sentence'].apply(lambda x: bpe_tokenizer.tokenize(x))
sesotho_df['word_piece_tokens'] = sesotho_df['cleaned_sentence'].apply(lambda x: word_piece_tokenizer.tokenize(x))

In [30]:
sesotho_df.head()

Unnamed: 0,sentence,Final_Label,predict_name,cleaned_sentence,bpe_tokens,word_piece_tokens
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,user gwa tshwana rena ba bang a re kreye selo mos,"[▁user, ▁gwa, ▁t, shwa, na, ▁rena, ▁ba, ▁bang,...","[user, g, ##wa, ts, ##hwa, ##na, ren, ##a, ba,..."
1,e bata goal spurs,2,Sesotho,e bata goal spurs,"[▁e, ▁bata, ▁goal, ▁, spur, s]","[e, bat, ##a, goal, spurs]"
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,user user ke nahana taba eno ea ho batla ho kh...,"[▁user, ▁user, ▁ke, ▁na, hana, ▁taba, ▁en, o, ...","[user, user, ke, nah, ##ana, tab, ##a, en, ##o..."
3,@user lotho hle empa fela ke ipotela,1,Sesotho,user lotho hle empa fela ke ipotela,"[▁user, ▁lot, ho, ▁, hle, ▁emp, a, ▁fel, a, ▁k...","[user, lot, ##ho, h, ##le, em, ##pa, fe, ##la,..."
4,@user @user keu utloa hantle,1,Sesotho,user user keu utloa hantle,"[▁user, ▁user, ▁ke, u, ▁ut, lo, a, ▁han, tle]","[user, user, ke, ##u, ut, ##lo, ##a, han, ##tle]"


#process tokens

In [31]:

sesotho_bpe_encoding = bpe_tokenizer(sesotho_df['cleaned_sentence'].tolist(), padding=True, truncation=True, return_tensors='pt',max_length=64)
sesotho_word_piece_encoding = word_piece_tokenizer(sesotho_df['cleaned_sentence'].tolist(), padding=True, truncation=True, return_tensors='pt',max_length=64)

In [32]:
sesotho_bpe_encoding

{'input_ids': tensor([[    0, 38937, 39305,  ...,     1,     1,     1],
        [    0,    28,  8336,  ...,     1,     1,     1],
        [    0, 38937, 38937,  ...,   497,    10,     2],
        ...,
        [    0, 97549,    39,  ...,     1,     1,     1],
        [    0, 30078,  2590,  ...,     1,     1,     1],
        [    0, 22711,    28,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [33]:
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
import torch
from sklearn.model_selection import train_test_split



In [34]:
def train_val_dataloader(encodings,data_labels):
  input_ids = encodings
  labels = data_labels

  dataset = TensorDataset(input_ids, labels)
  train_size = int(0.8 * len(dataset))
  val_size = len(dataset) - train_size

  train_ds, val_ds = random_split(dataset, [train_size, val_size])

  train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
  val_dl = DataLoader(val_ds, batch_size=32)

  return train_dl, val_dl

#Create neural model

In [35]:
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(TweetClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim * 64, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)              # (batch_size, seq_len, embed_dim)
        flat = embedded.view(x.size(0), -1)       # Flatten: (batch_size, seq_len * embed_dim)
        out = F.relu(self.fc1(flat))              # (batch_size, hidden_dim)
        return self.fc2(out)                      # (batch_size, output_dim)


In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report


In [37]:
def train_eval_model(model, epochs,train_dl,val_dl):
    for epoch in range(epochs):
      model.train()
      total_loss = 0

      for xb, yb, in train_dl:
          preds = model(xb)
          loss = loss_fn(preds, yb)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          total_loss += loss.item()

      print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    model.eval()
    correct, total = 0, 0

    predictions = []
    true_labels = []

    with torch.no_grad():
        for xb, yb in val_dl:
            preds = model(xb)
            predicted = torch.argmax(preds, dim=1)
            correct += (predicted == yb).sum().item()
            total += yb.size(0)

            predictions.extend(predicted)
            true_labels.extend(yb)


    print(f"Validation Accuracy: {correct / total:.2%}")

    return predictions, true_labels


In [38]:
def metrics(predictions, true_labels):

  class_names = ['negative','positive','neutral']
  print("\n--- Full Classification Report ---")
  print(classification_report(true_labels, predictions, target_names=class_names, zero_division=0))

## BPE model

In [39]:
model = TweetClassifier(vocab_size=bpe_tokenizer.vocab_size, embed_dim=64, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [40]:
train_dl, val_dl = train_val_dataloader(sesotho_bpe_encoding['input_ids'], torch.tensor(sesotho_df['Final_Label'].tolist()))

### evaluate model

In [41]:
predictions, true_labels = train_eval_model(model,10,train_dl,val_dl)

Epoch 1, Loss: 123.1979
Epoch 2, Loss: 105.1231
Epoch 3, Loss: 97.4714
Epoch 4, Loss: 84.1188
Epoch 5, Loss: 71.7163
Epoch 6, Loss: 61.2870
Epoch 7, Loss: 48.2939
Epoch 8, Loss: 41.1225
Epoch 9, Loss: 32.8165
Epoch 10, Loss: 26.2431
Validation Accuracy: 54.61%


In [42]:
metrics(np.array(predictions),np.array(true_labels))


--- Full Classification Report ---
              precision    recall  f1-score   support

    negative       0.70      0.67      0.68       606
    positive       0.36      0.42      0.39       287
     neutral       0.34      0.31      0.32       149

    accuracy                           0.55      1042
   macro avg       0.47      0.46      0.46      1042
weighted avg       0.56      0.55      0.55      1042



# STF-idf

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

In [44]:
class STFIDFClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(STFIDFClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)


    def forward(self, x):
        out = F.relu(self.fc1(x))
        return self.fc2(out)

In [45]:
sesotho_df

Unnamed: 0,sentence,Final_Label,predict_name,cleaned_sentence,bpe_tokens,word_piece_tokens
0,@user gwa tshwana rena ba bang a re kreye selo...,0,Sesotho,user gwa tshwana rena ba bang a re kreye selo mos,"[▁user, ▁gwa, ▁t, shwa, na, ▁rena, ▁ba, ▁bang,...","[user, g, ##wa, ts, ##hwa, ##na, ren, ##a, ba,..."
1,e bata goal spurs,2,Sesotho,e bata goal spurs,"[▁e, ▁bata, ▁goal, ▁, spur, s]","[e, bat, ##a, goal, spurs]"
2,@user @user ke nahana taba eno ea ho batla ho ...,0,multi,user user ke nahana taba eno ea ho batla ho kh...,"[▁user, ▁user, ▁ke, ▁na, hana, ▁taba, ▁en, o, ...","[user, user, ke, nah, ##ana, tab, ##a, en, ##o..."
3,@user lotho hle empa fela ke ipotela,1,Sesotho,user lotho hle empa fela ke ipotela,"[▁user, ▁lot, ho, ▁, hle, ▁emp, a, ▁fel, a, ▁k...","[user, lot, ##ho, h, ##le, em, ##pa, fe, ##la,..."
4,@user @user keu utloa hantle,1,Sesotho,user user keu utloa hantle,"[▁user, ▁user, ▁ke, u, ▁ut, lo, a, ▁han, tle]","[user, user, ke, ##u, ut, ##lo, ##a, han, ##tle]"
...,...,...,...,...,...,...
5204,KOMPONE E NEHELANA KA KOLOI HO SEPOLESA,1,,kompone e nehelana ka koloi ho sepolesa,"[▁kompon, e, ▁e, ▁ne, hela, na, ▁ka, ▁kolo, i,...","[ko, ##mp, ##one, e, ne, ##hel, ##ana, ka, ko,..."
5205,MOSEBETSI OA HO NEHELANA LE HO BULA SEMMUSO MO...,1,,mosebetsi oa ho nehelana le ho bula semmuso mo...,"[▁mos, e, bet, si, ▁o, a, ▁ho, ▁ne, hela, na, ...","[mo, ##se, ##bet, ##si, o, ##a, ho, ne, ##hel,..."
5206,MOHLOMPHEHI TONAKHOLO O FIHLA SETEREKENG MOO A...,1,,mohlomphehi tonakholo o fihla seterekeng moo a...,"[▁mohlo, m, phe, hi, ▁tona, kho, lo, ▁o, ▁fi, ...","[mo, ##hl, ##omp, ##he, ##hi, ton, ##akh, ##ol..."
5207,KETEKELO EA KHOELI EA SEPOLESA,1,,ketekelo ea khoeli ea sepolesa,"[▁kete, kel, o, ▁ea, ▁kho, eli, ▁ea, ▁se, pole...","[ke, ##tek, ##elo, ea, k, ##hoe, ##li, ea, sep..."


## N-gram

In [46]:

stfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), min_df=2)
stf_idf_matrix = stfidf_vectorizer.fit_transform(sesotho_df['sentence'])
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())

stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)
print(stf_idf_tensor.shape)
print(stf_idf_vocab_size)

torch.Size([5209, 22255])
22255


In [47]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [48]:
model = STFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=128, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [49]:
predictions, true_labels = train_eval_model(model,10,train_dl,val_dl)

Epoch 1, Loss: 116.5435
Epoch 2, Loss: 84.3772
Epoch 3, Loss: 54.3893
Epoch 4, Loss: 33.1206
Epoch 5, Loss: 20.1792
Epoch 6, Loss: 13.0068
Epoch 7, Loss: 8.7058
Epoch 8, Loss: 5.9769
Epoch 9, Loss: 4.3239
Epoch 10, Loss: 3.2497
Validation Accuracy: 63.15%


In [50]:
metrics(np.array(predictions),np.array(true_labels))


--- Full Classification Report ---
              precision    recall  f1-score   support

    negative       0.71      0.80      0.75       604
    positive       0.48      0.44      0.46       291
     neutral       0.48      0.30      0.37       147

    accuracy                           0.63      1042
   macro avg       0.56      0.52      0.53      1042
weighted avg       0.61      0.63      0.62      1042



## BPE

In [51]:
subword_bpe_tokenised_documnets = [" ".join(s) for s in sesotho_df['bpe_tokens']]
subword_bpe_tokenised_documnets[0:3]

['▁user ▁gwa ▁t shwa na ▁rena ▁ba ▁bang ▁a ▁re ▁kre ye ▁se lo ▁mos',
 '▁e ▁bata ▁goal ▁ spur s',
 '▁user ▁user ▁ke ▁na hana ▁taba ▁en o ▁ea ▁ho ▁bat la ▁ho ▁khe tha ▁ho bane ▁re ▁she bile ▁our ▁own ▁benefits ▁re ▁le ▁bat ho ▁ke ▁e ona ▁e ▁sent seng ▁polo tik i ▁ea ▁les ot ho ▁so ▁we ▁went ▁behind ▁as sho les ▁for ▁years ▁co z ▁re ▁she bile ▁mele mo ▁e ▁direct ▁rele ▁bat ho ▁a ▁ee a ▁sebe let sa ▁n']

In [52]:
stfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)
stf_idf_matrix = stfidf_vectorizer.fit_transform(subword_bpe_tokenised_documnets)
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())

stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)
print(stf_idf_tensor.shape)
print(stf_idf_vocab_size)


torch.Size([5209, 11834])
11834


In [53]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [54]:
model = STFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=64, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [55]:
predictions, true_labels = train_eval_model(model,10,train_dl,val_dl)

Epoch 1, Loss: 124.5235
Epoch 2, Loss: 97.6912
Epoch 3, Loss: 68.3571
Epoch 4, Loss: 44.2495
Epoch 5, Loss: 27.8957
Epoch 6, Loss: 18.1000
Epoch 7, Loss: 12.3858
Epoch 8, Loss: 8.7986
Epoch 9, Loss: 6.6107
Epoch 10, Loss: 5.2260
Validation Accuracy: 63.63%


In [56]:
metrics(np.array(predictions),np.array(true_labels))


--- Full Classification Report ---
              precision    recall  f1-score   support

    negative       0.71      0.80      0.76       604
    positive       0.51      0.43      0.46       312
     neutral       0.45      0.36      0.40       126

    accuracy                           0.64      1042
   macro avg       0.56      0.53      0.54      1042
weighted avg       0.62      0.64      0.62      1042



## Word piece

In [57]:
subword_word_piece_tokenised_documnets = [" ".join(s) for s in sesotho_df['word_piece_tokens']]
subword_word_piece_tokenised_documnets[0:3]

['user g ##wa ts ##hwa ##na ren ##a ba bang a re k ##rey ##e se ##lo mo ##s',
 'e bat ##a goal spurs',
 'user user ke nah ##ana tab ##a en ##o ea ho bat ##la ho k ##het ##ha ho ##bane re she ##bil ##e our own benefits re le bath ##o ke e ##ona e sent ##sen ##g polo ##ti ##ki ea les ##otho so we went behind asshole ##s for years co ##z re she ##bil ##e mel ##em ##o e direct re ##le bath ##o a ee ##a se ##bel ##ets ##a n']

In [58]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [59]:
model = STFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=64, output_dim=3)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [60]:
predictions, true_labels = train_eval_model(model,10,train_dl,val_dl)

Epoch 1, Loss: 127.4507
Epoch 2, Loss: 98.6216
Epoch 3, Loss: 67.7316
Epoch 4, Loss: 43.1112
Epoch 5, Loss: 27.0854
Epoch 6, Loss: 17.3940
Epoch 7, Loss: 11.7270
Epoch 8, Loss: 8.2733
Epoch 9, Loss: 6.1526
Epoch 10, Loss: 4.7472
Validation Accuracy: 66.03%


In [61]:
metrics(np.array(predictions),np.array(true_labels))


--- Full Classification Report ---
              precision    recall  f1-score   support

    negative       0.74      0.82      0.78       636
    positive       0.49      0.44      0.47       267
     neutral       0.51      0.36      0.42       139

    accuracy                           0.66      1042
   macro avg       0.58      0.54      0.55      1042
weighted avg       0.64      0.66      0.65      1042



# Large Classifier

In [62]:
class LargeSTFIDFClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LargeSTFIDFClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        self.fc5 = nn.Linear(hidden_dim, hidden_dim)
        self.fc6 = nn.Linear(hidden_dim, hidden_dim)
        self.fc7 = nn.Linear(hidden_dim, output_dim)


    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = F.relu(self.fc3(out))
        out = F.relu(self.fc4(out))
        out = F.relu(self.fc5(out))
        out = F.relu(self.fc6(out))
        return self.fc7(out)

In [63]:
subword_bpe_tokenised_documnets = [" ".join(s) for s in sesotho_df['bpe_tokens']]

In [64]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))
stfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)
stf_idf_matrix = stfidf_vectorizer.fit_transform(subword_bpe_tokenised_documnets)
stf_idf_vocab_size = len(stfidf_vectorizer.get_feature_names_out())


# Convert to PyTorch tensor
stf_idf_tensor = torch.tensor(stf_idf_matrix.toarray(), dtype=torch.float32)

In [65]:
train_dl, val_dl = train_val_dataloader(stf_idf_tensor, torch.tensor(sesotho_df['Final_Label'].tolist()))

In [66]:
dims = [128,256]
epochs = [10,15,20]
lr = [1e-3, 1e-4, 3e-4]

for dim in dims:
  for epoch in epochs:
    for l_r in lr:

      print(f"\ndim: {dim}, epochs: {epoch}, lr: {l_r}")

      model = LargeSTFIDFClassifier(input_dim=stf_idf_vocab_size, hidden_dim=dim, output_dim=3)
      loss_fn = nn.CrossEntropyLoss()
      optimizer = torch.optim.Adam(model.parameters(), lr=l_r)

      predictions, true_labels = train_eval_model(model,epoch,train_dl,val_dl)
      metrics(np.array(predictions),np.array(true_labels))


dim: 128, epochs: 10, lr: 0.001
Epoch 1, Loss: 117.3561
Epoch 2, Loss: 81.5181
Epoch 3, Loss: 55.2805
Epoch 4, Loss: 41.6384
Epoch 5, Loss: 22.7981
Epoch 6, Loss: 11.6716
Epoch 7, Loss: 7.0045
Epoch 8, Loss: 6.2383
Epoch 9, Loss: 5.3585
Epoch 10, Loss: 4.6864
Validation Accuracy: 61.90%

--- Full Classification Report ---
              precision    recall  f1-score   support

    negative       0.77      0.69      0.73       614
    positive       0.49      0.57      0.52       295
     neutral       0.36      0.41      0.38       133

    accuracy                           0.62      1042
   macro avg       0.54      0.55      0.55      1042
weighted avg       0.64      0.62      0.63      1042


dim: 128, epochs: 10, lr: 0.0001
Epoch 1, Loss: 134.3566
Epoch 2, Loss: 120.1593
Epoch 3, Loss: 101.4915
Epoch 4, Loss: 80.5174
Epoch 5, Loss: 60.5212
Epoch 6, Loss: 45.3582
Epoch 7, Loss: 34.0411
Epoch 8, Loss: 23.4612
Epoch 9, Loss: 17.3232
Epoch 10, Loss: 14.4486
Validation Accuracy: 62.00