In [1]:
import mlflow
import torch
import numpy as np
import optuna
import os
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler,PreTrainedTokenizerBase
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,precision_score, recall_score

In [2]:
# --- Configuration ---
MODEL_CHECKPOINT = "distilbert-base-uncased"
DATASET_NAME = "saheedniyi/naijaweb"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 2e-5

In [3]:
raw_dataset = load_dataset(DATASET_NAME)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'link', 'token_count', 'section', 'int_score', 'language', 'language_probability'],
        num_rows: 270137
    })
})

In [4]:
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'link', 'token_count', 'section', 'int_score', 'language', 'language_probability'],
        num_rows: 270137
    })
})


In [5]:
# load dataset as a dataframe

raw_data = raw_dataset['train']
posts_dataframe = raw_data.to_pandas()
posts_dataframe = posts_dataframe[['text','section']]
posts_dataframe.head()

Unnamed: 0,text,section
0,Governor Samuel Ortom of Benue State\nBy Peter...,Politics
1,NewsHelm.com offers a unique blend of modernit...,Politics
2,BY LEVINUS NWABUGHIOGU & EMMAN OVUAKPORIE\nABU...,Politics
3,Eyitayo Jegede and Jimoh Ibrahim\nBy Ikechukwu...,Politics
4,"Mrs. Helen Mark, wife of Senator David Mark, p...",Politics


In [6]:
# check the categories.

posts_dataframe['section'].unique()

array(['Politics', 'Sports', 'Music-Radio', 'Travel', 'Phones', 'Romance',
       'Family', 'Business', 'Education', 'Religion', 'TV-Movies',
       'Health', 'Food', 'Career', 'Car Talk', 'Culture', 'Programming',
       'Fashion', 'Literature', None], dtype=object)

In [7]:
posts_dataframe['section'].value_counts()

section
Politics       131126
Sports          20739
Business        15478
Education       12084
Religion        11755
Health          11637
Romance          6131
Travel           5857
Phones           5230
Family           4023
Career           3107
Car Talk         2800
Fashion          2469
Programming      2263
Music-Radio      2221
TV-Movies        2191
Culture          1968
Food             1607
Literature        808
Name: count, dtype: int64

In [None]:
# merge some categories. ['Phone', 'Programming'],['Music-Radi]

relationships = ['Romance', 'Family']
lifestyle = ['Travel', 'Food', 'Fashion','Career']
technology = ['Phones', 'Programming','Car Talk']
entertainment = ['Music-Radio', 'TV-Movies', 'Culture', 'Literature']

# create a new section.
posts_dataframe['section'] = ['Relationship' if x in relationships else
                            'Lifestyle' if x in lifestyle else
                            'Technology' if x in technology else
                            'Entertainment' if x in entertainment else
                            x for x in posts_dataframe['section']]

posts_dataframe['section'].value_counts()


section
Politics         131126
Sports            20739
Business          15478
Lifestyle         13040
Education         12084
Religion          11755
Health            11637
Technology        10293
Relationship      10154
Entertainment      7188
Name: count, dtype: int64

In [9]:
# inspect categories with None
posts_dataframe.isna().sum()

text           0
section    26643
dtype: int64

In [10]:
# drop the missing values, reset index and see how that affects the final dataset.

posts_dataframe.dropna(inplace=True)
posts_dataframe.reset_index(drop=True, inplace=True)


posts_dataframe['section'].value_counts()

section
Politics         131126
Sports            20739
Business          15478
Lifestyle         13040
Education         12084
Religion          11755
Health            11637
Technology        10293
Relationship      10154
Entertainment      7188
Name: count, dtype: int64

In [None]:
# create a label column by label encoding the section column

new_categories = list(posts_dataframe['section'].unique())
categories_map = {category:idx for idx, category in enumerate(new_categories)}
posts_dataframe['label'] = posts_dataframe['section'].map(categories_map)
posts_dataframe.head()

Unnamed: 0,text,section,label
0,Governor Samuel Ortom of Benue State\nBy Peter...,Politics,0
1,NewsHelm.com offers a unique blend of modernit...,Politics,0
2,BY LEVINUS NWABUGHIOGU & EMMAN OVUAKPORIE\nABU...,Politics,0
3,Eyitayo Jegede and Jimoh Ibrahim\nBy Ikechukwu...,Politics,0
4,"Mrs. Helen Mark, wife of Senator David Mark, p...",Politics,0


In [14]:
posts_dataframe['label'].value_counts()

label
0    131126
1     20739
6     15478
3     13040
7     12084
8     11755
9     11637
4     10293
5     10154
2      7188
Name: count, dtype: int64

In [15]:
## reduce the dataset size so each class has 7188 samples for balance.

target_count = 7188

posts_dataframe = (
    posts_dataframe.groupby('label')
    .apply(lambda x: x.sample(n=target_count, random_state=42))
    .reset_index(drop=True)
)

In [17]:
posts_dataframe['label'].value_counts()

label
0    7188
1    7188
2    7188
3    7188
4    7188
5    7188
6    7188
7    7188
8    7188
9    7188
Name: count, dtype: int64

In [18]:
# get tokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [19]:
# prepare the dataset

class TextClassificationDataset(Dataset):
    def __init__(self, tokenizer:PreTrainedTokenizerBase,
                 dataframe: pd.DataFrame, max_length:int = 256) -> None:
      self.texts = dataframe['text'].tolist()
      self.labels = dataframe['label'].tolist()
      self.tokenizer = tokenizer
      self.max_length = max_length

    def __len__(self) -> int:
      return len(self.labels)

    def __getitem__(self, idx) -> dict:
      text = self.texts[idx]
      label = self.labels[idx]

      encoding = self.tokenizer.encode_plus(
          text = text,
          max_length = self.max_length,
          truncation= True,
          padding = 'max_length',
          return_tensors = 'pt',
          add_special_tokens= True
      )
      input_ids = encoding['input_ids'].flatten()
      attention_mask = encoding['attention_mask'].flatten()

      return {
          'input_ids': input_ids,
          'attention_mask': attention_mask,
          'label': torch.tensor(data=label, dtype=torch.long)
      }



In [15]:
#posts_dataframe = posts_dataframe.head(100)
#posts_dataframe.shape

In [20]:
train_data, test_data = train_test_split(posts_dataframe, test_size=0.2,
                                         random_state=23, shuffle=True)
train_dataset = TextClassificationDataset(tokenizer=tokenizer, dataframe=train_data)
test_dataset = TextClassificationDataset(tokenizer=tokenizer, dataframe=test_data)

In [21]:
train_dataset[28]

{'input_ids': tensor([  101,  2005,  2055,  1002,  4278,  1010,  2199,  1010,  2017,  2071,
          2380,  1037,  4435,  2047, 12559, 21759, 20535, 13642, 12380,  7983,
          1999,  2115,  7381,  2030, 11202,  1012,  2030,  2065,  2017,  2215,
          2000,  5247,  2019,  4469,  1002,  1021,  2454,  1010,  2017,  2071,
          2380,  2023,  2944,  1997,  2028,  1999,  2115,  7939,  1012,  2328,
          2011,  2728, 26546,  2368,  3330,  1010,  1996,  2944,  2001,  3728,
          3591,  2012,  1996,  9780,  8285,  2265,  1012,  2429,  2000,  1996,
          2194,  1521,  1055,  4037,  1010, 12559, 21759, 20535,  2001,  2019,
          3733,  3601,  2138,  1996,  8285,  8571,  1521,  1055,  3765,  2024,
          1523,  7262,  1010, 18753,  1998, 10368,  1010,  1998,  4895, 23738,
         11905,  6321,  3059,  1012,  1524,  4189,  2438,  1010,  2021,  2025,
          2130,  2019, 13642, 12380,  7983,  2038,  2419, 18167,  2081,  2041,
          1997,  2613, 11719,  1012,  1

In [22]:
# --- Configuration ---
MODEL_CHECKPOINT = "distilbert-base-uncased"
DATASET_NAME = "saheedniyi/naijaweb"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 64
EPOCHS = 5
LEARNING_RATE = 2e-5

In [24]:
DEVICE

'cuda'

In [25]:
#setup dataloader

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [26]:
# setup the model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=posts_dataframe['label'].nunique())
model = model.to(DEVICE)
optimizer = AdamW(params=model.parameters(), lr = LEARNING_RATE)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from typing import Tuple


# define the training loop
def training_loop(model: AutoModelForSequenceClassification,
                  train_loader:DataLoader,epoch: int) -> Tuple[float]:
  model.train()
  epoch_loss,epoch_labels, epoch_preds = [],[],[]
  train_batch = tqdm(train_loader, desc=f"Training epoch {epoch}/{EPOCHS}")
  for batch in train_batch:
    input_ids, attention_mask, label = batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE), batch['label'].to(DEVICE)
    output = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
    batch_loss = output.loss
    logits = output.logits

    # back prop
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    #update the loss
    epoch_loss.append(batch_loss.item())
    epoch_labels.extend(label.cpu().detach().numpy())
    # convert logits to preds and update the preds list
    _,preds = torch.max(logits, 1)
    epoch_preds.extend(preds.cpu().detach().numpy())

  # compute f1_score, precision, recall and accuracy.
  accuracy = accuracy_score(y_true=epoch_labels, y_pred=epoch_preds)
  precision = precision_score(y_true=epoch_labels, y_pred=epoch_preds, average="weighted", zero_division=0.0)
  recall = recall_score(y_true=epoch_labels, y_pred=epoch_preds, average="weighted")
  f1 = f1_score(y_true=epoch_labels, y_pred=epoch_preds, average="weighted")
  loss_score = sum(epoch_loss)/len(epoch_loss)

  return accuracy,precision,recall,f1,loss_score

In [28]:
# define the validation loop
def test_loop(model: AutoModelForSequenceClassification,
                  test_loader:DataLoader,epoch: int) -> Tuple[float]:
  model.eval()
  epoch_loss,epoch_labels, epoch_preds = [],[],[]
  with torch.no_grad():
    test_batch = tqdm(test_loader, desc=f"Testing epoch {epoch}/{EPOCHS}")
    for batch in test_batch:
      input_ids, attention_mask, label = batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE), batch['label'].to(DEVICE)
      output = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
      batch_loss = output.loss
      logits = output.logits

      #update the loss
      epoch_loss.append(batch_loss.item())
      epoch_labels.extend(label.cpu().detach().numpy())
      # convert logits to preds and update the preds list
      _,preds = torch.max(logits, 1)
      epoch_preds.extend(preds.cpu().detach().numpy())
  # compute f1_score, precision, recall and accuracy.
  accuracy = accuracy_score(y_true=epoch_labels, y_pred=epoch_preds)
  precision = precision_score(y_true=epoch_labels, y_pred=epoch_preds, average="weighted", zero_division=0.0)
  recall = recall_score(y_true=epoch_labels, y_pred=epoch_preds, average="weighted")
  f1 = f1_score(y_true=epoch_labels, y_pred=epoch_preds, average="weighted")
  loss_score = sum(epoch_loss)/len(epoch_loss)

  return accuracy,precision,recall,f1,loss_score


In [29]:
# setup the training and test loop.

def full_training_loop(epochs:int = EPOCHS) -> Tuple[dict]:
  mlflow.set_experiment("Naija Web Custom Classifier.")
  with mlflow.start_run(run_name="sanity check") as run:
    mlflow.log_params(params = {
        "model checkpoint": MODEL_CHECKPOINT,
        "dataset name": DATASET_NAME,
        'device': DEVICE,
        'batch size': BATCH_SIZE,
        'epochs': EPOCHS,
        'learning_rate': LEARNING_RATE
    })
    for epoch in range(EPOCHS):
      epoch += 1
      print("=" * 50)
      accuracy,precision,recall,f1,loss_score = training_loop(model=model,
                                                              train_loader=train_loader,epoch=epoch)
      train_metrics = {
          'train_accuracy':accuracy,
          'train_precision': precision,
          'train_f1': f1,
          'train_loss': loss_score
      }
      # begin evaluation
      accuracy,precision,recall,f1,loss_score = test_loop(model=model,
                                                              test_loader=train_loader,epoch=epoch)
      test_metrics = {
          'test_accuracy':accuracy,
          'test_precision': precision,
          'test_f1': f1,
          'test_loss': loss_score
      }
      train_metrics.update(test_metrics)
      mlflow.log_metrics(metrics=train_metrics, step=epoch)
      print(f"Completed training and testing for epoch {epoch}/{EPOCHS}:\n Train Metrics = {train_metrics}. \n Test Metrics: {test_metrics}")

  # log the final model
  mlflow.pytorch.log_model(pytorch_model=model, artifact_path="model")
  return train_metrics, test_metrics


In [30]:
full_training_loop()



Training epoch 1/5:   0%|          | 0/899 [00:00<?, ?it/s]

Training epoch 1/5: 100%|██████████| 899/899 [11:51<00:00,  1.26it/s]
Testing epoch 1/5: 100%|██████████| 899/899 [06:25<00:00,  2.33it/s]


Completed training and testing for epoch 1/5:
 Train Metrics = {'train_accuracy': 0.7373574012242626, 'train_precision': 0.7381219506448011, 'train_f1': 0.7367304330670529, 'train_loss': 0.9146396412666435, 'test_accuracy': 0.7965880634390651, 'test_precision': 0.799917302554428, 'test_f1': 0.7956574989907677, 'test_loss': 0.6903152613671656}. 
 Test Metrics: {'test_accuracy': 0.7965880634390651, 'test_precision': 0.799917302554428, 'test_f1': 0.7956574989907677, 'test_loss': 0.6903152613671656}


Training epoch 2/5: 100%|██████████| 899/899 [11:53<00:00,  1.26it/s]
Testing epoch 2/5: 100%|██████████| 899/899 [06:26<00:00,  2.33it/s]


Completed training and testing for epoch 2/5:
 Train Metrics = {'train_accuracy': 0.7938056483027268, 'train_precision': 0.793540413208756, 'train_f1': 0.7932788750618518, 'train_loss': 0.6951403864830301, 'test_accuracy': 0.8375765164162493, 'test_precision': 0.8383366238997654, 'test_f1': 0.8370734514587335, 'test_loss': 0.5682693778043064}. 
 Test Metrics: {'test_accuracy': 0.8375765164162493, 'test_precision': 0.8383366238997654, 'test_f1': 0.8370734514587335, 'test_loss': 0.5682693778043064}


Training epoch 3/5: 100%|██████████| 899/899 [11:52<00:00,  1.26it/s]
Testing epoch 3/5: 100%|██████████| 899/899 [06:00<00:00,  2.49it/s]


Completed training and testing for epoch 3/5:
 Train Metrics = {'train_accuracy': 0.8294727323316639, 'train_precision': 0.8295286238589586, 'train_f1': 0.8291463962846044, 'train_loss': 0.5914982996375463, 'test_accuracy': 0.8732957707289928, 'test_precision': 0.877940804239007, 'test_f1': 0.8732726465426783, 'test_loss': 0.455683169544605}. 
 Test Metrics: {'test_accuracy': 0.8732957707289928, 'test_precision': 0.877940804239007, 'test_f1': 0.8732726465426783, 'test_loss': 0.455683169544605}


Training epoch 4/5: 100%|██████████| 899/899 [11:24<00:00,  1.31it/s]
Testing epoch 4/5: 100%|██████████| 899/899 [06:00<00:00,  2.50it/s]


Completed training and testing for epoch 4/5:
 Train Metrics = {'train_accuracy': 0.8654180578742349, 'train_precision': 0.8656319401002629, 'train_f1': 0.8652923496952457, 'train_loss': 0.47920647309937653, 'test_accuracy': 0.9194316917084029, 'test_precision': 0.9196737769496999, 'test_f1': 0.9194065542435921, 'test_loss': 0.32165953207831494}. 
 Test Metrics: {'test_accuracy': 0.9194316917084029, 'test_precision': 0.9196737769496999, 'test_f1': 0.9194065542435921, 'test_loss': 0.32165953207831494}


Training epoch 5/5: 100%|██████████| 899/899 [11:25<00:00,  1.31it/s]
Testing epoch 5/5: 100%|██████████| 899/899 [06:00<00:00,  2.50it/s]


Completed training and testing for epoch 5/5:
 Train Metrics = {'train_accuracy': 0.8991896215915415, 'train_precision': 0.8993334219902533, 'train_f1': 0.8991428217066244, 'train_loss': 0.3713033913041116, 'test_accuracy': 0.943430022259321, 'test_precision': 0.9437826219212042, 'test_f1': 0.9434335779717741, 'test_loss': 0.2298735607253829}. 
 Test Metrics: {'test_accuracy': 0.943430022259321, 'test_precision': 0.9437826219212042, 'test_f1': 0.9434335779717741, 'test_loss': 0.2298735607253829}




({'train_accuracy': 0.8991896215915415,
  'train_precision': 0.8993334219902533,
  'train_f1': 0.8991428217066244,
  'train_loss': 0.3713033913041116,
  'test_accuracy': 0.943430022259321,
  'test_precision': 0.9437826219212042,
  'test_f1': 0.9434335779717741,
  'test_loss': 0.2298735607253829},
 {'test_accuracy': 0.943430022259321,
  'test_precision': 0.9437826219212042,
  'test_f1': 0.9434335779717741,
  'test_loss': 0.2298735607253829})

In [31]:
model.save_pretrained("distilbert-finetuned-naijaweb")

In [32]:
tokenizer.save_pretrained("distilbert-finetuned-naijaweb")

('distilbert-finetuned-naijaweb/tokenizer_config.json',
 'distilbert-finetuned-naijaweb/special_tokens_map.json',
 'distilbert-finetuned-naijaweb/vocab.txt',
 'distilbert-finetuned-naijaweb/added_tokens.json',
 'distilbert-finetuned-naijaweb/tokenizer.json')

In [33]:
model.push_to_hub("distilbert-finetuned-naijaweb")
tokenizer.push_to_hub("distilbert-finetuned-naijaweb")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/muyiiwaa/distilbert-finetuned-naijaweb/commit/aaa3739328427e08b4f274a8fb73513ecb43a6ed', commit_message='Upload tokenizer', commit_description='', oid='aaa3739328427e08b4f274a8fb73513ecb43a6ed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/muyiiwaa/distilbert-finetuned-naijaweb', endpoint='https://huggingface.co', repo_type='model', repo_id='muyiiwaa/distilbert-finetuned-naijaweb'), pr_revision=None, pr_num=None)