<a href="https://colab.research.google.com/github/NadhemBenhadjali/-Swahili-News-Classification-LLM-Finetuning-Multiclassification/blob/main/LLM_Finetuning_Multiclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torch lightning matplotlib pandas torchmetrics watermark transformers datasets -U




In [2]:
import os
import time
from datasets import Dataset
from lightning import Fabric
import torch
from torch.utils.data import DataLoader
import torchmetrics
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from watermark import watermark
import pandas as pd
import numpy as np

# Define the tokenize function
def tokenize_text(batch):
    return tokenizer(batch["content"], truncation=True, padding=True, max_length=512)

# Define the training function
def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
    for epoch in range(num_epochs):
        train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=5).to(fabric.device)
        for batch_idx, batch in enumerate(train_loader):
            model.train()
            outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["category"])
            fabric.backward(outputs["loss"])
            optimizer.step()
            optimizer.zero_grad()
            if not batch_idx % 300:
                print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {outputs['loss']:.4f}")
            model.eval()
            with torch.no_grad():
                predicted_labels = torch.argmax(outputs["logits"], 1)
                train_acc.update(predicted_labels, batch["category"])
        model.eval()
        with torch.no_grad():
            val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=5).to(fabric.device)
            for batch in val_loader:
                outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["category"])
                predicted_labels = torch.argmax(outputs["logits"], 1)
                val_acc.update(predicted_labels, batch["category"])
            print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
            train_acc.reset(), val_acc.reset()




In [3]:
print(watermark(packages="torch,lightning,transformers", python=True))
print("Torch CUDA available?", torch.cuda.is_available())
device = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(123)
# torch.use_deterministic_algorithms(True)

# Load the new dataset
train_df = pd.read_csv('Train (11).csv')
test_df = pd.read_csv('Test (7).csv')

# Ensure the swahili_id column exists in the DataFrames
assert 'id' in train_df.columns, "Train DataFrame is missing 'swahili_id' column"
assert 'swahili_id' in test_df.columns, "Test DataFrame is missing 'swahili_id' column"

# Map category names to indices
category_map = {"Kitaifa": 0, "michezo": 1, "Biashara": 2, "Kimataifa": 3, "Burudani": 4}
train_df['category'] = train_df['category'].map(category_map)

# Split the train dataframe into train and validation dataframes
train_size = int(0.8 * len(train_df))
train_df, val_df = train_df.iloc[:train_size], train_df.iloc[train_size:]

# Create Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m", max_length=512)
print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)

print("Tokenizing ...", flush=True)
train_dataset = train_dataset.map(tokenize_text, batched=True, batch_size=None)
val_dataset = val_dataset.map(tokenize_text, batched=True, batch_size=None)
test_dataset = test_dataset.map(tokenize_text, batched=True, batch_size=None)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "category", "id"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "category", "id"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "swahili_id"])
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True, num_workers=2, drop_last=False)
val_loader = DataLoader(dataset=val_dataset, batch_size=4, num_workers=2, drop_last=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=4, num_workers=2, drop_last=False)

# Model
fabric = Fabric(accelerator="cuda", devices=1, precision="16-mixed")
fabric.launch()

model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloom-560m", num_labels=5)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

model, optimizer = fabric.setup(model, optimizer)
train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader, val_loader, test_loader)

# Finetuning
start = time.time()
train(num_epochs=1, model=model, optimizer=optimizer, train_loader=train_loader, val_loader=val_loader, fabric=fabric)
end = time.time()
elapsed = end-start
print(f"Time elapsed {elapsed/60:.2f} min")



Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

torch       : 2.3.1
lightning   : 2.2.5
transformers: 4.41.2

Torch CUDA available? True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizer input max length: 1000000000000000019884624838656
Tokenizer vocabulary size: 250680
Tokenizing ...


Map:   0%|          | 0/4120 [00:00<?, ? examples/s]

Map:   0%|          | 0/1031 [00:00<?, ? examples/s]

Map:   0%|          | 0/1030 [00:00<?, ? examples/s]

INFO: Using 16-bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16-bit Automatic Mixed Precision (AMP)
Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0001/0001 | Batch 0000/1030 | Loss: 19.7312
Epoch: 0001/0001 | Batch 0300/1030 | Loss: 0.1714
Epoch: 0001/0001 | Batch 0600/1030 | Loss: 0.1020
Epoch: 0001/0001 | Batch 0900/1030 | Loss: 0.0753


  self.pid = os.fork()
  self.pid = os.fork()


Epoch: 0001/0001 | Train acc.: 74.71% | Val acc.: 84.87%
Time elapsed 13.01 min


In [4]:
# create submission file
with torch.no_grad():
    model.eval()
    predictions = []
    for batch in test_loader:
        outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"])
        predicted_probs = torch.softmax(outputs["logits"], dim=1)
        predictions.append(predicted_probs.cpu().numpy())

predictions = np.vstack(predictions)
submission_df = pd.DataFrame(predictions, columns=["Kitaifa", "michezo", "Biashara", "Kimataifa", "Burudani"])
submission_df['test_id'] = test_df['swahili_id']
submission_df = submission_df[['test_id',"Kitaifa",
"michezo" ,
"Biashara" ,
"Kimataifa" ,
"Burudani"  ,     ]]
submission_df.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")


Submission file created: submission.csv
