<a href="https://colab.research.google.com/github/Koekoele/JavaFXGallery/blob/main/modeltraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
%pip install -U transformers

Collecting transformers
  Downloading transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Downloading transformers-5.2.0-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed transformers-5.2.0


In [7]:
%pip install -q datasets scikit-learn evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
import sys
!{sys.executable} -m pip install torch torchvision torchaudio



In [76]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
import numpy as np
import pandas as pd
from datasets import Dataset

In [77]:
from google.colab import files
import io

# Upload the dataset file
uploaded = files.upload()

# Assuming the uploaded file is 'dataset.csv'
file_name = list(uploaded.keys())[0]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))
df.head(5)

Saving dataset.csv to dataset (1).csv


Unnamed: 0,Label,text
0,Offensive,ba entse phoso ba ska monyoba ka tronkong
1,Offensive,le nka mobolaea
2,Hate,baruti ba kereke ena hantle hoetsahalang ka bo...
3,Hate,ekare baka chesoa batho ba khopo hakaalo ka ma...
4,Hate,ke matekatsi hampe linyalasi tsena kere lebasa...


In [79]:
#create a number label
df['hate'] = df['Label'].apply(lambda x : 1 if x=='Offensive' or x=='Hate' else 0)
df.sample(5)

Unnamed: 0,Label,text,Hate,hate
2794,Neutral,le wena ako tlohelle ho rwakana hle,0,0
2592,Offensive,haothola osheba hona sekolong mono hona le mis...,1,1
36,Neutral,ebe haho kaba hwatla kelellong ya bona hore ba...,0,0
2002,Neutral,ho jeremiah maria unaso hlahelle tsebelisong e...,0,0
3016,Neutral,hela tlohelang bo khaitseli ba thusa sechaba b...,0,0


In [81]:
df['label'] = df['Label'].isin(['Hate', 'Offensive']).astype(int)

print(df['label'].value_counts())

label
1    1511
0    1511
Name: count, dtype: int64


In [82]:
#load model
model_checkpoint  = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [83]:
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_column("Label", "labels")

In [84]:
#preprocessing or tokenize data
def preprocess_function(example):
    return tokenizer(example['text'], truncation=True, padding="max_length")
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3022 [00:00<?, ? examples/s]

In [94]:
#Load model for classification
#num_labels = df["Hate"].nunique()
# 2-class classification
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Freeze all encoder parameters
for param in model.base_model.parameters():
    param.requires_grad = False

# Unfreeze the last 2 encoder layers
for layer in model.base_model.encoder.layer[-2:]:
    for param in layer.parameters():
        param.requires_grad = True

# Make pooler and classifier trainable
if hasattr(model.base_model, "pooler") and model.base_model.pooler is not None:
    for param in model.base_model.pooler.parameters():
        param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mXLMRobertaForSequenceClassification LOAD REPORT[0m from: Davlan/afro-xlmr-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [95]:
  text = "Banna ba Lesotho ke litja"

inputs = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True
)

In [96]:
# assuming your tokenized input is called 'inputs'
input_ids = inputs['input_ids'][0]  # take the first example
input_ids_list = input_ids.tolist()  # convert tensor to list
tokens = tokenizer.convert_ids_to_tokens(input_ids_list)
print(tokens)

['<s>', '▁Ban', 'na', '▁ba', '▁Les', 'ot', 'ho', '▁ke', '▁lit', 'ja', '</s>']


In [101]:
#check model trainable parameters
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {
        "Total": total_params,
        "Trainable": trainable_params,
        "Frozen": total_params - trainable_params
    }
param_counts = count_parameters(model)
print('Model Parameters Counts: ')
for k, v in param_counts.items():
    print(f"{k} : {v:,}")

Model Parameters Counts: 
Total : 278,045,186
Trainable : 14,767,874
Frozen : 263,277,312


In [102]:
import torch


In [103]:
#check model outputs
outputs = model(**inputs)
probs = torch.softmax(outputs.logits, dim=1)
print(probs)
print(outputs)

tensor([[0.4671, 0.5329]], grad_fn=<SoftmaxBackward0>)
SequenceClassifierOutput(loss=None, logits=tensor([[-0.1229,  0.0090]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [104]:
df.head()

Unnamed: 0,Label,text,Hate,hate,label
0,Offensive,ba entse phoso ba ska monyoba ka tronkong,1,1,1
1,Offensive,le nka mobolaea,1,1,1
2,Hate,baruti ba kereke ena hantle hoetsahalang ka bo...,1,1,1
3,Hate,ekare baka chesoa batho ba khopo hakaalo ka ma...,1,1,1
4,Hate,ke matekatsi hampe linyalasi tsena kere lebasa...,1,1,1


In [105]:
df.groupby('Label').describe()

Unnamed: 0_level_0,Hate,Hate,Hate,Hate,Hate,Hate,Hate,Hate,hate,hate,hate,hate,hate,label,label,label,label,label,label,label,label
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Hate,484.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,484.0,1.0,...,1.0,1.0,484.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Neutral,1511.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1511.0,0.0,...,0.0,0.0,1511.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Offensive,1027.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1027.0,1.0,...,1.0,1.0,1027.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [64]:
from sklearn.model_selection import train_test_split

# Assuming your dataset is df with columns 'text' and 'label'
train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

In [106]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

In [107]:
#preprocessing or tokenize data
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True, padding='max_length')

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2417 [00:00<?, ? examples/s]

Map:   0%|          | 0/605 [00:00<?, ? examples/s]

In [108]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 2)

for param in model.base_model.parameters():
    param.requires_grad = False

for param in model.base_model.encoder.layer[-2:].parameters():
    param.requires_grad = True

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mXLMRobertaForSequenceClassification LOAD REPORT[0m from: Davlan/afro-xlmr-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [109]:
#check model trainable parameters
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {
        "Total": total_params,
        "Trainable": trainable_params,
        "Frozen": total_params - trainable_params
    }
param_counts = count_parameters(model)
print('Model Parameters Counts: ')
for k, v in param_counts.items():
    print(f"{k} : {v:,}")

Model Parameters Counts: 
Total : 278,045,186
Trainable : 14,767,874
Frozen : 263,277,312


In [110]:
#check model outputs
outputs = model(**inputs)
probs = torch.softmax(outputs.logits, dim=1)
print(probs)
print(outputs)

tensor([[0.4712, 0.5288]], grad_fn=<SoftmaxBackward0>)
SequenceClassifierOutput(loss=None, logits=tensor([[-0.1122,  0.0032]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [111]:
import evaluate
import numpy as np

# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # lowercase 'labels' is standard
    predictions = np.argmax(logits, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [112]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

In [113]:
#initialize trainer

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,

)

In [114]:
#train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.640012,0.60401,0.657851
2,0.572473,0.551365,0.705785
3,0.505538,0.532573,0.740496


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

TrainOutput(global_step=456, training_loss=0.6201830799119514, metrics={'train_runtime': 482.4954, 'train_samples_per_second': 15.028, 'train_steps_per_second': 0.945, 'total_flos': 1907818262415360.0, 'train_loss': 0.6201830799119514, 'epoch': 3.0})

In [140]:
#classify based on labels
label_map = {
    "Hate/Offensive": 1,
    "Neutral": 0
}

In [141]:
import torch
import torch.nn.functional as F

# Invert label_map to get ID → label
id2label = {v: k for k, v in label_map.items()}

def classify(text):
    # Tokenize the text
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  # Convert logits to probabilities
        predicted_class_id = probs.argmax(dim=-1).item()
        predicted_label = id2label[predicted_class_id]
        predicted_prob = probs[0, predicted_class_id].item()

    return predicted_label, predicted_prob

In [143]:
label, confidence = classify("ipolae monna hao keeng bothata")
print(f"Predicted label: {label}, Confidence: {confidence:.4f}")

Predicted label: Hate/Offensive, Confidence: 0.5954


In [37]:
print(classify("he banna ke taba tse kakang"))

Hate/Offensive
