<a href="https://www.kaggle.com/code/mirahaem/amharic-hatespeech-detector?scriptVersionId=213444778" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#installing the datasets package
!pip install datasets

In [None]:
#installing the transformers package
!pip install transformers

In [None]:
#installing the evaluate package
! pip install evaluate

In [None]:
#importing the datasets package
from datasets import Dataset
import datasets
import evaluate

In [None]:

#import numpy and pandas for mathematical computation and data manipulation respectively
import numpy as np
import pandas as pd
#import the pipeline of transformers
from transformers import pipeline
#import AutoTokenizer for tokenization purposes
from transformers import AutoTokenizer

In [None]:
#import torch
import torch
#import Data loader from torch
from torch.utils.data import DataLoader
#import an optimizer
from torch.optim import AdamW
#import tqdm for a progress bar
from tqdm.auto import tqdm


from sklearn.model_selection import train_test_split


# Step 3: Import the dataset to be used for Training the model

The dataset used for this project is an Amharic dataset published on Mendeley Data. It contains Amharic posts and comments retrieved from Facebook and has 30,000 rows. The dataset can be accessed [here](https://data.mendeley.com/datasets/ymtmxx385m).


In [None]:

# Locations of the uploaded data files
test_data_path = '/kaggle/input/amhsdatasets/AMHSDataTest.txt'  # Replace with the name of your uploaded test file
train_data_path = '/kaggle/input/amhsdatasets/AMHSDataTrain(1).txt'  # Replace with the name of your uploaded train file


def _load_split(path):
    """Read one data file and return a frame with 'content' and 'label' columns.

    Every value read into the temporary 'raw' column is cut at its last comma:
    the text before it becomes 'content' and the text after it becomes 'label'.
    A value without any comma yields an empty 'content' and the whole value as
    'label'. The 'raw' column is dropped before returning.
    """
    frame = pd.read_csv(path, header=None, names=['raw'])
    frame['content'] = frame['raw'].apply(lambda line: line.rpartition(',')[0])
    frame['label'] = frame['raw'].apply(lambda line: line.rpartition(',')[2])
    return frame.drop(columns=['raw'])


# Load and process the test dataset, then the train dataset
Test = _load_split(test_data_path)
Train = _load_split(train_data_path)

In [None]:
# Display the processed training DataFrame.
# head(-10) returns every row except the last ten (the notebook shows a truncated view).
(Train.head(-10))

In [None]:
# Display the processed test DataFrame; head(-10) returns every row except the last ten.
Test.head(-10)

In [None]:
# Encode the text labels as integers: 'መልካም' -> 0, 'ጥላቻ' (with or without a trailing space) -> 1.
# Values that match none of the keys are left unchanged.
label_codes = {'መልካም': 0, 'ጥላቻ ': 1, 'ጥላቻ': 1}
Train['label'] = Train['label'].replace(label_codes)
# head(-10) returns every row except the last ten
Train.head(-10)

In [None]:
# Encode the test labels as integers: 'መልካም' -> 0, 'ጥላቻ' (with or without a trailing space) -> 1.
# NOTE(review): the first three entries of the to_replace list render identically; confirm whether
# they differ by invisible characters before collapsing them into a single entry.
Test['label'] = Test['label'].replace(['መልካም', 'መልካም', 'መልካም', 'ጥላቻ ','ጥላቻ'], [0, 0, 0, 1,  1])
# head(-10) returns every row except the last ten
Test.head(-10)



# Step 4: Preprocess the Dataset

When the dataset was retrieved, the labels and the posts were in different files.

  - Hence, the first step in this phase is merging the files into one pandas DataFrame.
  - The second step is label encoding. Label encoding is the process of converting the labels (classes) into a numeric format so that the model can process them.
  - The third step is dividing the dataset into training, validation and testing sets. The training file is split 80:20 into training and validation sets, and the separately provided test file is used for testing.
  - The last step is to remove unnecessary columns from the main dataset and merge all the sets into one main dataset.



In [None]:
# Finish cleaning the test labels so the column holds only the integers 0 and 1.
# Each step rebinds `Test` to a new frame (assign / astype return copies), so no value is
# ever written into a filtered slice -- the pattern that triggers SettingWithCopyWarning.

# Map any text labels that are still present to their numeric class
Test = Test.assign(label=Test['label'].replace({'መልካም': 0, 'ጥላቻ': 1}))
# Drop the header row that was read in as data
Test = Test[Test['label'] != 'Label']
# Anything that still is not a number becomes NaN ...
Test = Test.assign(label=pd.to_numeric(Test['label'], errors='coerce'))
# ... and is removed before the column is cast to int
Test = Test.dropna(subset=['label']).astype({'label': int})
# head(-10) returns every row except the last ten
Test.head(-10)

In [None]:
# Hold out 20% of the training file for validation (fixed seed for reproducibility)
train_dataset, evaluation_dataset = train_test_split(Train, test_size=0.20, random_state=42)
# Keep 99.9% of the test file as the test set; the remaining 0.1% goes to eval_test_dataset
test_dataset, eval_test_dataset = train_test_split(Test, test_size=0.001, random_state=42)

# Report the (rows, columns) shape of each split
for split_title, split_frame in (
    ('Training', train_dataset),
    ('Validation', evaluation_dataset),
    ('Testing', test_dataset),
):
    print(f'{split_title} dataset shape: ', split_frame.shape)

In [None]:
# Clean the training labels so the column holds only the integers 0 and 1.
# Each step rebinds `train_dataset` to a new frame (assign / astype return copies), so no
# value is ever written into a slice of another frame (avoids SettingWithCopyWarning).

# Map any text labels that are still present to their numeric class
train_dataset = train_dataset.assign(
    label=train_dataset['label'].replace({'መልካም': 0, 'ጥላቻ': 1})
)
# Drop the header row that was read in as data
train_dataset = train_dataset[train_dataset['label'] != 'Label']
# Anything that still is not a number becomes NaN ...
train_dataset = train_dataset.assign(
    label=pd.to_numeric(train_dataset['label'], errors='coerce')
)
# ... and is removed before the column is cast to int
train_dataset = train_dataset.dropna(subset=['label']).astype({'label': int})

In [None]:
# Clean the validation labels so the column holds only the integers 0 and 1.
# Each step rebinds `evaluation_dataset` to a new frame (assign / astype return copies), so
# no value is ever written into a slice of another frame (avoids SettingWithCopyWarning).

# Map any text labels that are still present to their numeric class
evaluation_dataset = evaluation_dataset.assign(
    label=evaluation_dataset['label'].replace({'መልካም': 0, 'ጥላቻ': 1})
)
# Drop the header row that was read in as data
evaluation_dataset = evaluation_dataset[evaluation_dataset['label'] != 'Label']
# Anything that still is not a number becomes NaN ...
evaluation_dataset = evaluation_dataset.assign(
    label=pd.to_numeric(evaluation_dataset['label'], errors='coerce')
)
# ... and is removed before the column is cast to int
evaluation_dataset = evaluation_dataset.dropna(subset=['label']).astype({'label': int})

In [None]:
# Convert each pandas split into a Hugging Face Dataset.
# preserve_index=False keeps the pandas index (non-contiguous after splitting and filtering)
# from being stored as an extra '__index_level_0__' column next to 'content' and 'label'.
test_dataset = Dataset.from_pandas(test_dataset, preserve_index=False)
train_dataset = Dataset.from_pandas(train_dataset, preserve_index=False)
evaluation_dataset = Dataset.from_pandas(evaluation_dataset, preserve_index=False)

# Preview of the test split after conversion
test_dataset


In [None]:
# Gather the three splits under one DatasetDict so they can be processed together
splits_by_name = {
    'train': train_dataset,
    'test': test_dataset,
    'evaluate': evaluation_dataset,
}
main_dataset = datasets.DatasetDict(splits_by_name)

# Preview of the combined dataset
main_dataset

In [None]:
# Row counts of the train, test and validation splits
training_data_size, testing_data_size, evaluation_data_size = (
    main_dataset[split_name].num_rows for split_name in ('train', 'test', 'evaluate')
)

# Step 5: Tokenizing Dataset


A Tokenizer is used to translate text into data that can be processed by the model. Models can only process numbers, so tokenizers need to convert our text inputs to numerical data.

In this case, the tokenizer is an AutoTokenizer loaded from the fine-tuned mBERT model (`Davlan/bert-base-multilingual-cased-finetuned-amharic`) available on Hugging Face.

In this phase, we have the following tasks:

- Load the tokenizer
- Create a tokenizer function that takes the dataset in batches and tokenizes them using the tokenizer loaded from the model
- Call the tokenizer function on the whole dataset



In [None]:
# Load the tokenizer, storing the downloaded files under a custom cache directory
tokenizer_cache_dir = "/kaggle/working/"
tokenizer = AutoTokenizer.from_pretrained(
    "Davlan/bert-base-multilingual-cased-finetuned-amharic",
    cache_dir=tokenizer_cache_dir,
)

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
# NOTE(review): this rebinds `tokenizer`, replacing the object created in the previous cell
# (which passed cache_dir); the two loads use the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-finetuned-amharic")


In [None]:
def tokenize_function(data):
    """Tokenize the 'content' field of one example or of a batch of examples.

    Sequences are truncated and padded to the maximum length, so every returned
    encoding has the same length.

    Args:
        data: a row (or batch of rows) of the dataset exposing a "content" entry.

    Returns:
        The tokenizer output for data["content"].
    """
    return tokenizer(data["content"], padding="max_length", truncation=True)


# Tokenize every split. batched=True passes many rows to the tokenizer per call instead of
# one row at a time, which is considerably faster and produces the same columns.
tokenized_datasets = main_dataset.map(tokenize_function, batched=True)

In [None]:
# Ask PyTorch to release cached GPU memory that is currently unused.
# NOTE(review): the model is only loaded in a later cell, so this probably frees little here.
torch.cuda.empty_cache()


# Step 6: Prepare the tokenized Dataset

In this phase, we do the following tasks:

  - Remove unnecessary columns such as the "content" column from the tokenized dataset as we no longer need them
  -  Change the format of the tokenized dataset into pytorch since we are using pytorch
  - Load the dataset using DataLoader with the proper batch size
  - Preview the features of the dataset to make sure everything is okay



In [None]:

# Drop the raw text column; it is no longer needed after tokenization
tokenized_datasets = tokenized_datasets.remove_columns(["content"])

# Return PyTorch tensors when rows are accessed
tokenized_datasets.set_format("torch")


def _shuffled_subset(split_name, row_count):
    """Shuffle the named split with a fixed seed and keep its first `row_count` rows."""
    return tokenized_datasets[split_name].shuffle(seed=42).select(range(row_count))


# The requested sizes equal the full split sizes, so every row is kept (in shuffled order)
small_train_dataset = _shuffled_subset("train", training_data_size)
small_test_dataset = _shuffled_subset("test", testing_data_size)
small_eval_dataset = _shuffled_subset("evaluate", evaluation_data_size)

# Preview of the shuffled validation split
small_eval_dataset

In [None]:
# Preview of the shuffled training split
small_train_dataset

In [None]:
# Preview of the shuffled test split
small_test_dataset

In [None]:
# Wrap each split in a DataLoader; only the training loader reshuffles its rows
loader_batch_size = 4
train_dataloader = DataLoader(small_train_dataset, batch_size=loader_batch_size, shuffle=True)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=loader_batch_size)
test_dataloader = DataLoader(small_test_dataset, batch_size=loader_batch_size)

# Step 7: Fine-tune the model

This phase has the following steps:

  - Load the model
  - Specify the computing metric
  - Specify the Training/fine-tuning arguments
  - Load the Trainer class
  - Fine-tune the model

## 7.1 Load the model
We load the fine-tuned mBERT model in this step.
 Step 7: Fine-tune the model

This phase has the following steps:

  - Load the model
  - Specify the computing metric
  - Specify the Training/fine-tuning arguments
  - Load the Trainer class
  - Fine-tune the model




In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint as a sequence classifier with two output labels
checkpoint_name = "Davlan/bert-base-multilingual-cased-finetuned-amharic"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

## 7.2 Computing Metrics
In this stage, we load the computing metrics. The computing metrics used in this phase are the f1-score and the accuracy. These computing metrics are used during the validation and testing phase


In [None]:
import numpy as np
import evaluate


# Metric objects are loaded once and reused by every evaluation call
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of predictions with F1 and accuracy.

    Args:
        eval_pred: a (logits, labels) pair; the predicted class of each row is the
            index of its largest logit along the last axis.

    Returns:
        A dict with the keys "f1" and "accuracy".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {}
    for metric_name, metric in (("f1", f1_metric), ("accuracy", accuracy_metric)):
        result = metric.compute(predictions=predicted_classes, references=labels)
        scores[metric_name] = result[metric_name]
    return scores


## 7.3 Training Arguments
In this stage, we specify the training/fine-tuning arguments, such as the learning rate, batch size, number of epochs, and the evaluation and checkpointing schedule.

In [None]:

# Create an AdamW optimizer over the model's parameters.
# NOTE(review): this optimizer is not passed to the Trainer constructed later in this notebook,
# and its lr (5e-5) differs from the learning_rate=1e-5 set in the later TrainingArguments.
optimizer = AdamW(model.parameters(), lr=5e-5)
from transformers import TrainingArguments

# Default training arguments that write to "test_trainer".
# NOTE(review): `training_args` is reassigned in a later cell before the Trainer is created.
training_args = TrainingArguments(output_dir="test_trainer")

In [None]:
from transformers import TrainingArguments, Trainer

from transformers import EarlyStoppingCallback, IntervalStrategy

In [None]:

# Training configuration consumed by the Trainer.
training_args = TrainingArguments(
    output_dir="training_with_callbacks",  # directory that receives the checkpoints
    evaluation_strategy=IntervalStrategy.STEPS,  # evaluate on a step schedule ("steps")
    warmup_steps=500,  # number of warmup steps for the learning rate
    save_steps=2000,  # save a checkpoint every 2000 steps
    eval_steps=2000,  # evaluate every 2000 steps, in line with save_steps
    save_total_limit=3,  # keep at most 3 checkpoints; older ones are deleted
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    metric_for_best_model='f1',  # checkpoints are compared by the "f1" value from compute_metrics
    load_best_model_at_end=True,  # reload the best-scoring checkpoint when training finishes
)

## 7.4 Load the Trainer class
In the Trainer class, an early stopping strategy is used. Early stopping is an optimization technique used to reduce overfitting without compromising model accuracy. It allows us to specify an arbitrarily large number of training epochs and stop training once the model's performance stops improving on a held-out validation dataset. For this model, the early stopping patience is 10 evaluations: training stops if the F1 score does not improve for 10 consecutive evaluation rounds (one round every 2000 steps).

In [None]:
# Early stopping with a patience of 10
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# Bundle the model, the training configuration, the train/validation splits and the metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

## 7.5 Fine-tune the model
The fine-tuning process embeds validation within itself. After every 2000 fine-tuning steps, the model is evaluated on the validation set with the loaded metrics; these scores drive early stopping and the selection of the best checkpoint.

In [None]:
# Run fine-tuning with the configuration, datasets and callbacks given to the Trainer
trainer.train()


# Step 8: Test the model

In this stage the model is tested on the testing dataset. This dataset isn't seen by the model during the finetuning process.

In [None]:
# Evaluate on the test split instead of the validation split passed to the Trainer
trainer.evaluate(small_test_dataset)

# 1. Save Your Fine-Tuned Model Locally

After fine-tuning, your model and tokenizer reside in memory. Save them to a directory for reuse.

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os

# Directory that will hold the fine-tuned model and the tokenizer files
save_directory = "/kaggle/working/saved_model"

# Persist both artifacts side by side so they can be reloaded together
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to: {save_directory}")
print("\nVerifying saved files...")
saved_files = os.listdir(save_directory)
print(saved_files)


# Step 9: Push the model to Huggingface Hub

One of the aims of this research/project is to contribute to the IT community in the sector of NLP tasks on low-resourced languages. Hence, the final model was pushed and made publicly available on Huggingface. You can find the model on huggingface here

In [None]:
#install huggingface_hub package to interact with huggingface platform
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Log in to Hugging Face from the notebook; the pushes in later cells presumably rely on this
notebook_login()

In [None]:
# Reload the tokenizer of the pretrained checkpoint.
# NOTE(review): the following cells use `loaded_tokenizer` (read from save_directory), not this
# `tokenizer` object, so this reload looks redundant -- confirm before removing.
tokenizer = AutoTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-finetuned-amharic")

In [None]:
from transformers import AutoModelForSequenceClassification

# Reload the fine-tuned model and its tokenizer from the local save directory
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_directory)
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Write a copy of the trained model and tokenizer to the local folder for the Hugging Face repository
hub_export_dir = "path/to/amharic-hate-speech-detection-mBERT"
loaded_model.save_pretrained(hub_export_dir)
loaded_tokenizer.save_pretrained(hub_export_dir)

In [None]:

# Push the reloaded fine-tuned model to the Hugging Face repository named below
loaded_model.push_to_hub("amharic-hate-speech-detection-mBERT")

In [None]:
# Push the tokenizer to the same Hugging Face repository as the model
loaded_tokenizer.push_to_hub("amharic-hate-speech-detection-mBERT")

In [None]:
# Write the reloaded model back to save_directory, the folder it was loaded from
loaded_model.save_pretrained(save_directory)