In [1]:
# ✅ Step 1: Install Dependencies
!pip install transformers datasets torch gradio accelerate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Downloading gradio-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Coll

In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from torch.utils.data import Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from huggingface_hub import HfApi, login

In [3]:
# ✅ Step 2: Load the Balanced Dataset
# Read the CSV (make sure it is balanced!), keep only the sentence and label
# columns, and discard every row in which either of them is missing.
raw_df = pd.read_csv("balanced_dataset.csv")
df = raw_df[["Hit Sentence", "label"]].dropna()

In [4]:
# ✅ 2. Convert to Hugging Face Dataset
# Rename the sentence column to "text", the name the tokenization step reads.
column_renames = {"Hit Sentence": "text"}
df = df.rename(columns=column_renames)



In [5]:
from datasets import load_dataset, DatasetDict, Dataset

# NOTE: this import rebinds `Dataset`, which the import cell also pulls in from
# `torch.utils.data`; from this cell onwards `Dataset` is the Hugging Face class.
#
# `preserve_index=False` keeps the pandas index out of the dataset. After the
# earlier `dropna()` the index may have gaps, in which case it would otherwise
# be stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

In [6]:
# ✅ 3. Load Tokenizer & Model
# The tokenizer and the 2-label classification model start from the same checkpoint.
BASE_CHECKPOINT = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(BASE_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BASE_CHECKPOINT, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# ✅ 4. Tokenization
def tokenize_function(examples):
    """Tokenize the "text" entry of a batch of examples.

    The module-level `tokenizer` is called with `truncation=True` and
    `padding="max_length"`, so every encoded sequence has the same length.

    Args:
        examples: Mapping with a "text" entry holding the sentences to encode.

    Returns:
        The tokenizer's output for those sentences.
    """
    sentences = examples["text"]
    return tokenizer(sentences, truncation=True, padding="max_length")

In [8]:
# Tokenize all examples in batches, then hold out 20% of them for evaluation.
# The fixed seed makes the train/test split reproducible across kernel
# restarts; without it every run evaluates on a different subset.
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/26889 [00:00<?, ? examples/s]

In [14]:
!pip install evaluate



In [18]:
import evaluate

# Accuracy metric used by `compute_metrics` to score predictions against labels.
ACCURACY_METRIC_ID = "accuracy"
metric = evaluate.load(ACCURACY_METRIC_ID)



In [19]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level `metric`.

    Args:
        eval_pred: A `(logits, labels)` pair.

    Returns:
        Whatever `metric.compute` returns for the arg-max class predictions
        compared against `labels`.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)

In [20]:
 ##✅ Optimized Training Arguments (Prevents Overfitting & Runtime Issues)
training_args = TrainingArguments(
    output_dir="./mbert_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,  # 🔥 Lower learning rate to prevent overfitting
    per_device_train_batch_size=4,  # 🔥 Reduce batch size to avoid crashes
    per_device_eval_batch_size=4,
    num_train_epochs=3,  # 🔥 Reduce epochs to avoid overfitting
    weight_decay=0.02,  # 🔥 Prevents overfitting by penalizing large weights
    gradient_accumulation_steps=2,  # 🔥 Prevents memory issues
    logging_steps=50,
    save_total_limit=1,  # 🔥 Prevents disk overload
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]  # 🔥 Stops early if validation loss worsens
)

# ✅ Train & Evaluate
trainer.train()
trainer.evaluate()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0343,0.012635,0.998327
2,0.0221,0.008621,0.998698
3,0.0001,0.005978,0.999256


{'eval_loss': 0.005978025030344725,
 'eval_accuracy': 0.999256229081443,
 'eval_runtime': 163.9677,
 'eval_samples_per_second': 32.799,
 'eval_steps_per_second': 8.203,
 'epoch': 3.0}

In [21]:
# ✅ 7. Save Model & Tokenizer
# Both are written to the same local folder, which is uploaded to the Hub later.
save_dir = "mbert_fine_tuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('mbert_fine_tuned/tokenizer_config.json',
 'mbert_fine_tuned/special_tokens_map.json',
 'mbert_fine_tuned/vocab.txt',
 'mbert_fine_tuned/added_tokens.json')

In [22]:
# ✅ 8. Upload to Hugging Face
# SECURITY: never hardcode an access token in the notebook. The token that was
# previously embedded in this cell must be treated as leaked: revoke it at
# https://huggingface.co/settings/tokens and create a new one.
import os
from getpass import getpass

# Prefer the HF_TOKEN environment variable; otherwise prompt for the token
# without echoing it, so it never ends up in the saved notebook.
HUGGING_FACE_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(HUGGING_FACE_TOKEN)


In [23]:
# Create the model repository on the Hub. With `exist_ok=True` an already
# existing repository of that name is not treated as an error.
repo_name = "hate_speech_detector_mbert"
api = HfApi()
api.create_repo(repo_id=repo_name, exist_ok=True)

RepoUrl('https://huggingface.co/JCKipkemboi/hate_speech_detector_mbert', endpoint='https://huggingface.co', repo_type='model', repo_id='JCKipkemboi/hate_speech_detector_mbert')

In [24]:
# Push the locally saved model folder to the repository under the user's namespace.
target_repo_id = "JCKipkemboi/" + repo_name
api.upload_folder(repo_id=target_repo_id, folder_path="mbert_fine_tuned")

print(" mBERT Model uploaded to Hugging Face!")

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

 mBERT Model uploaded to Hugging Face!


In [25]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[

In [26]:
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

In [27]:
!pip install transformers torch
!pip install pyngrok  # Alternative for exposing Streamlit


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [28]:
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the mBERT model & tokenizer from the Hugging Face Hub.
# NOTE: this rebinds the notebook-level `model` and `tokenizer` names that the
# training cells use.
model_name = "JCKipkemboi/hate_speech_detector_mbert"  # Make sure this is correct!
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): the UI calls below presumably only render when the code is
# launched with `streamlit run`; inside a notebook kernel this cell mainly
# checks that the uploaded model loads.
st.title("Multilingual Hate Speech Detector (mBERT)")
st.write("Enter a sentence to check if it's hate speech or not.")

# Input box
user_input = st.text_input("Enter text:")

if user_input:
    inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True)
    # Inference only: disable gradient tracking so no autograd graph is built.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()

    # Display prediction (class index 1 is reported as hate speech)
    result = "Hate Speech" if prediction == 1 else "Not Hate Speech"
    st.write(f"**Prediction:** {result}")


config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

2025-03-23 20:47:37.960 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-03-23 20:47:37.984 Session state does not function when running a script without `streamlit run`


In [29]:
%%writefile requirements.txt
# streamlit is required: app.py imports it and the Procfile starts the app with `streamlit run`.
streamlit
transformers
torch
fastapi
uvicorn
gradio


Writing requirements.txt


In [30]:
%%writefile app.py

import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the mBERT model & tokenizer
model_name = "JCKipkemboi/hate_speech_detector_mbert"  # Make sure this matches your Hugging Face model!
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Streamlit UI
st.title("Multilingual Hate Speech Detector (mBERT)")
st.write("Enter a sentence to check if it's hate speech or not.")

# Input box
user_input = st.text_input("Enter text:")

if user_input:
    # Tokenize input
    inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True)

    # Get model prediction
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()

    # Display prediction result
    result = "Hate Speech" if prediction == 1 else "Not Hate Speech"
    st.write(f"**Prediction:** {result}")


Writing app.py


In [31]:
%%writefile Procfile
web: streamlit run app.py --server.port=$PORT --server.address=0.0.0.0


Writing Procfile
