In [1]:
# ✅ Step 1: Install Dependencies
!pip install transformers datasets torch gradio accelerate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Downloading gradio-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Coll

In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from torch.utils.data import Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from huggingface_hub import HfApi, login

In [3]:
# Step 2: Load the dataset (expected to be class-balanced already), keep only
# the text and label columns, and drop every row with a missing value in either.
df = pd.read_csv("balanced_dataset.csv").loc[:, ["Hit Sentence", "label"]].dropna()


In [4]:
# The tokenization step reads the column named "text", so rename the raw
# "Hit Sentence" column accordingly.
df = df.rename({"Hit Sentence": "text"}, axis="columns")



In [5]:
from datasets import load_dataset, DatasetDict, Dataset

# Convert the DataFrame into a Hugging Face Dataset.
# `preserve_index=False` stops the pandas index (which may have gaps after the
# earlier `dropna`) from being stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

In [6]:
# Step 3: Load the pretrained tokenizer and a BERT sequence classifier with a
# two-class head, both from the same checkpoint.
PRETRAINED_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Step 4: Tokenization
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    The module-level `tokenizer` is called with truncation enabled and
    `padding="max_length"`; its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [8]:
# Tokenize every example (in batches), then hold out 20% of the rows for
# evaluation. The fixed seed makes the train/test split, and therefore the
# reported metrics, reproducible across kernel restarts.
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.train_test_split(test_size=0.2, seed=42)


Map:   0%|          | 0/26889 [00:00<?, ? examples/s]

In [13]:
!pip install evaluate





In [18]:
import evaluate
# Accuracy metric object; `compute_metrics` calls `metric.compute(...)` on it.
metric = evaluate.load("accuracy")


In [19]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    `eval_pred` unpacks into `(logits, labels)`. The predicted class of each
    row is the index of its largest logit along the last axis. Returns
    whatever `metric.compute` produces for those predictions and labels.
    """
    logits, labels = eval_pred
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)

In [20]:
# Training configuration. Evaluation and checkpointing both happen once per
# epoch so that the checkpoint with the best accuracy can be restored when
# training ends (`load_best_model_at_end`).
training_args = TrainingArguments(
    output_dir="./bert_results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",  # save on the same schedule as evaluation
    learning_rate=5e-6,  # deliberately small step size
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,  # weight-decay regularisation
    load_best_model_at_end=True,  # reload the best checkpoint after training
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=50,
    report_to="wandb",  # log to Weights & Biases
)

# Build the Trainer. The early-stopping callback halts training once the
# tracked metric has failed to improve for 2 consecutive evaluations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Fine-tune the model, then compute metrics on the held-out split. The bare
# name on the last line makes the notebook display the evaluation metrics.
train_output = trainer.train()
eval_metrics = trainer.evaluate()
eval_metrics

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0001,0.009508,0.998698
2,0.0001,0.009723,0.998884
3,0.0,0.010304,0.998884


{'eval_loss': 0.009723406285047531,
 'eval_accuracy': 0.9988843436221644,
 'eval_runtime': 157.722,
 'eval_samples_per_second': 34.098,
 'eval_steps_per_second': 4.267,
 'epoch': 3.0}

In [21]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
SAVE_DIR = "bert_fine_tuned"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('bert_fine_tuned/tokenizer_config.json',
 'bert_fine_tuned/special_tokens_map.json',
 'bert_fine_tuned/vocab.txt',
 'bert_fine_tuned/added_tokens.json')

In [22]:
# Authenticate with the Hugging Face Hub.
# Never commit an access token to a notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing when it is not set.
# SECURITY: the token that used to be hardcoded in this cell must be treated
# as leaked; revoke it in the Hub settings and issue a new one.
import os
from getpass import getpass

HUGGING_FACE_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(HUGGING_FACE_TOKEN)


In [23]:
# Create the target model repository on the Hub; `exist_ok=True` means an
# already-existing repository is not an error.
api = HfApi()
repo_name = "hate_speech_detector_bert"
api.create_repo(repo_id=repo_name, exist_ok=True)


RepoUrl('https://huggingface.co/JCKipkemboi/hate_speech_detector_bert', endpoint='https://huggingface.co', repo_type='model', repo_id='JCKipkemboi/hate_speech_detector_bert')

In [24]:
# Push the saved model directory to the Hub repository under the user namespace.
hub_repo_id = "JCKipkemboi/" + repo_name
api.upload_folder(repo_id=hub_repo_id, folder_path="bert_fine_tuned")

print(" BERT Model uploaded to Hugging Face!")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

 BERT Model uploaded to Hugging Face!


In [26]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[

In [27]:
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

In [28]:

!pip install transformers torch
!pip install pyngrok  # Alternative for exposing Streamlit


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [7]:
%%writefile app.py
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load Model & Tokenizer
# NOTE(review): the fine-tuned BERT model is uploaded to the Hub as
# "JCKipkemboi/hate_speech_detector_bert"; confirm which repo this app should serve.
model_name = "JCKipkemboi/hate_speech_detector"


@st.cache_resource
def load_model_and_tokenizer(name):
    """Load the classifier and its tokenizer for the given Hub repo id.

    Streamlit re-executes this script on every widget interaction;
    `st.cache_resource` keeps the loaded objects in memory so the weights are
    not loaded again on each rerun.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


model, tokenizer = load_model_and_tokenizer(model_name)

def classify_text(text):
    """Classify `text` with the module-level `model` and `tokenizer`.

    Returns:
        "Hate Speech" when the highest-scoring class index is 1,
        otherwise "Not Hate Speech".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return "Hate Speech" if prediction == 1 else "Not Hate Speech"

# Streamlit UI
st.title("Hate Speech Detector")
st.write("Enter a sentence to check if it's hate speech.")

user_input = st.text_area("Enter text:")
if st.button("Predict"):
    if user_input.strip():
        result = classify_text(user_input)
        st.write(f"**Prediction:** {result}")
    else:
        # Nothing to classify: ask for input instead of scoring an empty string.
        st.warning("Please enter some text to classify.")


Writing app.py


In [1]:
%%writefile requirements.txt
transformers
torch
# app.py imports streamlit, so it must be installed in the deployment image.
streamlit
fastapi
uvicorn
gradio


Writing requirements.txt


In [2]:
%%writefile Procfile
web: streamlit run app.py --server.address=0.0.0.0 --server.port=${PORT}


Writing Procfile


In [3]:
# List the files in the current working directory.
!ls


Procfile  requirements.txt  sample_data


In [8]:
from google.colab import files

# Trigger a browser download for each deployment file, in this order.
for artifact_name in ("requirements.txt", "Procfile", "app.py"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>