**Data Preparation**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled tweets, discard rows with a missing tweet or class, and
# make the class column integer-typed -- all in one non-mutating chain so the
# cell yields the same `data` frame every time it is re-run.
data = (
    pd.read_csv('labeled_data.csv')
    .dropna(subset=['tweet', 'class'])
    .astype({'class': int})
)

# 80/20 train/validation split with a fixed seed for reproducibility.
# Texts and labels are passed as plain Python lists.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['tweet'].tolist(),
    data['class'].tolist(),
    test_size=0.2,
    random_state=42,
)


**Tokenization**


In [2]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same options: truncate to at most 64
# tokens and pad so every sequence in a split has equal length.
encoding_options = dict(truncation=True, padding=True, max_length=64)

train_encodings = tokenizer(train_texts, **encoding_options)
val_encodings = tokenizer(val_texts, **encoding_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

**Dataset Class**

In [3]:
import torch
from torch.utils.data import Dataset

class HateSpeechDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: mapping from field name to a sequence indexable by sample
            position (each value is read as ``value[idx]``).
        labels: sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'``
        tensor built from the label at the same position.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings and labels so they can be indexed sample by sample.
train_dataset = HateSpeechDataset(encodings=train_encodings, labels=train_labels)
val_dataset = HateSpeechDataset(encodings=val_encodings, labels=val_labels)


**Metrics Calculation**

In [4]:
import numpy as np

def compute_metrics(pred):
    """Compute classification accuracy from a (logits, labels) pair.

    Args:
        pred: two-element iterable unpacked as ``(logits, labels)``; the
            predicted class of each row is the argmax of the logits along
            axis 1.

    Returns:
        dict with a single key ``"accuracy"``: the fraction of rows whose
        predicted class equals the label.
    """
    scores, targets = pred
    predicted_classes = np.argmax(scores, axis=1)
    hits = predicted_classes == targets
    return {"accuracy": hits.mean()}


**Model Initialization and Training**

In [5]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Pretrained BERT encoder with a 3-way classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# All training hyperparameters live in one mapping so they can be read and
# tuned in a single place before being handed to TrainingArguments.
training_options = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
}
training_args = TrainingArguments(**training_options)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Kept as the cell's last expression so the returned training summary is displayed.
trainer.train()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3083,0.287505,0.907
2,0.1276,0.302899,0.911842
3,0.0592,0.353787,0.912245


TrainOutput(global_step=7437, training_loss=0.2769182712887586, metrics={'train_runtime': 992.2694, 'train_samples_per_second': 59.941, 'train_steps_per_second': 7.495, 'total_flos': 1956182482453248.0, 'train_loss': 0.2769182712887586, 'epoch': 3.0})

**Validation Accuracy and Training Accuracy**

In [6]:

# Score the fine-tuned model on the training split; compute_metrics'
# "accuracy" is read back under the key "eval_accuracy".
train_result = trainer.evaluate(eval_dataset=train_dataset)
train_accuracy = train_result["eval_accuracy"]

# Same measurement on the held-out validation split.
val_result = trainer.evaluate(eval_dataset=val_dataset)
val_accuracy = val_result["eval_accuracy"]

# Report both figures together so the train/validation gap is easy to compare.
print(
    f"Training Accuracy: {train_accuracy:.4f}",
    f"Validation Accuracy: {val_accuracy:.4f}",
    sep="\n",
)


Training Accuracy: 0.9618
Validation Accuracy: 0.9122


In [7]:

# Persist the fine-tuned model and its tokenizer side by side in one directory
# so both can later be restored with from_pretrained('./saved_model').
model.save_pretrained('./saved_model')
# Last expression of the cell: its return value is what the cell displays.
tokenizer.save_pretrained('./saved_model')


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

**Checking New Comments**

In [11]:
import nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK 'punkt' resource before word_tokenize is used below.
nltk.download('punkt')

# Restore the tokenizer and the fine-tuned model from the same directory.
saved_model_dir = './saved_model'
loaded_tokenizer = BertTokenizer.from_pretrained(saved_model_dir)
loaded_model = BertForSequenceClassification.from_pretrained(saved_model_dir)


def preprocess_text(text):
    """Normalise a comment into lowercase, space-separated tokens.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, remove runs of digits, lowercase the text, split it with
    NLTK's ``word_tokenize`` and join the tokens with single spaces.

    Args:
        text: the comment string to normalise.

    Returns:
        The normalised string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    tokens = word_tokenize(without_digits.lower())
    return ' '.join(tokens)

def detect_abusive_comment(comment, preprocess=False):
    """Predict the class index of a single comment with the fine-tuned model.

    The model was fine-tuned on tweets passed straight to the BERT tokenizer
    (no punctuation/digit stripping, no NLTK tokenisation). To keep inference
    inputs consistent with training inputs, the comment is therefore encoded
    as-is by default.

    Args:
        comment: raw comment text.
        preprocess: when True, run ``preprocess_text`` on the comment before
            encoding (the earlier behaviour of this function). Defaults to
            False because that normalisation was never applied to the
            training data.

    Returns:
        int: index of the highest-scoring class in the model's logits.
    """
    text = preprocess_text(comment) if preprocess else comment

    # Same truncation settings as the training encodings. No padding is
    # requested: a single sequence forms a batch of one and needs none.
    encoded = loaded_tokenizer(
        text,
        truncation=True,
        max_length=64,
        return_tensors='pt',
    )

    # Inference only -- skip gradient tracking.
    with torch.no_grad():
        outputs = loaded_model(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )

    return torch.argmax(outputs.logits, dim=1).item()


# A few hand-written comments to spot-check the classifier.
new_comments = [
    "I hate you!",
    "Have a nice day!",
    "You are stupid!",
    "Wishing you all the best!",
    "bitch",
    "bloody hell",
    "bastard",
    "idiot",
]

# Class ids as interpreted here: 0 -> hate speech, 1 -> offensive,
# any other id -> safe.
for comment in new_comments:
    label_id = detect_abusive_comment(comment)
    if label_id == 0:
        print(f"Comment: '{comment}' is hate speech.")
    elif label_id == 1:
        print(f"Comment: '{comment}' is offensive.")
    else:
        print(f"Comment: '{comment}' is safe.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Comment: 'I hate you!' is hate speech.
Comment: 'Have a nice day!' is safe.
Comment: 'You are stupid!' is hate speech.
Comment: 'Wishing you all the best!' is safe.
Comment: 'bitch' is offensive.
Comment: 'bloody hell' is offensive.
Comment: 'bastard' is offensive.
Comment: 'idiot' is offensive.


**Save Model as Pickle file**

In [12]:
import torch
import pickle
from transformers import BertForSequenceClassification, BertTokenizer


# Reload the artefacts written by save_pretrained so the export below always
# reflects what is on disk.
model = BertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

model_save_path = './saved_model.pkl'
tokenizer_save_path = './saved_tokenizer.pkl'

# Only the model's state dict (not the module object) is pickled; the
# tokenizer is pickled whole.
# NOTE(review): unpickling can execute arbitrary code -- only load these
# files from a trusted location.
for destination, payload in (
    (model_save_path, model.state_dict()),
    (tokenizer_save_path, tokenizer),
):
    with open(destination, 'wb') as f:
        pickle.dump(payload, f)

print(f"Model and tokenizer saved to {model_save_path} and {tokenizer_save_path}")

Model and tokenizer saved to ./saved_model.pkl and ./saved_tokenizer.pkl
