In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Location of the labelled spreadsheet -- point this at your own copy of the dataset
file_path = '/content/hate_speech.xls'

# Load the sheet under fixed column names and coerce the text column to str in one chain
df = pd.read_excel(file_path, names=["text", "label"]).astype({"text": str})

In [None]:
# Preprocess the dataset
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Convert labels to binary format: 'yes' -> 1, anything else -> 0.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# comparing it against 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].apply(lambda x: 1 if x == 'yes' else 0)

# Custom Dataset that tokenizes one example per lookup
class HateSpeechDataset(Dataset):
    """Map-style dataset pairing raw texts with their labels.

    Every item is tokenized on access and padded/truncated to ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and a float ``label`` for row ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(sample_text, **tokenizer_kwargs)

        # squeeze(0) removes the leading size-1 dimension of each encoded tensor
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(sample_label, dtype=torch.float)
        return item

# Split the dataset into training and validation sets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
max_length = 128

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Wrap each split as a tokenizing dataset
train_dataset = HateSpeechDataset(
    texts=X_train.tolist(),
    labels=y_train.tolist(),
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = HateSpeechDataset(
    texts=X_val.tolist(),
    labels=y_val.tolist(),
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training batches are shuffled; validation keeps its original order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
class HateSpeechModel(nn.Module):
    """XLM-RoBERTa encoder followed by a BiLSTM, attention pooling and an MLP head.

    ``forward`` returns sigmoid probabilities of shape ``(batch, 1)``, so the
    model is meant to be trained with ``nn.BCELoss``.
    """

    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained("xlm-roberta-base")
        # The LSTM consumes the encoder's 768-dim hidden states; 128 units per
        # direction give 256 features per token.
        self.bilstm = nn.LSTM(768, 128, bidirectional=True, batch_first=True)
        self.attention = nn.Linear(256, 1)
        self.fc = nn.Linear(256, 64)
        self.out = nn.Linear(64, 1)

    def attention_layer(self, lstm_output, attention_weights, attention_mask=None):
        """Pool per-token BiLSTM outputs into one context vector per sequence.

        Args:
            lstm_output: tensor of shape ``(batch, seq_len, features)``.
            attention_weights: unnormalised scores of shape ``(batch, seq_len, 1)``.
            attention_mask: optional ``(batch, seq_len)`` tensor that is zero at
                padding positions. When given, those positions receive zero
                attention weight. When omitted, every position takes part in
                the softmax.

        Returns:
            Tensor of shape ``(batch, features)``.
        """
        if attention_mask is not None:
            # Push padded positions to the most negative finite value so that
            # softmax assigns them zero weight without producing NaNs.
            is_padding = attention_mask.unsqueeze(-1) == 0
            attention_weights = attention_weights.masked_fill(
                is_padding, torch.finfo(attention_weights.dtype).min
            )
        attention_weights = torch.softmax(attention_weights, dim=1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)
        return context_vector

    def forward(self, input_ids, attention_mask):
        """Return the hate-speech probability for each sequence, shape ``(batch, 1)``."""
        # XLM-RoBERTa embeddings
        roberta_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # BiLSTM
        lstm_output, _ = self.bilstm(roberta_output)

        # Attention mechanism; the mask keeps padding tokens out of the pooled vector
        attention_weights = self.attention(lstm_output)
        context_vector = self.attention_layer(lstm_output, attention_weights, attention_mask)

        # Fully connected layers
        x = torch.relu(self.fc(context_vector))
        x = torch.sigmoid(self.out(x))
        return x

# Build the model on the best available device, together with its loss and optimizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = HateSpeechModel()
model = model.to(device)
# BCELoss expects probabilities, which the model's final sigmoid provides
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)


# Training loop with evaluation
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import torch

# Function to evaluate the model
def evaluate_model(model, data_loader, device):
    """Compute binary classification metrics for ``model`` over ``data_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that returns
            one probability per sample.
        data_loader: iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Forward pass. squeeze(-1) drops only the trailing output dimension;
            # a bare squeeze() would also collapse a final batch of one sample
            # into a 0-d tensor, which extend() below cannot iterate over.
            outputs = model(input_ids, attention_mask).squeeze(-1)
            preds = (outputs > 0.5).float()  # Convert probabilities to binary predictions

            predictions.extend(preds.cpu().numpy())  # Move to CPU for metrics computation
            true_labels.extend(labels.cpu().numpy())  # Move to CPU for metrics computation

    # Calculate metrics. zero_division=0 keeps the score at 0.0 when no positive
    # is predicted (or present) without emitting an undefined-metric warning.
    acc = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, zero_division=0)
    recall = recall_score(true_labels, predictions, zero_division=0)
    f1 = f1_score(true_labels, predictions, zero_division=0)

    return acc, precision, recall, f1

# Training loop with evaluation
num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to the selected device (GPU when available)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing output dimension so the result
        # always matches the labels' shape. A bare squeeze() would turn a batch
        # of one sample into a 0-d tensor, and BCELoss rejects that mismatch.
        outputs = model(input_ids, attention_mask).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean loss per batch for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

    # Evaluate on validation dataset
    acc, precision, recall, f1 = evaluate_model(model, val_loader, device)
    print(f"Validation Metrics -> Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch 1/30, Loss: 0.6720


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics -> Accuracy: 0.7300, Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000
Epoch 2/30, Loss: 0.6178


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics -> Accuracy: 0.7300, Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000
Epoch 3/30, Loss: 0.6014


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Metrics -> Accuracy: 0.7300, Precision: 0.0000, Recall: 0.0000, F1-score: 0.0000
Epoch 4/30, Loss: 0.5583
Validation Metrics -> Accuracy: 0.8000, Precision: 0.7059, Recall: 0.4444, F1-score: 0.5455
Epoch 5/30, Loss: 0.5190
Validation Metrics -> Accuracy: 0.7850, Precision: 0.6122, Recall: 0.5556, F1-score: 0.5825
Epoch 6/30, Loss: 0.4990
Validation Metrics -> Accuracy: 0.7950, Precision: 0.6667, Recall: 0.4815, F1-score: 0.5591
Epoch 7/30, Loss: 0.4799
Validation Metrics -> Accuracy: 0.8150, Precision: 0.7073, Recall: 0.5370, F1-score: 0.6105
Epoch 8/30, Loss: 0.5086
Validation Metrics -> Accuracy: 0.7750, Precision: 0.5692, Recall: 0.6852, F1-score: 0.6218
Epoch 9/30, Loss: 0.4541
Validation Metrics -> Accuracy: 0.6550, Precision: 0.4242, Recall: 0.7778, F1-score: 0.5490
Epoch 10/30, Loss: 0.4334
Validation Metrics -> Accuracy: 0.6950, Precision: 0.4578, Recall: 0.7037, F1-score: 0.5547
Epoch 11/30, Loss: 0.4072
Validation Metrics -> Accuracy: 0.8050, Precision: 0.6829, Rec