In [None]:
import pandas as pd
import numpy as np
import torch
import re
from sklearn.model_selection import train_test_split
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix,classification_report
import spacy
# Shared spaCy pipeline; clean_and_tokenize() below uses it for tokenisation
# and for the per-token stop-word / punctuation flags.
nlp = spacy.load("en_core_web_sm") 

In [None]:
# Load the dataset and keep only the raw text and its label column.
# (Dataset-1 stores its label in a `class` column; see the commented mapping below.)
df = pd.read_csv(r"E:\Cyberbullying\dataset\Dynamically Generated Hate Dataset v0.2.2.csv")
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the frame returned by read_csv (SettingWithCopyWarning).
df = df[['text', 'label']].copy()

# Binary label mapping
# df['label'] = df['class'].apply(lambda x: 0 if x == 2 else 1)  # Dataset-1
df['label'] = df['label'].map({'nothate': 0, 'hate': 1})     # Dataset-2

# .map() turns any value outside the mapping into NaN; fail loudly instead of
# letting NaN labels reach the stratified split and the loss function.
if df['label'].isna().any():
    raise ValueError("Found missing or unexpected labels; expected only 'nothate' / 'hate'.")
df['label'] = df['label'].astype(int)

# Rows without text cannot be tokenised; drop them and renumber the index.
df = df.dropna(subset=['text']).reset_index(drop=True)

def clean_and_tokenize(text):
    """Normalise a raw post and split it into content tokens.

    The text is lower-cased, URLs and punctuation characters are removed,
    whitespace is collapsed, and the result is tokenised with the
    module-level spaCy pipeline ``nlp``. Stop words, punctuation tokens and
    whitespace-only tokens are discarded.

    Args:
        text: Raw text. Non-string values (for example NaN from an empty
            CSV cell) are treated as empty text.

    Returns:
        list[str]: The remaining token strings, possibly empty.
    """
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    # Removing URLs and punctuation leaves runs of spaces behind, which spaCy
    # emits as separate whitespace tokens; collapse them before tokenising.
    text = re.sub(r"\s+", ' ', text).strip()
    doc = nlp(text)
    return [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

# Tokenise every row up front: clean_and_tokenize (and therefore the spaCy
# pipeline) runs once per text, and the token lists are stored in a new column.
df['tokens'] = df['text'].apply(clean_and_tokenize)

In [None]:
class SimpleVocab:
    """Token <-> index mapping built from tokenised documents.

    Index 0 is always ``<pad>`` and index 1 is always ``<unk>``. Every other
    token that occurs at least ``min_freq`` times is appended in
    first-occurrence order.

    Attributes:
        freqs: ``Counter`` with the raw frequency of every token seen.
        itos: List mapping index -> token.
        stoi: Dict mapping token -> index.
    """

    PAD_TOKEN = '<pad>'
    UNK_TOKEN = '<unk>'

    def __init__(self, token_lists, min_freq=2):
        """Count tokens and build the lookup tables.

        Args:
            token_lists: Iterable of token sequences (one per document).
            min_freq: Minimum corpus frequency for a token to get its own
                index; rarer tokens resolve to ``<unk>``.
        """
        self.freqs = Counter()
        for tokens in token_lists:
            self.freqs.update(tokens)

        specials = [self.PAD_TOKEN, self.UNK_TOKEN]
        # Skip corpus tokens that collide with the reserved names: a duplicate
        # entry in `itos` would remap '<pad>' / '<unk>' away from index 0 / 1.
        regular = [
            tok for tok, freq in self.freqs.items()
            if freq >= min_freq and tok not in specials
        ]
        self.itos = specials + regular
        self.stoi = {tok: idx for idx, tok in enumerate(self.itos)}

    def __getitem__(self, token):
        """Return the index of ``token``, or the ``<unk>`` index if unseen."""
        return self.stoi.get(token, self.stoi[self.UNK_TOKEN])

    def __len__(self):
        """Return the number of entries, including the two reserved tokens."""
        return len(self.itos)

# Build the vocabulary over every tokenised document, then cache the indices
# of the two reserved tokens for padding and out-of-vocabulary lookups.
vocab = SimpleVocab(df['tokens'].tolist(), min_freq=2)
PAD_IDX, UNK_IDX = vocab['<pad>'], vocab['<unk>']


In [None]:
def load_glove(path, vocab, dim=100, pad_token='<pad>'):
    """Build an embedding matrix for ``vocab`` from a GloVe text file.

    Only vectors for words present in ``vocab`` are parsed, so the full GloVe
    table is never held in memory. Words missing from the file keep a random
    normal initialisation; the padding row is all zeros.

    Args:
        path: Path to a whitespace-separated GloVe file (``word v1 ... vN``).
        vocab: Object exposing ``stoi`` (token -> row index) and ``__len__``.
        dim: Expected vector size. Lines with any other number of fields
            (including blank lines) are skipped.
        pad_token: Token whose row is zeroed, if it is in ``vocab.stoi``.

    Returns:
        torch.FloatTensor of shape ``(len(vocab), dim)``.
    """
    stoi = vocab.stoi
    # Start from the random fallback; rows found in the file are overwritten.
    embedding_matrix = np.random.normal(
        scale=0.6, size=(len(vocab), dim)
    ).astype(np.float32)

    with open(path, encoding='utf8') as f:
        for line in f:
            parts = line.split()
            # Guard against blank or malformed lines before indexing.
            if len(parts) != dim + 1:
                continue
            word = parts[0]
            if word in stoi:
                embedding_matrix[stoi[word]] = np.asarray(parts[1:], dtype=np.float32)

    # Padding carries no information, so its embedding should be zero.
    if pad_token in stoi:
        embedding_matrix[stoi[pad_token]] = 0.0

    return torch.tensor(embedding_matrix, dtype=torch.float32)


# Build the (len(vocab), 100) embedding table from the 100-dimensional GloVe file.
embedding_matrix = load_glove(r"E:\Cyberbullying\glove\glove.6B.100d.txt", vocab, dim=100)


In [None]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset of tokenised texts paired with binary labels.

    Args:
        texts: Indexable collection of token lists.
        labels: Indexable collection of numeric labels, same length as
            ``texts``.
        vocab: Optional token -> index mapping supporting ``vocab[token]``.
            When omitted, the module-level ``vocab`` is looked up at access
            time, which is what callers that pass only two arguments get.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, vocab=None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.vocab = vocab

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_indices, label)`` as a LongTensor and a float32 scalar tensor."""
        lookup = self.vocab if self.vocab is not None else vocab
        indices = [lookup[token] for token in self.texts[idx]]
        if not indices:
            # A text with no tokens left after cleaning would give a
            # zero-length sequence, which an LSTM cannot process; stand in a
            # single unknown token instead.
            indices = [lookup['<unk>']]
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float32),
        )

def collate_fn(batch):
    """Merge ``(token_indices, label)`` pairs into one padded batch.

    Args:
        batch: Sequence of ``(LongTensor, scalar tensor)`` pairs.

    Returns:
        tuple: ``(padded_texts, labels, lengths)`` where ``padded_texts`` is
        padded to the longest sequence with ``PAD_IDX``, ``labels`` is the
        stacked label tensors, and ``lengths`` is a LongTensor holding the
        length of each sequence before padding.
    """
    sequences, targets = zip(*batch)

    # Record the true lengths before padding hides them.
    seq_lengths = torch.tensor(list(map(len, sequences)), dtype=torch.long)
    padded = pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)
    stacked_targets = torch.stack(targets)  # stacking keeps the label dtype

    return padded, stacked_targets, seq_lengths


# Train/test split (80/20, stratified on the label).
# A fixed random_state keeps the split identical across kernel restarts, so
# metrics from different runs are computed on the same held-out rows.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['tokens'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist())
test_dataset = TextDataset(test_texts.tolist(), test_labels.tolist())

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [None]:
import torch.nn as nn

class BiLSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM classifier over pretrained embeddings.

    Args:
        embedding_matrix: FloatTensor of shape ``(vocab_size, embedding_dim)``
            used to initialise the (trainable) embedding layer.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of output units (1 for binary classification).
        dropout: Dropout probability applied to the concatenated final
            hidden states.
    """

    def __init__(self, embedding_matrix, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Return sigmoid probabilities for a padded batch.

        Args:
            text: LongTensor ``(batch, seq_len)`` of token indices.
            lengths: True (unpadded) length of each sequence in ``text``.

        Returns:
            Tensor of shape ``(batch,)`` when ``output_dim == 1``, otherwise
            ``(batch, output_dim)``.
        """
        embedded = self.embedding(text)
        if embedded.size(1) == 0:
            # Every sequence in the batch is empty: feed one zero vector per
            # example so the LSTM still has a time step to consume.
            embedded = embedded.new_zeros(embedded.size(0), 1, embedded.size(2))

        # pack_padded_sequence needs CPU int64 lengths that are all >= 1.
        seq_lengths = torch.as_tensor(lengths, dtype=torch.long, device='cpu').clamp(min=1)
        # Packing stops each sequence at its real last token, so the final
        # hidden states are not computed over padding positions.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, seq_lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden[-2] / hidden[-1] are the forward / backward final states.
        hidden_cat = torch.cat((hidden[-2], hidden[-1]), dim=1)
        out = self.dropout(hidden_cat)
        # squeeze(-1) removes only the output dimension; a bare squeeze()
        # would also collapse a batch of one to a 0-d tensor.
        return self.sigmoid(self.fc(out)).squeeze(-1)


In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Binary cross-entropy on probabilities (the model applies the sigmoid itself).
criterion = nn.BCELoss()

model = BiLSTMClassifier(embedding_matrix, hidden_dim=128, output_dim=1)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_model(model, loader):
    """Run one training epoch and report mean batch loss and accuracy.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Module called as ``model(texts, lengths)`` that returns one
            probability per example.
        loader: Iterable yielding ``(texts, labels, lengths)`` batches.

    Returns:
        tuple[float, float]: ``(mean loss per batch, accuracy)``. Both are
        NaN when the loader yields no examples.
    """
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    for texts, labels, lengths in loader:
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
        optimizer.zero_grad()
        # reshape(-1) keeps a 1-D (batch,) tensor even if the model squeezes
        # a single-example batch down to 0-d, which the loss would reject
        # for not matching the shape of `labels`.
        outputs = model(texts, lengths).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = (outputs >= 0.5).long()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    if total == 0:
        # Nothing was processed; avoid a ZeroDivisionError.
        return float('nan'), float('nan')

    acc = correct / total
    return total_loss / len(loader), acc



In [None]:
def eval_model(model, loader):
    """Score the model on ``loader`` without updating its weights.

    Uses the module-level ``criterion`` and ``device``.

    Args:
        model: Module called as ``model(texts, lengths)`` that returns one
            probability per example.
        loader: Iterable yielding ``(texts, labels, lengths)`` batches.

    Returns:
        tuple[float, float]: ``(mean loss per batch, accuracy)``. Both are
        NaN when the loader yields no examples.
    """
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for texts, labels, lengths in loader:
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            # reshape(-1) keeps a 1-D (batch,) tensor even if the model
            # squeezes a single-example batch down to 0-d.
            outputs = model(texts, lengths).reshape(-1)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            preds = (outputs >= 0.5).long()
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Nothing was processed; avoid a ZeroDivisionError.
        return float('nan'), float('nan')

    acc = correct / total
    return total_loss / len(loader), acc


In [None]:
# Per-epoch history, consumed by the plotting cell further down.
train_losses, train_accuracies, val_losses, val_accuracies = [], [], [], []

for epoch in range(1, 6):
    # One optimisation pass over the training data, then a scoring pass over
    # test_loader (reported as "Val" in the log line).
    train_loss, train_acc = train_model(model, train_loader)
    val_loss, val_acc = eval_model(model, test_loader)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch}: Train Loss {train_loss:.4f}, Acc {train_acc:.4f} | Val Loss {val_loss:.4f}, Acc {val_acc:.4f}")


In [None]:


def evaluate_model(model, loader):
    """Print a classification report, plot the confusion matrix, return metrics.

    Uses the module-level ``device``. Predictions are thresholded at 0.5.

    Args:
        model: Module called as ``model(texts, lengths)`` that returns one
            probability per example.
        loader: Iterable yielding ``(texts, labels, lengths)`` batches.

    Returns:
        tuple[float, float, float, float]: accuracy, precision, recall and F1
        for the positive class.
    """
    model.eval()
    preds = []
    true_labels = []

    with torch.no_grad():
        for texts, labels, lengths in loader:
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            # reshape(-1) keeps the output 1-D even for a single-example
            # batch, so it can always be extended into the prediction list.
            outputs = model(texts, lengths).reshape(-1)
            predicted = (outputs >= 0.5).long()
            preds.extend(predicted.cpu().tolist())
            # Labels arrive as floats (for the loss); metrics want class ids.
            true_labels.extend(labels.reshape(-1).long().cpu().tolist())

    # Calculate metrics
    acc = accuracy_score(true_labels, preds)
    prec = precision_score(true_labels, preds)
    rec = recall_score(true_labels, preds)
    f1 = f1_score(true_labels, preds)

    # Fixing the label set keeps the report and the matrix at two classes even
    # when one class is absent, so they always line up with the names below.
    class_ids = [0, 1]

    # Print classification report
    print("\n🔍 Classification Report:")
    print(classification_report(true_labels, preds, labels=class_ids,
                                target_names=["Non-Bullying", "Bullying"]))

    # Confusion Matrix
    cm = confusion_matrix(true_labels, preds, labels=class_ids)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted: Non-Bullying', 'Predicted: Bullying'],
                yticklabels=['Actual: Non-Bullying', 'Actual: Bullying'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

    # Return for logging
    return acc, prec, rec, f1


In [None]:

epochs = range(1, len(train_losses)+1)

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Loss plot
loss_ax.plot(epochs, train_losses, label='Train Loss')
loss_ax.plot(epochs, val_losses, label='Validation Loss')
loss_ax.set_title('Loss per Epoch')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Accuracy plot
acc_ax.plot(epochs, train_accuracies, label='Train Accuracy')
acc_ax.plot(epochs, val_accuracies, label='Validation Accuracy')
acc_ax.set_title('Accuracy per Epoch')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()


In [None]:


def plot_confusion_matrix(model, loader):
    """Plot the confusion-matrix heatmap of the model's predictions on ``loader``.

    Uses the module-level ``device``. Predictions are thresholded at 0.5.

    Args:
        model: Module called as ``model(texts, lengths)`` that returns one
            probability per example.
        loader: Iterable yielding ``(texts, labels, lengths)`` batches.
    """
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for texts, batch_labels, lengths in loader:
            texts, batch_labels, lengths = texts.to(device), batch_labels.to(device), lengths.to(device)
            # reshape(-1) keeps the output 1-D even for a single-example
            # batch, so it can always be extended into the prediction list.
            outputs = model(texts, lengths).reshape(-1)
            predicted = (outputs >= 0.5).long()
            preds.extend(predicted.cpu().tolist())
            # Labels arrive as floats (for the loss); the matrix wants class ids.
            labels.extend(batch_labels.reshape(-1).long().cpu().tolist())

    # Fixing the label set keeps the matrix 2x2 even when one class is absent,
    # so it always lines up with the two tick labels.
    cm = confusion_matrix(labels, preds, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-bullying', 'Bullying'], yticklabels=['Non-bullying', 'Bullying'])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

# After training: draw the confusion matrix for the held-out test loader.
plot_confusion_matrix(model, test_loader)
