In [None]:
import pandas as pd

# Read the validation split, keep only the text and gold-label columns, drop
# rows where either is missing, and expose the gold label under `label`
# (copied unchanged; presumably already a 0/1 value -- confirm against the CSV).
df = (
    pd.read_csv(r"E:\Cyberbullying\dataset\HatemojiBuild\validation.csv")
    .loc[:, ['text', 'label_gold']]
    .dropna()
    .assign(label=lambda frame: frame['label_gold'])
)

In [None]:
from transformers.utils import cached_file
# Re-fetch config.json for the emotion checkpoint with force_download=True, i.e.
# without reusing an existing cached copy (presumably a one-off repair for a
# stale/corrupt local cache -- confirm whether this cell is still needed).
cached_file("j-hartmann/emotion-english-distilroberta-base", "config.json", force_download=True)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "j-hartmann/emotion-english-distilroberta-base"

# Smoke test only: check that the emotion checkpoint can be loaded.
# The loaded objects get their own names so this cell never clobbers the
# `tokenizer` / `model` globals that the BERT tokenization and training cells
# define; re-running it out of order would otherwise swap in the wrong
# tokenizer/model silently.
try:
    emotion_tokenizer = AutoTokenizer.from_pretrained(model_name)
    emotion_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    print("✅ Model loaded successfully!")
except Exception as e:
    # Best-effort diagnostic: report the failure and let the notebook continue.
    print("❌ Model loading failed:", e)


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import emoji

# Lexicon-based sentiment scorer used for the emoji/sentiment feature.
vader = SentimentIntensityAnalyzer()

# Emotion classifier pipeline. `return_all_scores=True` makes each call return
# one score per label instead of only the top label. If loading fails, the
# analyzer stays None and the failure is only printed.
emotion_analyzer = None
try:
    emotion_analyzer = pipeline(
        task="text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        framework="pt",  # or "tf" if you use TensorFlow
        return_all_scores=True,
    )
except Exception as load_error:
    print("Failed to load emotion model:", load_error)

def extract_features(text):
    """Compute the auxiliary sentiment/emotion features for one text.

    Emojis are first replaced by their textual names (``emoji.demojize``) so
    that both analyzers see them as words.

    Args:
        text: The raw input string.

    Returns:
        A ``(vader_score, emotion_vector)`` tuple:
          * ``vader_score`` -- VADER ``compound`` score of the demojized text.
          * ``emotion_vector`` -- list with one score per emotion label, in the
            order the pipeline returns them. It is an empty list when the
            emotion pipeline failed to load (``emotion_analyzer`` is falsy),
            and an all-zero list of the same length as a normal vector when
            analysing this particular text raised.
    """
    # Step 1: Convert emojis to text
    text_with_emojis = emoji.demojize(text)

    # Step 2: VADER sentiment score
    vader_score = vader.polarity_scores(text_with_emojis)['compound']

    # Step 3: Emotion analysis (skipped entirely when no analyzer is available)
    if not emotion_analyzer:
        return vader_score, []

    try:
        emotions = emotion_analyzer(text_with_emojis)
        # emotions[0] holds the per-label dicts for the single input text.
        emotion_vector = [entry['score'] for entry in emotions[0]]
    except Exception as e:
        print("Emotion analysis failed:", e)
        # The placeholder must have exactly as many entries as the model has
        # labels; a hard-coded 6 produces ragged vectors whenever the
        # checkpoint exposes a different count (this one appears to expose 7
        # -- verify against its config), which breaks batching downstream.
        emotion_vector = [0.0] * emotion_analyzer.model.config.num_labels

    return vader_score, emotion_vector


In [None]:
from tqdm import tqdm

# Run the feature extractor once per text (with a progress bar), then split
# the (score, vector) pairs into two parallel lists aligned with df's rows.
extracted = [extract_features(text) for text in tqdm(df['text'])]

emoji_scores = [score for score, _vector in extracted]
emotion_vectors = [vector for _score, vector in extracted]

df['emoji_score'] = emoji_scores
df['emotion_vector'] = emotion_vectors

In [None]:
# Safety net: discard any row that is missing the text or either derived feature.
df = df.dropna(
    how='any',
    subset=['text', 'emoji_score', 'emotion_vector'],
)


In [None]:
from transformers import BertTokenizer

# BERT word-piece tokenizer; this `tokenizer` global is reused when the
# train/test datasets are built.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode every text in the frame: truncate to at most 128 tokens and pad the
# batch to a common length.
encodings = tokenizer(
    df['text'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
)

input_ids, attention_mask = encodings['input_ids'], encodings['attention_mask']


In [None]:
import torch
from torch.utils.data import Dataset

class CyberbullyingFusionDataset(Dataset):
    """Dataset pairing tokenized text with the auxiliary emoji/emotion features.

    All five constructor arguments are index-aligned sequences; the dataset
    length is taken from ``labels``. Items are converted to tensors lazily,
    one sample at a time, in ``__getitem__``.
    """

    def __init__(self, input_ids, attention_masks, emoji_scores, emotion_vectors, labels):
        (
            self.input_ids,
            self.attention_masks,
            self.emoji_scores,
            self.emotion_vectors,
            self.labels,
        ) = (input_ids, attention_masks, emoji_scores, emotion_vectors, labels)

    def __len__(self):
        """Number of samples, defined by the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Token ids and attention mask are ``torch.long``; the emoji score,
        emotion vector and label are ``torch.float32``.
        """
        # (output key, backing sequence, tensor dtype) for every field of a sample
        field_specs = (
            ('input_ids', self.input_ids, torch.long),
            ('attention_mask', self.attention_masks, torch.long),
            ('emoji_score', self.emoji_scores, torch.float32),
            ('emotion_vector', self.emotion_vectors, torch.float32),
            ('labels', self.labels, torch.float32),
        )
        return {
            key: torch.tensor(values[idx], dtype=dtype)
            for key, values, dtype in field_specs
        }


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the label column, with a fixed seed.
train_data, test_data = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)


def _build_fusion_dataset(frame, max_length=128):
    """Tokenize ``frame['text']`` once and wrap it with the auxiliary features.

    Args:
        frame: DataFrame with ``text``, ``emoji_score``, ``emotion_vector``
            and ``label`` columns.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        A ``CyberbullyingFusionDataset`` over the rows of ``frame``.
    """
    # A single tokenizer call yields both the ids and the attention mask;
    # previously each split was tokenized twice to obtain them separately.
    encoded = tokenizer(list(frame['text']), truncation=True, padding=True, max_length=max_length)
    return CyberbullyingFusionDataset(
        input_ids=encoded['input_ids'],
        attention_masks=encoded['attention_mask'],
        emoji_scores=list(frame['emoji_score']),
        emotion_vectors=list(frame['emotion_vector']),
        labels=list(frame['label']),
    )


train_dataset = _build_fusion_dataset(train_data)
test_dataset = _build_fusion_dataset(test_data)

from torch.utils.data import DataLoader

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class BERTEmojiEmotionClassifier(nn.Module):
    """BERT text encoder fused with a scalar emoji score and an emotion vector.

    The first-token ([CLS]) embedding from BERT is concatenated with small
    linear projections of the two auxiliary features and passed through an
    MLP ending in a sigmoid, so the output is a probability per sample.

    Args:
        emotion_dim: Length of the emotion vector given to ``forward``.
        hidden_dim: Width of the hidden layer in the classifier head.
        dropout: Dropout probability applied in the classifier head.
    """

    def __init__(self, emotion_dim=6, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Take the encoder width from the loaded config instead of assuming 768.
        bert_dim = self.bert.config.hidden_size

        # Project the auxiliary features before fusion.
        self.emoji_proj = nn.Linear(1, 8)
        self.emotion_proj = nn.Linear(emotion_dim, 32)

        # Fusion + classifier
        self.classifier = nn.Sequential(
            nn.Linear(bert_dim + 8 + 32, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask, emoji_score, emotion_vector):
        """Return one probability per sample as a 1-D tensor of shape ``[B]``.

        Args:
            input_ids: Token ids, indexed as ``[B, T]`` by the encoder.
            attention_mask: Attention mask matching ``input_ids``.
            emoji_score: One scalar per sample, shape ``[B]`` (it is
                unsqueezed to ``[B, 1]`` here).
            emotion_vector: Per-sample emotion scores, shape
                ``[B, emotion_dim]``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0]  # [CLS] token

        emoji_feat = self.emoji_proj(emoji_score.unsqueeze(1))  # shape: [B, 8]
        emotion_feat = self.emotion_proj(emotion_vector)        # shape: [B, 32]

        combined = torch.cat((cls_embedding, emoji_feat, emotion_feat), dim=1)
        out = self.classifier(combined)  # shape: [B, 1]
        # Drop only the trailing feature axis. A bare squeeze() would also
        # remove the batch axis when B == 1, yielding a 0-d tensor that no
        # longer matches the [1]-shaped label tensor expected by the loss.
        return out.squeeze(-1)


In [None]:
# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Size the emotion projection from the first training row's vector length.
emotion_dim = len(train_data['emotion_vector'].iloc[0])
model = BERTEmojiEmotionClassifier(emotion_dim=emotion_dim)
model = model.to(device)

# Binary cross-entropy on probabilities, optimised with AdamW.
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

def train_one_epoch(model, loader):
    """Run one optimisation pass over ``loader``.

    Uses the module-level ``device``, ``optimizer`` and ``criterion``.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            emoji_score, emotion_vector)`` and returning per-sample
            probabilities.
        loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``emoji_score``, ``emotion_vector`` and
            ``labels``) exposing ``.dataset`` and ``len()``.

    Returns:
        ``(mean_loss, accuracy)`` -- the loss averaged over batches and the
        fraction of samples whose thresholded (>= 0.5) prediction equals the
        label.
    """
    model.train()
    total_loss, correct = 0, 0

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        emoji_score = batch['emoji_score'].to(device)
        emotion_vector = batch['emotion_vector'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Flatten to 1-D so that a batch holding a single sample (which can
        # come back as a 0-d tensor) still matches the [B]-shaped labels;
        # otherwise the loss rejects the shape mismatch.
        outputs = model(input_ids, attention_mask, emoji_score, emotion_vector).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = (outputs >= 0.5).float()
        correct += (preds == labels).sum().item()

    acc = correct / len(loader.dataset)
    return total_loss / len(loader), acc


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, loader):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Uses the module-level ``device`` and ``criterion``.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            emoji_score, emotion_vector)`` and returning per-sample
            probabilities.
        loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``emoji_score``, ``emotion_vector`` and
            ``labels``) supporting ``len()``.

    Returns:
        ``(accuracy, precision, recall, f1, mean_loss, all_preds,
        all_labels)`` where the loss is averaged over batches and the two
        lists hold the per-sample thresholded (>= 0.5) predictions and the
        labels, in loader order.
    """
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emoji_score = batch['emoji_score'].to(device)
            emotion_vector = batch['emotion_vector'].to(device)
            labels = batch['labels'].to(device)

            # Flatten to 1-D so that a batch holding a single sample (which
            # can come back as a 0-d tensor) still matches the labels' shape
            # and yields an iterable prediction array for extend() below.
            outputs = model(input_ids, attention_mask, emoji_score, emotion_vector).reshape(-1)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = (outputs >= 0.5).long().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    return acc, prec, rec, f1, total_loss / len(loader), all_preds, all_labels


In [None]:
# Per-epoch history, consumed by the plotting cell.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

for epoch in range(1, 6):
    train_loss, train_acc = train_one_epoch(model, train_loader)
    # Only the five scalar metrics are needed here; the trailing
    # prediction/label lists are discarded.
    acc, prec, rec, f1, val_loss = evaluate_model(model, test_loader)[:5]

    for history, value in (
        (train_losses, train_loss),
        (val_losses, val_loss),
        (train_accuracies, train_acc),
        (val_accuracies, acc),
    ):
        history.append(value)

    print(f"Epoch {epoch}:")
    print(f"  Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"  Val   Loss: {val_loss:.4f}, Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")


In [None]:
import matplotlib.pyplot as plt

# One x-position per recorded epoch, starting at 1.
epochs = range(1, len(train_losses) + 1)

# Side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Loss plot
loss_ax.plot(epochs, train_losses, label='Train Loss')
loss_ax.plot(epochs, val_losses, label='Val Loss')
loss_ax.set_title('Loss per Epoch')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Accuracy plot
acc_ax.plot(epochs, train_accuracies, label='Train Acc')
acc_ax.plot(epochs, val_accuracies, label='Val Acc')
acc_ax.set_title('Accuracy per Epoch')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Final pass over the test loader; keep the per-sample predictions and labels
# for the confusion matrix.
acc, prec, rec, f1, _, y_pred, y_true = evaluate_model(model, test_loader)

cm = confusion_matrix(y_true, y_pred)

# Rows are the actual classes, columns the predicted ones.
class_names = ['Non-Bullying', 'Bullying']
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Proposed Model")
plt.show()
