In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import numpy as np  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Fraction of the rows passed as `test_size` to train_test_split.
TEST_SIZE = 0.2
# NOTE(review): absolute, machine-specific Windows path. The constant is named
# "VALIDATION" but the file it points at is train.csv — confirm this is intended.
HATEMOJI_VALIDATION_PATH = r"E:\Cyberbullying\dataset\raw\HatemojiBuild\train.csv"
# Directory the trained model's state_dict is written into at the end of the experiment.
EMOTION_FUSION_MODEL_OUTPUT_DIR = './results/emotion_fusion_model'
# Seed handed to train_test_split as `random_state`.
RANDOM_STATE= 42
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [11]:
def load_hatemoji_validation_dataset(path=HATEMOJI_VALIDATION_PATH):
    """Read the Hatemoji CSV and return its text/label columns.

    Only the ``text`` and ``label_gold`` columns are kept, rows with a missing
    value in either of them are dropped, and ``label_gold`` is exposed under
    the name ``label``. The source path and the label counts are printed.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A DataFrame with the columns ``text`` and ``label``.
    """
    hatemoji = (
        pd.read_csv(path)[['text', 'label_gold']]
        .dropna()
        .rename(columns={'label_gold': 'label'})  # align column name with the rest of the notebook
    )

    print(f"Loaded Hatemoji validation dataset from {path}")
    print("Hatemoji Label distribution:\n", hatemoji['label'].value_counts())
    return hatemoji

In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import emoji
from tqdm.auto import tqdm # Use tqdm.auto for notebook/script compatibility

# Module-level VADER analyzer; extract_features reuses it for every text.
vader_analyzer = SentimentIntensityAnalyzer()
# Filled in lazily by initialize_emotion_analyzer(); remains None until the
# emotion model has been loaded (and is reset to None if loading fails).
emotion_pipeline = None


def initialize_emotion_analyzer():
    """Populate the module-level ``emotion_pipeline`` if it is still unset.

    Does nothing when a pipeline is already stored. Otherwise it tries to build
    a ``text-classification`` pipeline for the emotion model and reports the
    outcome on stdout. Any failure is caught and leaves ``emotion_pipeline``
    as ``None``, so the next call will attempt the load again.
    """
    global emotion_pipeline
    if emotion_pipeline is not None:
        return

    pipeline_options = {
        "model": "j-hartmann/emotion-english-distilroberta-base",
        "return_all_scores": True,  # one score per label instead of only the top label
        "framework": "pt",
    }
    try:
        emotion_pipeline = pipeline("text-classification", **pipeline_options)
        print("✅ Emotion analysis pipeline loaded successfully!")
    except Exception as load_error:
        print(f"❌ Failed to load emotion model: {load_error}")
        emotion_pipeline = None

import numpy as np

# Fixed label order used to lay out the emotion feature vector. extract_features
# looks each label up by name in the pipeline output, so a label missing from
# that output contributes 0.0.
# NOTE(review): presumably the full label set of the emotion model — verify.
EMOTION_LABELS = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

def extract_features(text, max_length=128):
    """Compute the VADER compound score and the emotion vector for one text.

    Args:
        text: Input text. Non-string values are coerced to ``str`` (``None``
            and NaN become the empty string) so that one malformed row cannot
            crash the extraction or its error reporting.
        max_length: Maximum number of tokens kept before the text is handed to
            the emotion pipeline.

    Returns:
        A tuple ``(vader_score, emotion_vector)``. ``vader_score`` is VADER's
        ``compound`` value, or ``0.0`` if preprocessing/scoring raised.
        ``emotion_vector`` is a list with one score per entry of
        ``EMOTION_LABELS`` (same order); it is all zeros when the emotion
        pipeline is unavailable or raises.
    """
    initialize_emotion_analyzer()

    # Normalise the input once: the handlers below slice the text for logging,
    # which would itself raise on a float/None and escape the except block.
    if not isinstance(text, str):
        is_missing = text is None or (isinstance(text, float) and np.isnan(text))
        text = "" if is_missing else str(text)

    # Bound before the try block so the emotion branch always has an input,
    # even when emoji preprocessing fails part-way.
    text_with_emojis = text
    try:
        text = limit_emoji_repeats(text)
        text_with_emojis = emoji.demojize(text)

        # VADER sentiment
        vader_score = vader_analyzer.polarity_scores(text_with_emojis)['compound']
    except Exception as e:
        print(f"[❌ VADER failed] {text[:80]}... | {e}")
        vader_score = 0.0

    emotion_vector = [0.0] * len(EMOTION_LABELS)

    if emotion_pipeline is not None:
        try:
            # Truncate long text with the pipeline's own tokenizer, then decode
            # back to a string because the pipeline is called with raw text.
            tokenizer = emotion_pipeline.tokenizer
            if tokenizer is not None:
                encoded = tokenizer(text_with_emojis, truncation=True, max_length=max_length, return_tensors="pt")
                decoded = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)
            else:
                decoded = text_with_emojis

            emotions = emotion_pipeline(decoded)

            # The code below expects a list whose first element is the list of
            # {label, score} dicts for this text.
            if not emotions or not isinstance(emotions[0], list):
                raise ValueError("Empty or invalid emotion output.")

            # Index scores by label so the vector order never depends on the
            # order in which the pipeline reports them.
            emotion_scores = {entry['label']: entry['score'] for entry in emotions[0]}
            emotion_vector = [emotion_scores.get(label, 0.0) for label in EMOTION_LABELS]

            # Reject NaN/inf scores rather than passing them downstream.
            if not np.all(np.isfinite(emotion_vector)):
                raise ValueError("Invalid emotion vector content.")

        except Exception as e:
            print(f"[⚠️ Emotion extraction failed] {text[:80]}... | Reason: {e}")
            emotion_vector = [0.0] * len(EMOTION_LABELS)

    return vader_score, emotion_vector

def process_texts_for_emotion_features(df, text_column='text'):
    """Add sentiment and emotion features to a copy of ``df`` and drop bad rows.

    ``extract_features`` is run on every value of ``df[text_column]``. The
    results are stored in two new columns, ``emoji_score`` and
    ``emotion_vector``. Rows are then kept only if the score is a non-NaN
    number and the vector is a list of finite values whose length equals the
    most common vector length in the frame.

    Args:
        df: Input DataFrame. It is not modified; the features are added to a copy.
        text_column: Name of the column holding the texts.

    Returns:
        A new DataFrame containing the retained rows plus the two feature columns.
    """
    emoji_scores = []
    emotion_vectors = []

    print("🔍 Extracting emotion and sentiment features...")
    for t in tqdm(df[text_column], desc="Extracting Emotion Features"):
        score, vector = extract_features(t)
        emoji_scores.append(score)
        emotion_vectors.append(vector)

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    df['emoji_score'] = emoji_scores
    df['emotion_vector'] = emotion_vectors

    # --- Determine expected emotion vector length dynamically ---
    vector_lengths = df['emotion_vector'].apply(lambda x: len(x) if isinstance(x, list) else -1)
    list_lengths = vector_lengths[vector_lengths != -1]
    # Guard on the filtered series: mode() of an empty series has no first element.
    most_common_length = int(list_lengths.mode().iloc[0]) if not list_lengths.empty else 0

    print(f"✅ Detected most common emotion vector length: {most_common_length}")

    # --- Clean data ---
    # astype(bool) keeps the masks boolean even for an empty frame, where
    # apply() would otherwise return a non-boolean series that pandas treats
    # as a column selector instead of a row mask.
    score_ok = df['emoji_score'].apply(
        lambda x: isinstance(x, (int, float)) and not np.isnan(x)
    ).astype(bool)
    vector_ok = df['emotion_vector'].apply(
        lambda x: isinstance(x, list)
        and len(x) == most_common_length
        and all(np.isfinite(xi) for xi in x)
    ).astype(bool)
    df = df[score_ok & vector_ok]

    print(f"🧹 After filtering: {df.shape[0]} valid rows retained.")
    return df

import re

def limit_emoji_repeats(text, max_repeat=5):
    """Cap every run of one repeated emoji at ``max_repeat`` occurrences.

    A run is the same emoji code point (from the emoticon, pictograph and
    transport ranges listed below) repeated back-to-back. Runs of at most
    ``max_repeat`` characters, and all other text, are left untouched.

    Args:
        text: String to clean.
        max_repeat: Maximum number of consecutive copies of one emoji to keep.

    Returns:
        ``text`` with over-long emoji runs shortened.

    Raises:
        ValueError: If ``max_repeat`` is negative.
    """
    if max_repeat < 0:
        raise ValueError("max_repeat must be non-negative")

    emoji_class = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF]'
    # Capture exactly ONE emoji and require at least `max_repeat` further copies
    # of it. Capturing a greedy group of several emojis and repeating that
    # group in the replacement makes long runs longer instead of shorter.
    run_pattern = '(' + emoji_class + r')\1{' + str(max_repeat) + ',}'
    return re.sub(run_pattern, lambda run: run.group(1) * max_repeat, text)


In [13]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Define the dataset class
class CyberbullyingFusionDataset(Dataset):
    """Index-aligned view over token ids, masks, extra features and labels.

    The five sequences passed to the constructor are stored as given and are
    indexed with the same position; the dataset length is ``len(labels)``.
    Each item is a dict of freshly built tensors.
    """

    # (key in the returned dict, attribute holding the data, tensor dtype).
    # Labels are float because the training loop uses BCELoss.
    _FIELDS = (
        ('input_ids', 'input_ids', torch.long),
        ('attention_mask', 'attention_masks', torch.long),
        ('emoji_score', 'emoji_scores', torch.float),
        ('emotion_vector', 'emotion_vectors', torch.float),
        ('labels', 'labels', torch.float),
    )

    def __init__(self, input_ids, attention_masks, emoji_scores, emotion_vectors, labels):
        self.input_ids, self.attention_masks = input_ids, attention_masks
        self.emoji_scores, self.emotion_vectors = emoji_scores, emotion_vectors
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        return {
            key: torch.tensor(getattr(self, attribute)[idx], dtype=dtype)
            for key, attribute, dtype in self._FIELDS
        }

# Define the fusion model
class BERTEmojiEmotionClassifier(nn.Module):
    """Binary classifier fusing a text encoder with two hand-crafted features.

    Three branches are concatenated: the encoder's hidden state at position 0,
    a 64-wide projection of the scalar emoji score, and a 128-wide projection
    of the emotion vector. The result goes through dropout, a 256-wide linear
    layer, a 1-unit linear layer and a sigmoid.

    ``forward`` returns sigmoid outputs of shape ``(batch,)``, not logits, so
    it must be paired with ``BCELoss`` rather than ``BCEWithLogitsLoss``.

    NOTE(review): there is no non-linearity between the linear layers of the
    fusion head; confirm that is intended.

    Args:
        bert_model_name: Name passed to ``AutoModel.from_pretrained``.
        emotion_dim: Length of the emotion vector fed to ``forward``.
        dropout_prob: Dropout probability applied to the concatenated features.
    """

    def __init__(self, bert_model_name='bert-base-uncased', emotion_dim=768, dropout_prob=0.1):
        super().__init__()
        emoji_width, emotion_width, fused_width = 64, 128, 256

        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout_prob)

        # Read the encoder width from its config instead of assuming a value.
        text_width = self.bert.config.hidden_size

        self.emoji_fc = nn.Linear(1, emoji_width)
        self.emotion_fc = nn.Linear(emotion_dim, emotion_width)
        self.fusion_fc = nn.Linear(text_width + emoji_width + emotion_width, fused_width)
        self.classifier = nn.Linear(fused_width, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, emoji_score, emotion_vector):
        """Return one sigmoid output per sample, shape ``(batch,)``."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        branches = (
            encoder_out.last_hidden_state[:, 0, :],   # hidden state of the first token
            self.emoji_fc(emoji_score.unsqueeze(1)),  # (batch,) -> (batch, 1) before projecting
            self.emotion_fc(emotion_vector),
        )
        fused = self.fusion_fc(self.dropout(torch.cat(branches, dim=1)))
        probabilities = self.sigmoid(self.classifier(fused))
        return probabilities.squeeze(1)


# Training function
def train_fusion_model_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    For each batch the four feature tensors and the labels are moved to
    ``device``, the model is called with the features in positional order
    (input ids, attention mask, emoji score, emotion vector), and one
    optimizer step is taken on ``criterion(outputs, labels)``. Outputs above
    0.5 count as the positive class when measuring accuracy.

    Returns:
        ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    model.train()
    feature_keys = ('input_ids', 'attention_mask', 'emoji_score', 'emotion_vector')
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for batch in dataloader:
        features = [batch[key].to(device) for key in feature_keys]
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        probs = model(*features)
        batch_loss = criterion(probs, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        hard_preds = (probs > 0.5).float()
        n_correct += (hard_preds == targets).sum().item()
        n_seen += targets.size(0)

    return loss_sum / len(dataloader), n_correct / n_seen

# Evaluation function
def evaluate_fusion_model(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating it.

    The model is switched to eval mode and run under ``torch.no_grad()``.
    Outputs above 0.5 are counted as the positive class.

    Returns:
        ``(mean loss per batch, predictions, labels)`` where the last two are
        integer NumPy arrays in dataloader order.
    """
    model.eval()
    feature_keys = ('input_ids', 'attention_mask', 'emoji_score', 'emotion_vector')
    loss_sum = 0
    pred_values, label_values = [], []

    with torch.no_grad():
        for batch in dataloader:
            features = [batch[key].to(device) for key in feature_keys]
            targets = batch['labels'].to(device)

            probs = model(*features)
            loss_sum += criterion(probs, targets).item()

            pred_values.extend((probs > 0.5).float().cpu().tolist())
            label_values.extend(targets.cpu().tolist())

    mean_loss = loss_sum / len(dataloader)

    # Values are exactly 0.0/1.0 floats; sklearn metrics want integer classes.
    predictions_int = np.array(pred_values).astype(int)
    labels_int = np.array(label_values).astype(int)
    return mean_loss, predictions_int, labels_int

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def print_classification_metrics(y_true, y_pred, dataset_name="Dataset"):
    """Print accuracy, precision, recall and F1, then the full classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        dataset_name: Label used in the printed headings.
    """
    # All four scores are computed before anything is printed.
    acc, prec, rec, f1_value = (
        metric(y_true, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    summary_lines = [
        f"\n--- Metrics for {dataset_name} ---",
        f"Accuracy:  {acc:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall:    {rec:.4f}",
        f"F1-Score:  {f1_value:.4f}",
    ]
    print("\n".join(summary_lines))
    print(f"\nClassification Report of {dataset_name}:\n", classification_report(y_true, y_pred))

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix", save_path=None):
    """Plot a 2x2 confusion matrix heatmap and optionally save it.

    Args:
        y_true: Ground-truth labels, encoded as 0 (non-bullying) / 1 (bullying).
        y_pred: Predicted labels, same encoding.
        title: Figure title.
        save_path: Where to write the figure. When it is a path, missing
            parent directories are created first. Nothing is saved if falsy.
    """
    import os  # local import: keeps this block self-contained

    class_names = ['Non-bullying', 'Bullying']
    # Pin the label set so the matrix is always 2x2. Without it, inputs that
    # contain a single class give a 1x1 matrix that no longer matches the two
    # tick labels below.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)

    if save_path:
        # savefig does not create directories; do it here so a missing folder
        # does not abort the run. File-like targets are passed through as-is.
        if isinstance(save_path, (str, os.PathLike)):
            os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
        fig.savefig(save_path)
        print(f"Saved confusion matrix to {save_path}")
    plt.show()
    plt.close(fig)

def plot_training_history(train_losses, val_losses, train_accuracies, val_accuracies, title_prefix="", save_path=None):
    """Plot per-epoch loss and accuracy curves side by side.

    Args:
        train_losses: Training loss per epoch; its length defines the epoch axis.
        val_losses: Validation loss per epoch.
        train_accuracies: Training accuracy per epoch.
        val_accuracies: Validation accuracy per epoch.
        title_prefix: Text placed before "Loss per Epoch" / "Accuracy per Epoch".
        save_path: Where to write the figure. When it is a path, missing
            parent directories are created first. Nothing is saved if falsy.
    """
    import os  # local import: keeps this block self-contained

    epochs = range(1, len(train_losses) + 1)

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Loss plot
    loss_ax.plot(epochs, train_losses, label='Train Loss')
    loss_ax.plot(epochs, val_losses, label='Validation Loss')
    loss_ax.set_title(f'{title_prefix} Loss per Epoch')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # Accuracy plot
    acc_ax.plot(epochs, train_accuracies, label='Train Accuracy')
    acc_ax.plot(epochs, val_accuracies, label='Validation Accuracy')
    acc_ax.set_title(f'{title_prefix} Accuracy per Epoch')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()

    fig.tight_layout()
    if save_path:
        # savefig does not create directories. This function is typically
        # called after training, so a missing folder must not raise here.
        if isinstance(save_path, (str, os.PathLike)):
            os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
        fig.savefig(save_path)
        print(f"Saved training history plot to {save_path}")
    plt.show()
    plt.close(fig)

In [15]:

def _build_fusion_dataset(frame, tokenizer, max_length):
    """Tokenize ``frame['text']`` once and wrap it, with the extra features, in a dataset."""
    encoded = tokenizer(list(frame['text']), truncation=True, padding=True, max_length=max_length)
    return CyberbullyingFusionDataset(
        input_ids=encoded['input_ids'],
        attention_masks=encoded['attention_mask'],
        emoji_scores=list(frame['emoji_score']),
        emotion_vectors=list(frame['emotion_vector']),
        labels=list(frame['label']),
    )


def run_emotion_fusion_model_experiment(epochs=5, batch_size=16, learning_rate=2e-5, max_length=128):
    """Train and evaluate the emotion fusion model end to end.

    Steps: load the dataset, extract sentiment/emotion features, clean the
    rows, make a stratified train/test split, train for ``epochs`` epochs
    (evaluating on the held-out split after each one), plot the history and
    the confusion matrix, and save the model's ``state_dict``.

    Args:
        epochs: Number of training epochs.
        batch_size: Batch size for both data loaders.
        learning_rate: Learning rate for AdamW.
        max_length: Maximum token length used when tokenizing the texts.

    Returns:
        None. Returns early (after printing an error) if cleaning leaves no rows.
    """
    import os  # local import: keeps this block self-contained

    print("\n" + "="*80)
    print("                Running Emotion Fusion Model Experiment               ")
    print("="*80 + "\n")

    # Create the output folders up front. Otherwise a missing folder only
    # surfaces in savefig/torch.save, i.e. after the whole training run.
    os.makedirs("./results", exist_ok=True)
    os.makedirs(EMOTION_FUSION_MODEL_OUTPUT_DIR, exist_ok=True)

    # Seed torch so weight initialisation, dropout and batch shuffling start
    # from a known state (GPU kernels may still be non-deterministic).
    torch.manual_seed(RANDOM_STATE)

    # Derive the vector length from the label list instead of hard-coding it.
    expected_dim = len(EMOTION_LABELS)

    df_fusion = load_hatemoji_validation_dataset()
    print(f"🔹 Loaded dataset: {df_fusion.shape[0]} rows")

    df_fusion = process_texts_for_emotion_features(df_fusion)
    print(f"🔹 After emotion feature extraction: {df_fusion.shape}")

    # Debug: check issues with emotion vectors
    zero_vectors = df_fusion['emotion_vector'].apply(lambda x: isinstance(x, list) and np.all(np.array(x) == 0.0)).sum()
    invalid_lengths = df_fusion['emotion_vector'].apply(lambda x: not isinstance(x, list) or len(x) != expected_dim).sum()
    print(f"⚠️  Rows with all-zero emotion vectors: {zero_vectors}")
    print(f"❌ Rows with invalid-length vectors: {invalid_lengths}")
    print(f"✅ Rows with valid non-zero vectors: {df_fusion.shape[0] - zero_vectors - invalid_lengths}")

    # Filter only valid rows
    df_fusion = df_fusion[df_fusion['text'].notnull() & df_fusion['text'].str.strip().astype(bool)]
    df_fusion = df_fusion[df_fusion['emoji_score'].notnull()]
    df_fusion = df_fusion[df_fusion['emotion_vector'].apply(lambda x: isinstance(x, list) and len(x) == expected_dim)]
    print(f"🧹 After cleaning: {df_fusion.shape[0]} rows remaining")

    if df_fusion.empty:
        print("❌ ERROR: All rows dropped during cleaning. Check emotion model or data quality.")
        return

    # Tokenizer sanity check
    print("\n✅ Tokenizer input check:")
    print(f"Number of texts: {len(df_fusion['text'])}")
    print(f"Example text: {df_fusion['text'].iloc[0]}")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    train_data, test_data = train_test_split(
        df_fusion, test_size=TEST_SIZE, stratify=df_fusion['label'], random_state=RANDOM_STATE)

    # Each split is tokenized exactly once; padding is per split.
    train_dataset = _build_fusion_dataset(train_data, tokenizer, max_length)
    test_dataset = _build_fusion_dataset(test_data, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = BERTEmojiEmotionClassifier(emotion_dim=expected_dim).to(device)
    # The model already applies a sigmoid, so plain BCELoss is the matching loss.
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    print("\n--- Training Emotion Fusion Model ---")
    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_fusion_model_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, y_pred, y_true = evaluate_fusion_model(model, test_loader, criterion, device)
        acc_val = accuracy_score(y_true, y_pred)
        prec_val = precision_score(y_true, y_pred)
        rec_val = recall_score(y_true, y_pred)
        f1_val = f1_score(y_true, y_pred)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(acc_val)

        print(f"Epoch {epoch}:")
        print(f"  Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
        print(f"  Val   Loss: {val_loss:.4f}, Accuracy: {acc_val:.4f}, Precision: {prec_val:.4f}, Recall: {rec_val:.4f}, F1: {f1_val:.4f}")

    plot_training_history(train_losses, val_losses, train_accuracies, val_accuracies,
                          "Emotion Fusion Model", save_path="./results/emotion_fusion_training_history.png")

    print("\n--- Evaluating Emotion Fusion Model ---")
    _, y_pred_fusion, y_true_fusion = evaluate_fusion_model(model, test_loader, criterion, device)
    print_classification_metrics(y_true_fusion, y_pred_fusion, "Emotion Fusion Model")
    plot_confusion_matrix(y_true_fusion, y_pred_fusion, "Confusion Matrix - Proposed Model",
                          save_path="./results/emotion_fusion_confusion_matrix.png")

    # Save the trained model
    torch.save(model.state_dict(), EMOTION_FUSION_MODEL_OUTPUT_DIR + "/emotion_fusion_model.pth")
    print(f"✅ Emotion Fusion model saved to {EMOTION_FUSION_MODEL_OUTPUT_DIR}/emotion_fusion_model.pth")


In [18]:
# Entry point: runs the whole experiment (load, feature extraction, training,
# evaluation, plots, model save) with the function's default settings.
run_emotion_fusion_model_experiment()


                Running Emotion Fusion Model Experiment               

Loaded Hatemoji validation dataset from E:\Cyberbullying\dataset\raw\HatemojiBuild\train.csv
Hatemoji Label distribution:
 label
1    2387
0    2341
Name: count, dtype: int64
🔹 Loaded dataset: 4728 rows
🔍 Extracting emotion and sentiment features...


Extracting Emotion Features:  24%|██▍       | 1154/4728 [01:00<03:46, 15.76it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
Extracting Emotion Features:  24%|██▍       | 1156/4728 [01:01<03:35, 16.57it/s]

Emotion analysis failed: index out of range in self
Emotion analysis failed: The expanded size of the tensor (524) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 524].  Tensor sizes: [1, 514]


Extracting Emotion Features: 100%|██████████| 4728/4728 [04:52<00:00, 16.15it/s]


✅ Detected most common emotion vector length: 7
🧹 After filtering: 4726 valid rows retained.
🔹 After emotion feature extraction: (4726, 4)
⚠️  Rows with all-zero emotion vectors: 0
❌ Rows with invalid-length vectors: 0
✅ Rows with valid non-zero vectors: 4726
🧹 After cleaning: 4726 rows remaining

✅ Tokenizer input check:
Number of texts: 4726
Example text: I would love to 🔪 some 👳🏾

--- Training Emotion Fusion Model ---


KeyboardInterrupt: 