In [None]:
import transformers
import torch
import pandas as pd
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, Trainer, TrainingArguments, BartForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput
from torch.optim import Adam
from accelerate import Accelerator
import wandb
from tqdm import tqdm
from torch.optim import AdamW
import torch.nn as nn 

import torch
from tqdm import tqdm
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

In [None]:
from evaluate import load
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# The `evaluate`-based METEOR loader is disabled; METEOR is computed further
# down with nltk's `meteor_score` instead.
# meteor = load("meteor")
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# NOTE(review): `accelerator` is not referenced by any other cell visible in
# this notebook - confirm whether it is still needed.
accelerator = Accelerator()

In [None]:
# Read the train / test / validation splits; the relative paths are resolved
# against the notebook's working directory.
training_data, testing_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Dataset/train.csv',
        '../../Dataset/test.csv',
        '../../Dataset/validation.csv',
    )
)

In [None]:
class DialoGPTDataset(Dataset):
    """Hate-speech / counterspeech pairs with their intent labels.

    `data` must provide the columns ``hatespeech``, ``counterspeech`` and
    ``csType``. Each item is a dict holding the tokenized hate speech, the
    tokenized counterspeech, the intent id of that row, the raw hate-speech
    text and every intent id that occurs for that hate-speech text.
    """

    def __init__(self, data):
        # Fresh 0..n-1 index so positional lookups line up with len().
        self.data = data.reset_index(drop=True)
        # hateBERT tokenizer for the input side, BART tokenizer for the target side.
        self.tokenizer = AutoTokenizer.from_pretrained("GroNLP/hateBERT")
        self.bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

        # Lower-cased intent name -> integer label.
        self.categories = dict(informative=0, questioning=1, denouncing=2, positive=3)

        def labels_in(group):
            # Distinct intent labels of one hate-speech text, in order of first appearance.
            return [self.categories[name.lower()] for name in group.unique()]

        # Hate-speech text -> list of all intent labels annotated for it.
        per_text_types = data.groupby("hatespeech")["csType"]
        self.intent_map = per_text_types.apply(labels_in).to_dict()

    @staticmethod
    def _encode(tokenizer, text):
        """Tokenize `text` into PyTorch tensors padded/truncated to exactly 128 tokens."""
        return tokenizer(
            text,
            return_tensors='pt',
            max_length=128,
            truncation=True,
            padding="max_length"
        )

    def __getitem__(self, idx):
        sample = self.data.iloc[idx]
        hate_text = sample["hatespeech"]

        encoded_hate = self._encode(self.tokenizer, hate_text)
        encoded_reply = self._encode(self.bart_tokenizer, sample["counterspeech"])
        label = self.categories[sample["csType"].lower()]

        # squeeze(0) drops the leading axis of size 1 that the tokenizer adds.
        return {
            'input_ids': encoded_hate['input_ids'].squeeze(0),
            'attention_mask': encoded_hate['attention_mask'].squeeze(0),
            'counter_speech': encoded_reply['input_ids'].squeeze(0),
            'intent_id': torch.tensor(label, dtype=torch.long),
            'raw_text': hate_text,
            'all_intents': self.intent_map[hate_text],
        }

    def __len__(self):
        return len(self.data)


def custom_collate_fn(batch):
    """Merge a list of dataset items into one batch dict.

    Tensor fields are stacked along a new leading axis; `all_intents` and the
    raw text (returned under the key ``raw_inputs``) stay plain Python lists
    because their entries are not tensors.
    """
    def stacked(field):
        return torch.stack([sample[field] for sample in batch])

    def listed(field):
        return [sample[field] for sample in batch]

    return {
        'input_ids': stacked('input_ids'),
        'attention_mask': stacked('attention_mask'),
        'counter_speech': stacked('counter_speech'),
        'intent_id': stacked('intent_id'),
        'all_intents': listed('all_intents'),
        'raw_inputs': listed('raw_text'),
    }



In [None]:
class FeatureEncoder(torch.nn.Module):
    """hateBERT backbone followed by one Linear+ReLU projection head per intent.

    Args:
        input_dim: accepted for interface compatibility; not used by this class.
        hidden_dim: input width of every head (must match the backbone's hidden size).
        next_input: output width of every head.
    """

    def __init__(self, input_dim, hidden_dim, next_input):
        super().__init__()
        self.model = AutoModel.from_pretrained('GroNLP/hateBERT')
        self.hidden_dim = hidden_dim
        self.output_size = next_input

        def build_head():
            # Every intent gets its own, independently initialised projection.
            return torch.nn.Sequential(
                torch.nn.Linear(self.hidden_dim, self.output_size),
                torch.nn.ReLU(),
            )

        self.informative_head = build_head()
        self.questioning_head = build_head()
        self.denouncing_head = build_head()
        self.positive_head = build_head()

    def forward(self, input_ids, attention_mask):
        """Return the four intent embeddings followed by the first-token hidden state.

        The result is the 5-tuple
        ``(informative, questioning, denouncing, positive, hate_speech_h)``.
        """
        backbone_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the sentence representation.
        first_token_state = backbone_out.last_hidden_state[:, 0, :]

        heads = (
            self.informative_head,
            self.questioning_head,
            self.denouncing_head,
            self.positive_head,
        )
        intent_embeddings = tuple(head(first_token_state) for head in heads)
        return (*intent_embeddings, first_token_state)

In [None]:
class CrossAttentionFusion(torch.nn.Module):
    """Fuse a hate-speech embedding with an intent embedding.

    The hate-speech vector provides the query, the intent vector provides key
    and value of a multi-head attention step. The result goes through a
    residual + LayerNorm block, a feed-forward residual block and a final
    linear projection to `output_dim`.

    NOTE(review): key and value contain exactly one position per sample, so the
    softmax runs over a single score and every attention weight equals 1. The
    attended context is therefore just ``v_proj(intent_e)`` and the query
    (hence `hate_speech_h`) does not influence the output. The computation is
    kept as is so that existing checkpoints behave identically.

    Args:
        hate_speech_dim: width of the hate-speech embedding.
        intent_dim: width of the intent embedding; must be divisible by `num_heads`.
        output_dim: width of the returned vector.
        num_heads: number of attention heads.

    Raises:
        ValueError: if `intent_dim` cannot be split evenly across `num_heads`.
    """

    def __init__(self, hate_speech_dim, intent_dim, output_dim, num_heads=4):
        super(CrossAttentionFusion, self).__init__()
        # Fail early with a clear message instead of a shape error inside forward().
        if num_heads <= 0 or intent_dim % num_heads != 0:
            raise ValueError(
                f"intent_dim ({intent_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = intent_dim // num_heads
        self.scale = self.head_dim ** -0.5

        # Multi-head projection layers
        self.q_proj = torch.nn.Linear(hate_speech_dim, intent_dim)
        self.k_proj = torch.nn.Linear(intent_dim, intent_dim)
        self.v_proj = torch.nn.Linear(intent_dim, intent_dim)
        self.out_proj = torch.nn.Linear(intent_dim, output_dim)

        # Layer normalization and feedforward
        self.layer_norm1 = torch.nn.LayerNorm(intent_dim)
        self.layer_norm2 = torch.nn.LayerNorm(intent_dim)
        self.ffn = torch.nn.Sequential(
            torch.nn.Linear(intent_dim, intent_dim * 4),
            torch.nn.ReLU(),
            torch.nn.Linear(intent_dim * 4, intent_dim)
        )

    def forward(self, hate_speech_h, intent_e):
        """Return the fused representation.

        Args:
            hate_speech_h: ``[hate_speech_dim]`` or ``[batch, hate_speech_dim]``.
            intent_e: ``[intent_dim]`` or ``[batch, intent_dim]``.

        Returns:
            ``[output_dim]`` when an unbatched (1-D) input was given, otherwise
            ``[batch, output_dim]`` - including for a real batch of size 1.
        """
        # Remember whether a batch axis is added here; checking the batch size
        # afterwards would also strip the axis from a genuine batch of one.
        added_batch_dim = hate_speech_h.dim() == 1 or intent_e.dim() == 1
        if hate_speech_h.dim() == 1:
            hate_speech_h = hate_speech_h.unsqueeze(0)
        if intent_e.dim() == 1:
            intent_e = intent_e.unsqueeze(0)

        batch_size = hate_speech_h.shape[0]

        # Query from the hate speech, key/value from the intent embedding.
        q = self.q_proj(hate_speech_h)  # [batch_size, intent_dim]
        k = self.k_proj(intent_e)       # [batch_size, intent_dim]
        v = self.v_proj(intent_e)       # [batch_size, intent_dim]

        # Split into heads and add a sequence axis of length 1:
        # [batch_size, num_heads, 1, head_dim]
        q = q.view(batch_size, self.num_heads, self.head_dim).unsqueeze(2)
        k = k.view(batch_size, self.num_heads, self.head_dim).unsqueeze(2)
        v = v.view(batch_size, self.num_heads, self.head_dim).unsqueeze(2)

        # Scaled dot-product attention; scores are [batch_size, num_heads, 1, 1].
        attention_scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)

        # Weighted values, heads merged back: [batch_size, intent_dim]
        context = torch.matmul(attention_weights, v)
        context = context.squeeze(2).view(batch_size, -1)

        # First residual block
        norm_context = self.layer_norm1(context + intent_e)

        # Feed-forward residual block
        ffn_output = self.ffn(norm_context)
        final_output = self.layer_norm2(norm_context + ffn_output)

        # Project to output dimension
        output = self.out_proj(final_output)

        # Undo the batch axis only if this method added it.
        if added_batch_dim:
            output = output.squeeze(0)

        return output

In [None]:
class CounterSpeechNetwork(torch.nn.Module):
    """Intent-conditioned counterspeech generator.

    A shared `FeatureEncoder` embeds the hate speech. For each of the four
    intents there is a dedicated `CrossAttentionFusion` that turns the
    embedding into a single encoder state, and a dedicated BART model that
    decodes from that state.

    Args:
        input_dim: forwarded to `FeatureEncoder`.
        hidden_dim: width of the hate-speech embedding.
        encoder_output: width of the per-intent embeddings.
        max_length: `max_length` passed to `generate` at inference time.
    """

    # Position in this tuple is the integer intent id (0..3). Sub-modules are
    # looked up by name as `<intent>_fusion` / `<intent>_decoder`; they stay
    # individual attributes (not a ModuleList) so that parameter names in a
    # saved state dict do not change.
    INTENT_NAMES = ('informative', 'questioning', 'denouncing', 'positive')

    def __init__(self, input_dim, hidden_dim, encoder_output, max_length):
        super(CounterSpeechNetwork, self).__init__()

        self.feature_encoder = FeatureEncoder(input_dim, hidden_dim, encoder_output)

        self.informative_decoder = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        self.questioning_decoder = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        self.denouncing_decoder = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        self.positive_decoder = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

        bart_dim = self.informative_decoder.config.d_model
        self.informative_fusion = CrossAttentionFusion(hidden_dim, encoder_output, bart_dim)
        self.questioning_fusion = CrossAttentionFusion(hidden_dim, encoder_output, bart_dim)
        self.denouncing_fusion = CrossAttentionFusion(hidden_dim, encoder_output, bart_dim)
        self.positive_fusion = CrossAttentionFusion(hidden_dim, encoder_output, bart_dim)

        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
        self.max_length = max_length

    def _intent_index(self, raw_intent):
        """Convert one intent id (int or 0-dim tensor) to a validated index."""
        intent_idx = int(raw_intent)
        if not 0 <= intent_idx < len(self.INTENT_NAMES):
            raise ValueError(f"Invalid intent_id: {raw_intent}")
        return intent_idx

    def _decoder(self, intent_idx):
        """Return the BART model that belongs to `intent_idx`."""
        return getattr(self, f"{self.INTENT_NAMES[intent_idx]}_decoder")

    def _encoder_state(self, intent_idx, hate_speech_h, intent_e):
        """Fuse one sample for one intent and wrap it as a `[1, 1, d_model]` encoder output."""
        fusion = getattr(self, f"{self.INTENT_NAMES[intent_idx]}_fusion")
        fused = fusion(hate_speech_h, intent_e)
        return BaseModelOutput(last_hidden_state=fused.reshape(1, 1, -1))

    def forward(self, input_ids, attention_mask, intent_id, counter_speech=None):
        """Compute the training loss or generate counterspeech, sample by sample.

        Args:
            input_ids, attention_mask: tokenized hate speech for the feature encoder.
            intent_id: one intent id (0..3) per sample.
            counter_speech: optional target token ids, one row per sample.

        Returns:
            ``(None, loss)`` when `counter_speech` is given, where `loss` is the
            mean of the per-sample decoder losses; otherwise
            ``(decoded_texts, None)`` with one generated string per sample.

        Raises:
            ValueError: if an intent id is outside 0..3.
        """
        *intent_embeddings, hate_speech_h = self.feature_encoder(input_ids, attention_mask)
        batch_size = input_ids.size(0)

        # Validate every id before running any decoder.
        intent_indices = [self._intent_index(intent_id[i]) for i in range(batch_size)]

        if counter_speech is not None:
            # NOTE(review): labels are handed to BART unchanged. If they contain
            # padding (the dataset in this notebook pads to a fixed length) those
            # positions count toward the loss unless they are replaced by -100
            # upstream - confirm this is intended.
            losses = []
            for i, idx in enumerate(intent_indices):
                output = self._decoder(idx)(
                    encoder_outputs=self._encoder_state(idx, hate_speech_h[i], intent_embeddings[idx][i]),
                    labels=counter_speech[i].unsqueeze(0),
                )
                losses.append(output.loss)
            avg_loss = sum(losses) / len(losses)  # Average loss across the batch
            return None, avg_loss  # No decoded text during training

        decoded_texts = []
        for i, idx in enumerate(intent_indices):
            output = self._decoder(idx).generate(
                encoder_outputs=self._encoder_state(idx, hate_speech_h[i], intent_embeddings[idx][i]),
                max_length=self.max_length,
                num_beams=4,
                early_stopping=True,
            )
            decoded_texts.append(self.tokenizer.decode(output[0], skip_special_tokens=True))
        return decoded_texts, None  # Decoded text during inference, no loss

    def judge_responses(self, input_ids, attention_mask, counter_speech=None):
        """Run every intent's decoder on every sample.

        Each decoder is called with ``labels=counter_speech[i]``, so
        `counter_speech` is required even though the signature keeps its
        historical ``None`` default.

        Returns:
            dict mapping each intent name to a list with one decoder output per
            sample (each exposing `.loss` and `.logits`).

        Raises:
            ValueError: if `counter_speech` is None.
        """
        if counter_speech is None:
            raise ValueError("judge_responses requires counter_speech (target token ids per sample)")

        *intent_embeddings, hate_speech_h = self.feature_encoder(input_ids, attention_mask)

        batch_size = input_ids.size(0)
        output_dict = {name: [] for name in self.INTENT_NAMES}

        for i in range(batch_size):
            labels = counter_speech[i].unsqueeze(0)
            for idx, name in enumerate(self.INTENT_NAMES):
                output_dict[name].append(
                    self._decoder(idx)(
                        encoder_outputs=self._encoder_state(idx, hate_speech_h[i], intent_embeddings[idx][i]),
                        labels=labels,
                    )
                )

        return output_dict

In [None]:
# Evaluation data. Shuffling is disabled: the metrics do not depend on sample
# order, and a fixed order keeps the collected predictions aligned with the
# rows of the test CSV and reproducible between runs.
test_dataset = DialoGPTDataset(testing_data)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
def _parse_judge_score(score_text):
    """Return the first number found in `score_text`, clamped to [0, 10]; 0.0 if there is none."""
    import re

    # Also matches numbers glued to punctuation such as "7." or "7/10".
    match = re.search(r"[0-9]+(?:\.[0-9]+)?", score_text)
    if match is None:
        return 0.0  # Fallback score if the judge LM produced no number
    return max(0.0, min(float(match.group()), 10.0))


def evaluate_with_judge_model(model, test_dataloader, device):
    """Estimate intent accuracy by letting a judge LM rate each intent's response.

    For every sample the four per-intent decoder outputs from
    `model.judge_responses` are decoded (argmax over the logits), each is rated
    by DialoGPT-small on a 1-10 scale, and the best-rated intent counts as
    correct when it is one of the sample's gold intents.

    Args:
        model: network exposing `judge_responses` and `tokenizer`.
        test_dataloader: yields batches with `input_ids`, `attention_mask`,
            `counter_speech`, `raw_inputs` and `all_intents`.
        device: device on which both models are run.

    Returns:
        Fraction of samples whose best-rated intent is among the gold intents
        (0 when the dataloader is empty).
    """
    categories = {
        'informative': 0,
        'questioning': 1,
        'denouncing': 2,
        'positive': 3
    }

    model.to(device)
    model.eval()

    judge_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
    judge_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small").to(device)
    judge_model.eval()
    print("Judge model ready!")

    total_samples = 0
    correct_predictions = 0

    test_loop = tqdm(test_dataloader, desc="Test Evaluation", leave=True)
    with torch.no_grad():
        for batch in test_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            counter_speech = batch['counter_speech'].to(device)
            raw_inputs = batch['raw_inputs']
            all_intents = batch['all_intents']  # list of gold intents per sample (as int indices)

            outputs = model.judge_responses(input_ids, attention_mask, counter_speech)

            for i in range(len(input_ids)):
                intent_scores = {}

                for intent in categories:
                    response = model.tokenizer.decode(
                        torch.argmax(outputs[intent][i].logits, dim=-1)[0],
                        skip_special_tokens=True
                    )

                    # Prompting for a rating
                    prompt = (
                        f"Hate speech: {raw_inputs[i]}\n"
                        f"Proposed counterspeech (intent: {intent}): {response}\n\n"
                        f"On a scale of 1 to 10, how appropriate and effective is this counterspeech in response to the hate speech? Just respond with a number."
                    )

                    judge_input = judge_tokenizer.encode(prompt, return_tensors='pt').to(device)
                    output_ids = judge_model.generate(judge_input, max_new_tokens=10, pad_token_id=judge_tokenizer.eos_token_id)

                    # A causal LM returns prompt + continuation. Decode only the
                    # continuation, otherwise the "1" / "10" inside the prompt
                    # itself would be picked up as the score.
                    prompt_length = judge_input.shape[-1]
                    score_text = judge_tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

                    intent_scores[intent] = _parse_judge_score(score_text)

                # Pick best scoring intent (ties resolve to the earliest intent in `categories`)
                best_intent = max(intent_scores, key=intent_scores.get)
                best_intent_idx = categories[best_intent]

                if best_intent_idx in all_intents[i]:
                    correct_predictions += 1
                total_samples += 1

            test_loop.set_postfix({'accuracy': correct_predictions / total_samples if total_samples else 0})

    final_accuracy = correct_predictions / total_samples if total_samples else 0
    return final_accuracy


In [None]:
# Download required data for METEOR
nltk.download('wordnet')
nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model and restore the trained weights
model = CounterSpeechNetwork(input_dim=128, hidden_dim=768, encoder_output=256, max_length=50)
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load("HateBERT_cross_attention_final.pth", map_location=device))
model.to(device)
model.eval()


def _token_count_vector(text, tokenizer, vocab_size):
    """Bag-of-tokens vector of shape (1, vocab_size): how often each token id occurs in `text`."""
    token_ids = tokenizer(text, add_special_tokens=False, truncation=True)['input_ids']
    # Explicit integer dtype so that an empty text yields an all-zero vector.
    counts = np.bincount(np.asarray(token_ids, dtype=np.int64), minlength=vocab_size)
    return counts.astype(np.float64).reshape(1, -1)


# Evaluation loop
test_predictions = []
test_references = []

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
meteor_scores = []
cosine_sims = []

vocab_size = len(model.tokenizer)
test_loop = tqdm(test_dataloader, desc="Test Evaluation", leave=True)

with torch.no_grad():
    for batch in test_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        counter_speech = batch['counter_speech'].to(device)  # Reference texts
        intent_ids = batch['intent_id'].to(device)

        # Without targets the model returns (generated texts, None).
        pred_texts, _ = model(input_ids, attention_mask, intent_ids)

        # Decode the references back to plain text
        ref_texts = [model.tokenizer.decode(cs, skip_special_tokens=True) for cs in counter_speech]

        test_predictions.extend(pred_texts)
        test_references.extend(ref_texts)

        # Compute METEOR and Cosine Similarity
        for pred, ref in zip(pred_texts, ref_texts):
            score = meteor_score([ref.split()], pred.split())
            meteor_scores.append(score)

            # Cosine similarity between token-count (TF) vectors. Summing the
            # token ids into one number per text and normalising it, as done
            # previously, always produced a similarity of exactly 1.0.
            pred_vec = _token_count_vector(pred, model.tokenizer, vocab_size)
            ref_vec = _token_count_vector(ref, model.tokenizer, vocab_size)
            cos_sim = cosine_similarity(pred_vec, ref_vec)[0][0]
            cosine_sims.append(cos_sim)


In [None]:
def _average(values):
    """Arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


# BERTScore over the full set of predictions
P, R, F1 = bert_score(test_predictions, test_references, lang="en", verbose=True)

# ROUGE F-measures, collected per variant
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
for pred, ref in zip(test_predictions, test_references):
    scores = scorer.score(ref, pred)
    for variant, collected in rouge_scores.items():
        collected.append(scores[variant].fmeasure)

avg_rouge1 = _average(rouge_scores['rouge1'])
avg_rouge2 = _average(rouge_scores['rouge2'])
avg_rougeL = _average(rouge_scores['rougeL'])
avg_meteor = _average(meteor_scores)
avg_cosine = _average(cosine_sims)

# Intent (category) accuracy as rated by the judge model
intent_accuracy = evaluate_with_judge_model(model, test_dataloader, device)

# Report
print(f"\n=== Evaluation Metrics ===")
print(f"Total Predictions: {len(test_predictions)}")
print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")
print(f"ROUGE - Rouge-1: {avg_rouge1:.4f}, Rouge-2: {avg_rouge2:.4f}, Rouge-L: {avg_rougeL:.4f}")
print(f"METEOR: {avg_meteor:.4f}")
print(f"Cosine Similarity: {avg_cosine:.4f}")
print(f"Category Accuracy (Intent): {intent_accuracy:.4f}")

In [None]:
# Save test predictions to a text file, one prediction per line
model_name = "HateBERT-cross-attention"
txt_filename = f"predictions_{model_name}.txt"
with open(txt_filename, "w", encoding="utf-8") as f:
    for pred in test_predictions:
        # Join embedded line breaks with a space so that every prediction
        # occupies exactly one line and line N maps to prediction N.
        f.write(" ".join(pred.strip().splitlines()) + "\n")

print(f"📄 Saved predictions to {txt_filename}")
