In [1]:
import os
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use only GPU 0


In [2]:
import transformers
import torch
import pandas as pd
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.optim import Adam
from accelerate import Accelerator
import wandb
import gc
import torch
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score

In [3]:
from evaluate import load
from rouge_score import rouge_scorer
from bert_score import score as bert_score

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [4]:
accelerator = Accelerator()

In [5]:
#Training data, validation data and testing data from IntentKonen dataset

training_data = pd.read_csv('../../Dataset/train.csv')
testing_data = pd.read_csv('../../Dataset/test.csv')
validation_data = pd.read_csv('../../Dataset/validation.csv')

In [6]:
class DialoGPTDataset(Dataset):
    def __init__(self, data):
        self.data = data

        #Loading the small version of DialoGPT tokenizer so that it is easier to run
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small", token='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')

        #Input attributes from the dataset
        self.input_attributes = ['hatespeech', 'csType']

        #Output attributes from the dataset
        self.output_attributes = ['counterspeech']
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        #Format used for framing the input text for tokenization
        input_text = f'Hate: {row["hatespeech"]} Type: {row["csType"]}'
        counter_speech = row["counterspeech"]

        input_ids = self.tokenizer.encode(input_text, return_tensors='pt', max_length=128, truncation=True, padding="max_length")
        counter_speech_ids = self.tokenizer.encode(counter_speech, return_tensors='pt', max_length=128, truncation=True, padding="max_length")

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
            
        }

    def __len__(self):
        return len(self.data)

In [7]:
train_dataset = DialoGPTDataset(training_data)
test_dataset = DialoGPTDataset(testing_data)
validation_dataset = DialoGPTDataset(validation_data)

print(len(train_dataset))
print(len(test_dataset))
print(len(validation_dataset))

9532
2971
1470


In [8]:
model = AutoModelForCausalLM.from_pretrained("trainer_final_checkpoint")
tokenizer = AutoTokenizer.from_pretrained("trainer_final_checkpoint")

In [9]:
#This cell contains the actual training arguments and the Trainer object that was used to train the model and this was the same arguments used in the paper

# training_args = TrainingArguments(
#     num_train_epochs=20,
#     per_device_train_batch_size=32,
#     learning_rate=8e-5,
#     weight_decay=0.03,
#     save_strategy="epoch",
#     logging_dir='./logs',
#     logging_steps=10,
#     output_dir='./dialogpt_logs',
# )

# trainer = Trainer(
#     model=dialogpt_model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=validation_dataset,
#     optimizers=(optimizer, None)
# )

In [10]:
#Place holder training argument just to load the model

training_args = TrainingArguments(
    output_dir='./trainer_checkpoints',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    num_train_epochs=3,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)


  trainer = Trainer(


In [11]:
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar
import gc
import torch

# Create DataLoader with smaller batch size
batch_size = 8
dataloader = DataLoader(test_dataset, batch_size=batch_size)

all_predictions = []
all_references = []

model.eval()

# Get the total number of sentences
total_sentences = len(test_dataset)

# Batch inference with memory clearing and progress tracking
with torch.no_grad():
    completed_sentences = 0
    with tqdm(total=total_sentences, desc="Processing Sentences") as pbar:
        for batch in dataloader:
            # Move batch to GPU
            batch = {k: v.to("cuda") for k, v in batch.items()}

            # Perform inference
            outputs = model(**batch)

            # Move tensors back to CPU immediately to free GPU memory
            all_predictions.append(outputs.logits.cpu())
            all_references.append(batch['labels'].cpu())

            # Update progress bar
            batch_size = batch['input_ids'].size(0)
            completed_sentences += batch_size
            pbar.update(batch_size)

            # Force memory release
            del batch
            del outputs
            torch.cuda.empty_cache()
            gc.collect()

# Print completion message
print(f"Inference completed for {completed_sentences} sentences.")




  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Processing Sentences: 100%|██████████████████████████████████████████████████████████| 2971/2971 [01:40<00:00, 29.50it/s]

Inference completed for 2971 sentences.





In [16]:
# Concatenate predictions and references
predictions = torch.cat(all_predictions)
references = torch.cat(all_references)

predictions = predictions.squeeze(1)  # Shape: (total_samples, 128)
references = references.squeeze(1)

predictions = predictions.argmax(dim=-1)

# Decode predictions and references
decoded_predictions = test_dataset.tokenizer.batch_decode(predictions, skip_special_tokens=True)
decoded_references = test_dataset.tokenizer.batch_decode(references, skip_special_tokens=True)

In [18]:
rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(decoded_references, decoded_predictions)]
rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

print(f"ROUGE-1: {rouge1:.4f}")
print(f"ROUGE-2: {rouge2:.4f}")
print(f"ROUGE-L: {rougeL:.4f}")

# --------------------------------
# Calculate METEOR score
# --------------------------------
meteor_scores = []
for ref, pred in zip(decoded_references, decoded_predictions):
    # Tokenize both reference and hypothesis (prediction)
    tokenized_ref = ref.split()
    tokenized_pred = pred.split()
    meteor_scores.append(meteor_score([tokenized_ref], tokenized_pred))
avg_meteor = np.mean(meteor_scores)
print(f"METEOR: {avg_meteor:.4f}")
# --------------------------------
# Calculate BERTScore
# --------------------------------
P, R, F1 = bert_score(cands=decoded_predictions, refs=decoded_references, lang='en', verbose=True)
print(f"BERTScore - P: {P.mean():.4f}, R: {R.mean():.4f}, F1: {F1.mean():.4f}")

ROUGE-1: 0.1297
ROUGE-2: 0.0030
ROUGE-L: 0.1045
METEOR: 0.0404


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/75 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 9.19 seconds, 323.26 sentences/sec
BERTScore - P: 0.7914, R: 0.8076, F1: 0.7993


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
judge_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
judge_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small").to(device)
judge_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [20]:
fixed_intents = ["informative", "questioning", "denouncing", "positive"]

testing_data = pd.read_csv('../../Dataset/test.csv')

gold_label_map = {}
for _, row in testing_data.iterrows():
    hs = row['hatespeech']
    intent = row['csType'].lower()
    if hs not in gold_label_map:
        gold_label_map[hs] = set()
    gold_label_map[hs].add(intent)

hs_intent_pairs = []
all_inputs = []

for hs in gold_label_map:
    for intent in fixed_intents:
        prompt = f"Hate: {hs} Type: {intent}"
        hs_intent_pairs.append((hs, intent))
        all_inputs.append(prompt)

generated_cs = {}
batch_size = 8
dataloader = DataLoader(all_inputs, batch_size=batch_size)

model.to(device)
model.eval()
tokenizer.padding_side = 'left'
generated_texts = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Generating Counterspeech"):
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        generated_texts.extend(decoded)
        torch.cuda.empty_cache()
        gc.collect()

# Organize generated responses by hate speech
for (hs, intent), response in zip(hs_intent_pairs, generated_texts):
    if hs not in generated_cs:
        generated_cs[hs] = {}
    generated_cs[hs][intent] = response

# Evaluate using Judge LLM
correct = 0
total = 0

print("\nEvaluating with Judge LLM...\n")
for hs, intent_responses in tqdm(generated_cs.items(), desc="Scoring with Judge LLM"):
    intent_scores = {}
    gold_labels = gold_label_map.get(hs, [])

    for intent, cs in intent_responses.items():
        prompt = (
            f"Hate speech: {hs}\n"
            f"Proposed counterspeech (intent: {intent}): {cs}\n\n"
            f"On a scale of 1 to 10, how appropriate and effective is this counterspeech in response to the hate speech? Just respond with a number."
        )

        input_ids = judge_tokenizer.encode(prompt, return_tensors="pt").to(device)
        output_ids = judge_model.generate(input_ids, max_new_tokens=10, pad_token_id=judge_tokenizer.eos_token_id)
        score_text = judge_tokenizer.decode(output_ids[0], skip_special_tokens=True)

        try:
            score = next((float(s) for s in score_text.split() if s.replace('.', '', 1).isdigit()), 0)
            score = max(0, min(score, 10))
        except:
            score = 0

        intent_scores[intent] = score

    # Select best intent by highest score
    best_intent = max(intent_scores, key=intent_scores.get)

    if best_intent in gold_labels:
        correct += 1
    total += 1

# Final Accuracy
accuracy = correct / total if total else 0
print(f"\nFinal Category Accuracy using Judge LLM: {accuracy:.4f}")

Generating Counterspeech: 100%|████████████████████████████████████████████████████████| 549/549 [02:59<00:00,  3.06it/s]



Evaluating with Judge LLM...



Scoring with Judge LLM:   0%|                                                                   | 0/1097 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Scoring with Judge LLM: 100%|████████████████████████████████████████████████████████| 1097/1097 [00:42<00:00, 26.04it/s]


Final Category Accuracy using Judge LLM: 0.6809





In [23]:
# Metrics:
# Final Category Accuracy using Judge LLM: 0.6809
# ROUGE-1: 0.1297
# ROUGE-2: 0.0030
# ROUGE-L: 0.1045
# METEOR: 0.0404
# BERTScore - P: 0.7914, R: 0.8076, F1: 0.7993