In [1]:
import os

# Configure NCCL and GPU visibility through environment variables. This cell
# runs ahead of the cell that imports torch, so the values are already set
# when the CUDA / NCCL libraries are loaded.
os.environ.update({
    "NCCL_DEBUG": "INFO",         # verbose NCCL logging
    "NCCL_P2P_DISABLE": "1",      # switch off the peer-to-peer transport
    "NCCL_IB_DISABLE": "1",       # switch off the InfiniBand transport
    "CUDA_VISIBLE_DEVICES": "0",  # Use only GPU 0
})


In [2]:
import transformers
import torch
import pandas as pd
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.optim import Adam
from accelerate import Accelerator
import wandb
import gc
import torch

In [3]:
from evaluate import load
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# METEOR scoring is currently switched off:
# meteor = load('meteor')

# ROUGE scorer (stemming enabled) used by the metrics cell.
ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(ROUGE_TYPES, use_stemmer=True)

In [4]:
# Create a Hugging Face Accelerate handle with its default settings.
# NOTE(review): `accelerator` is not referenced by any later cell visible in
# this notebook — confirm whether it is still needed.
accelerator = Accelerator()

In [5]:
# Read the train / test / validation splits (in that order) into DataFrames.
training_data, testing_data, validation_data = map(
    pd.read_csv,
    (
        '../Dataset/train.csv',
        '../Dataset/test.csv',
        '../Dataset/validation.csv',
    ),
)

In [6]:
class DialoGPTDataset(Dataset):
    """Torch dataset pairing a hate-speech prompt with its counterspeech text.

    Every row of ``data`` must provide the columns ``hatespeech``, ``csType``
    and ``counterspeech``. The prompt ``"Hate: <hatespeech> Type: <csType>"``
    is tokenized into ``input_ids`` and the counterspeech into ``labels``;
    both are truncated / padded to ``max_length`` tokens.

    Args:
        data: pandas DataFrame holding the rows (accessed with ``.iloc``).
        tokenizer: Optional ready-made tokenizer to reuse. When omitted, the
            ``microsoft/DialoGPT-small`` tokenizer is loaded. In both cases
            its ``pad_token`` is set to its ``eos_token``.
        max_length: Length every encoded sequence is truncated/padded to.

    NOTE(review): padding reuses the EOS token and no ``attention_mask`` is
    returned, so padded positions are not masked out for the model — confirm
    this matches how the checkpoint was trained.
    """

    def __init__(self, data, tokenizer=None, max_length=128):
        self.data = data
        if tokenizer is None:
            # SECURITY: never hard-code an access token in the notebook. It is
            # read from the HF_TOKEN environment variable instead (None is fine
            # for a public model). Any token that was ever committed to source
            # control should be revoked.
            tokenizer = AutoTokenizer.from_pretrained(
                "microsoft/DialoGPT-small",
                token=os.environ.get("HF_TOKEN"),
            )
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_attributes = ['hatespeech', 'csType']
        self.output_attributes = ['counterspeech']
        # The tokenizer needs a pad token for padding="max_length"; reuse EOS.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def _encode(self, text):
        """Tokenize ``text`` into a long tensor of ``max_length`` token ids."""
        token_ids = self.tokenizer.encode(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )
        # `encode(..., return_tensors='pt')` already yields a tensor; wrapping
        # it in torch.tensor() again copies it and raises a UserWarning, so
        # only the dtype is enforced here. The tensor's shape is left as is.
        return token_ids.to(torch.long)

    def __getitem__(self, idx):
        """Return ``{'input_ids', 'labels'}`` tensors for the row at ``idx``."""
        row = self.data.iloc[idx]
        input_text = f'Hate: {row["hatespeech"]} Type: {row["csType"]}'
        counter_speech = row["counterspeech"]

        return {
            'input_ids': self._encode(input_text),
            'labels': self._encode(counter_speech),
        }

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

In [7]:
# Wrap each DataFrame split in a DialoGPTDataset.
train_dataset = DialoGPTDataset(training_data)
test_dataset = DialoGPTDataset(testing_data)
validation_dataset = DialoGPTDataset(validation_data)

# Report the number of rows per split: train, test, validation.
for split_dataset in (train_dataset, test_dataset, validation_dataset):
    print(len(split_dataset))

9532
2971
1470


In [8]:
# The causal-LM weights and the tokenizer are both restored from the same
# saved checkpoint location.
CHECKPOINT_DIR = "trainer_final_checkpoint"

model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_DIR)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)

In [9]:
# Arguments for the Trainer wrapper: checkpoint/log directories, per-device
# batch sizes, epoch count and fp16 mixed precision.
training_args = TrainingArguments(
    output_dir='./trainer_checkpoints',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    num_train_epochs=3,
    fp16=True
)

# Create a new Trainer (without training).
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated and emits a FutureWarning.
# NOTE(review): `trainer` is not used by the later cells visible in this
# notebook — confirm whether this cell is still required.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    processing_class=tokenizer
)


  trainer = Trainer(


In [10]:
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar
import gc
import torch

# Pick the device explicitly and place the model on it, so this cell does not
# depend on an earlier cell having moved the model to the GPU as a side effect.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Create DataLoader with smaller batch size
batch_size = 8
dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Per-batch results, kept on the CPU: raw logits and the reference label ids.
# NOTE(review): every stored logits tensor keeps a full score vector for each
# token position, so this list can become very large; reducing with argmax per
# batch would need a matching change in the cell that consumes
# `all_predictions`.
all_predictions = []
all_references = []

# Get the total number of sentences
total_sentences = len(test_dataset)
completed_sentences = 0

# Batch inference with progress tracking (no gradients needed).
with torch.no_grad(), tqdm(total=total_sentences, desc="Processing Sentences") as pbar:
    for batch in dataloader:
        # Labels are only needed as references: keep them on the CPU and do
        # not hand them to the model, which avoids an unused loss computation.
        labels = batch['labels']
        model_inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}

        # Perform inference
        outputs = model(**model_inputs)

        # Move logits back to CPU immediately to free device memory
        all_predictions.append(outputs.logits.cpu())
        all_references.append(labels.cpu())

        # Update progress bar; a separate name keeps the configured
        # `batch_size` above from being overwritten by the last batch's size.
        sentences_in_batch = labels.size(0)
        completed_sentences += sentences_in_batch
        pbar.update(sentences_in_batch)

        # Drop references to device tensors before the next iteration
        del model_inputs
        del outputs

# Release cached memory once, after the loop, instead of on every batch.
gc.collect()
if device.type == "cuda":
    torch.cuda.empty_cache()

# Print completion message
print(f"Inference completed for {completed_sentences} sentences.")




  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Processing Sentences: 100%|█████████████████████████████████████████████████████████████████| 2971/2971 [01:44<00:00, 28.31it/s]

Inference completed for 2971 sentences.





In [12]:
# Reduce every batch of logits to token ids *before* concatenating. The result
# is identical to concatenating first and taking argmax afterwards, but it
# avoids building one tensor that holds all logits at once.
predictions = torch.cat([batch_logits.argmax(dim=-1) for batch_logits in all_predictions])
references = torch.cat(all_references)

# Drop the per-sample singleton dimension at index 1 (squeeze is a no-op if
# that dimension is not of size 1), leaving (total_samples, sequence_length).
predictions = predictions.squeeze(1)
references = references.squeeze(1)

# NOTE(review): these ids are the argmax of a single forward pass over the
# prompt tokens, not text produced by `model.generate()` — confirm that this
# is the evaluation that is intended before relying on the metrics.

# Decode predictions and references
decoded_predictions = test_dataset.tokenizer.batch_decode(predictions, skip_special_tokens=True)
decoded_references = test_dataset.tokenizer.batch_decode(references, skip_special_tokens=True)

In [14]:
# ROUGE: score every (reference, prediction) pair once, then report the mean
# F-measure of each variant.
rouge_scores = [
    scorer.score(reference, prediction)
    for reference, prediction in zip(decoded_references, decoded_predictions)
]


def _mean_fmeasure(rouge_key):
    """Average F-measure of one ROUGE variant over all scored pairs."""
    return np.mean([pair_scores[rouge_key].fmeasure for pair_scores in rouge_scores])


rouge1 = _mean_fmeasure('rouge1')
print(f"ROUGE-1: {rouge1:.4f}")
rouge2 = _mean_fmeasure('rouge2')
print(f"ROUGE-2: {rouge2:.4f}")
rougeL = _mean_fmeasure('rougeL')
print(f"ROUGE-L: {rougeL:.4f}")

# METEOR is currently disabled:
# meteor_score = meteor.compute(predictions=decoded_predictions, references=decoded_references)['meteor']
# print(f"METEOR: {meteor_score:.4f}")

# BERTScore: precision, recall and F1 between predictions and references.
P, R, F1 = bert_score(
    cands=decoded_predictions,
    refs=decoded_references,
    lang='en',
    verbose=True,
)

# Report the corpus-level means.
print(f"BERTScore - P: {P.mean():.4f}, R: {R.mean():.4f}, F1: {F1.mean():.4f}")

ROUGE-1: 0.1297
ROUGE-2: 0.0030
ROUGE-L: 0.1045


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/75 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 9.72 seconds, 305.56 sentences/sec
BERTScore - P: 0.7914, R: 0.8076, F1: 0.7993
