In [1]:
import transformers
import torch
import pandas as pd
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.optim import Adam
from accelerate import Accelerator
import wandb

In [2]:
accelerator = Accelerator()

In [3]:
print(torch.__version__)

2.5.1


In [4]:
!wandb login --relogin --verify ''

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/dhruv/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnoel22338[0m ([33mnlp_project_team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
training_data = pd.read_csv('../Dataset/train.csv')
testing_data = pd.read_csv('../Dataset/test.csv')
validation_data = pd.read_csv('../Dataset/validation.csv')

In [6]:
training_data.head()

Unnamed: 0,hatespeech,csType,counterspeech,Suggest,Relevance,Aggressive,Complexity,Comments,source,claim,...,hatespeechTarget,powerDynamics,prompt_offensiveness,prompt_target_group,prompt_speaker_intent,prompt_power_dynamics,prompt_implication,prompt_emotional_reaction,prompt_cognitive_reaction,prompt_cs_generation
0,Maybe the UN could talk to those asian and afr...,Informative,The us is the second most polluting country in...,3.0,4.0,2.0,3.0,,Human,The UN should focus on Asian and African natio...,...,asian_people,racial,Analyze the offensiveness of the statement: Ma...,Identify the group of people that the speaker ...,Analyze the speaker's intention behind writing...,Explain the underlying power dynamics between ...,Explain the implied meaning underlying the off...,Describe how the target group might feel emoti...,Describe how the target group might react cogn...,Analyze the different aspects such as offensiv...
1,Maybe the UN could talk to those asian and afr...,Questioning,Doesn't everyone on the planet have a responsi...,,3.0,2.0,2.0,,Human,The UN should focus on Asian and African natio...,...,asian_people,racial,Analyze the offensiveness of the statement: Ma...,Identify the group of people that the speaker ...,Analyze the speaker's intention behind writing...,Explain the underlying power dynamics between ...,Explain the implied meaning underlying the off...,Describe how the target group might feel emoti...,Describe how the target group might react cogn...,Analyze the different aspects such as offensiv...
2,Maybe the UN could talk to those asian and afr...,Denouncing,The world would be a better place if people we...,1.0,1.0,2.0,2.0,,Human,The UN should focus on Asian and African natio...,...,asian_people,racial,Analyze the offensiveness of the statement: Ma...,Identify the group of people that the speaker ...,Analyze the speaker's intention behind writing...,Explain the underlying power dynamics between ...,Explain the implied meaning underlying the off...,Describe how the target group might feel emoti...,Describe how the target group might react cogn...,Analyze the different aspects such as offensiv...
3,Maybe the UN could talk to those asian and afr...,Positive,"You're right, ocean pollution is one of the ma...",3.0,4.0,2.0,2.0,,Human,The UN should focus on Asian and African natio...,...,asian_people,racial,Analyze the offensiveness of the statement: Ma...,Identify the group of people that the speaker ...,Analyze the speaker's intention behind writing...,Explain the underlying power dynamics between ...,Explain the implied meaning underlying the off...,Describe how the target group might feel emoti...,Describe how the target group might react cogn...,Analyze the different aspects such as offensiv...
4,Juice Jews are worse that nukes. Caption to a ...,Informative,Anti-semitism is a serious problem that we nee...,2.0,3.0,2.0,2.0,,Human,Jews are worse than nukes.,...,jews,religion,Analyze the offensiveness of the statement: Ju...,Identify the group of people that the speaker ...,Analyze the speaker's intention behind writing...,Explain the underlying power dynamics between ...,Explain the implied meaning underlying the off...,Describe how the target group might feel emoti...,Describe how the target group might react cogn...,Analyze the different aspects such as offensiv...


In [7]:
columns = training_data.columns
print(columns)

Index(['hatespeech', 'csType', 'counterspeech', 'Suggest', 'Relevance',
       'Aggressive', 'Complexity', 'Comments', 'source', 'claim',
       'centralTopic', 'speakerIntent', 'targetGroup', 'relevantPowerDynamics',
       'hatespeechImplication', 'targetGroupEmotionalReaction',
       'targetGroupCognitiveReaction', 'hatespeechOffensiveness', 'id',
       'is_high_quality', 'hs_id', 'hatespeechTarget', 'powerDynamics',
       'prompt_offensiveness', 'prompt_target_group', 'prompt_speaker_intent',
       'prompt_power_dynamics', 'prompt_implication',
       'prompt_emotional_reaction', 'prompt_cognitive_reaction',
       'prompt_cs_generation'],
      dtype='object')


In [8]:
class DialoGPTDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium", token='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')
        self.input_attributes = ['hatespeech', 'csType']
        self.output_attributes = ['counterspeech']
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        input_text = f'Hate: {row["hatespeech"]} Type: {row["csType"]}'
        counter_speech = row["counterspeech"]

        input_ids = self.tokenizer.encode(input_text, return_tensors='pt', max_length=128, truncation=True, padding="max_length")
        counter_speech_ids = self.tokenizer.encode(counter_speech, return_tensors='pt', max_length=128, truncation=True, padding="max_length")

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
        }

    def __len__(self):
        return len(self.data)

In [9]:
train_dataset = DialoGPTDataset(training_data)
test_dataset = DialoGPTDataset(testing_data)
validation_dataset = DialoGPTDataset(validation_data)

print(len(train_dataset))
print(len(test_dataset))
print(len(validation_dataset))

print(train_dataset[0])

9532
2971
1470
{'input_ids': tensor([[   39,   378,    25,  6674,   262,  4725,   714,  1561,   284,   883,
           355,   666,   290,  6580, 37189,  7027,  4497,   329,  4101,    10,
           286,   262, 12231,   287,   262, 23744,  2427,   286, 23630,   319,
           428, 20041,   546,  4258,  1487,    13,  5994,    25, 45255,   876,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 

  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)


In [10]:
dialogpt_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium", use_auth_token='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')
optimizer = Adam(dialogpt_model.parameters(), lr=1e-5)



In [11]:
training_args = TrainingArguments(
    num_train_epochs=10,
    per_device_train_batch_size=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    output_dir='./dialogpt_logs'
)

In [12]:
trainer = Trainer(
    model=dialogpt_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    optimizers=(optimizer, None)
)

In [None]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnoel22338[0m ([33mnlp_project_team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.4024
20,3.0556
30,3.1703
40,2.7622
50,2.5903
60,2.0962
70,2.6438
80,3.0425
90,3.0666
100,3.2404


  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)
  'input_ids': torch.tensor(input_ids, dtype=torch.long),
  'labels': torch.tensor(counter_speech_ids, dtype=torch.long)


In [None]:
results = trainer.evaluate(test_dataset)
print(results)