In [None]:
import transformers
import torch
import pandas as pd
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.optim import Adam

In [None]:
# Read the three CSV splits (train, test, validation — in that order) into
# DataFrames with a single unpacking assignment.
training_data, testing_data, validation_data = map(
    pd.read_csv,
    (
        '../Dataset/train.csv',
        '../Dataset/test.csv',
        '../Dataset/validation.csv',
    ),
)

In [None]:
# Preview the first five rows of the training split (5 is the pandas default).
training_data.head(n=5)

In [None]:
# Keep the training frame's column labels in `columns` and print them so the
# available fields are visible in the notebook output.
columns = training_data.columns
print(columns)

In [None]:
class DialoGPTDataset(Dataset):
    """Map-style dataset that renders each row as one causal-LM training example.

    Every row becomes a single token sequence

        ``Hate: <hatespeech> Type: <csType>`` <eos> ``<counterspeech>`` <eos>

    padded on the right to ``max_length``. A causal LM predicts each token
    from the tokens before it, so ``labels`` must be aligned with
    ``input_ids``: prompt and padding positions are set to -100 (the index
    the loss ignores) and only the counterspeech tokens are scored.
    """

    def __init__(self, data, max_length=1024):
        """Store the frame and load the DialoGPT tokenizer.

        Args:
            data: pandas DataFrame providing the 'hatespeech', 'csType' and
                'counterspeech' columns.
            max_length: fixed length of every returned tensor. The default
                of 1024 matches the value the notebook previously used.

        Raises:
            ValueError: if ``max_length`` cannot hold the two EOS separators.
        """
        if max_length < 2:
            raise ValueError("max_length must be at least 2 to hold both EOS separators")
        self.data = data
        self.max_length = max_length
        # Credentials must never live in the notebook: read the access token
        # from the environment instead. With HF_TOKEN unset this passes None,
        # i.e. anonymous access.
        self.tokenizer = AutoTokenizer.from_pretrained(
            "microsoft/DialoGPT-medium",
            use_auth_token=os.environ.get("HF_TOKEN"),
        )
        self.input_attributes = ['hatespeech', 'csType']
        self.output_attributes = ['counterspeech']
        # The tokenizer ships without a pad token; reuse EOS for padding.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def __getitem__(self, idx):
        """Return 1-D ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``."""
        row = self.data.iloc[idx]
        input_text = f'Hate: {row["hatespeech"]} Type: {row["csType"]}'
        counter_speech = row["counterspeech"]

        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id

        prompt_ids = list(self.tokenizer.encode(input_text))
        response_ids = list(self.tokenizer.encode(counter_speech))

        # Reserve two slots for the EOS separators, then truncate so the pair
        # fits: the prompt keeps whatever the response leaves free, but never
        # less than half of the window.
        room = self.max_length - 2
        if len(prompt_ids) + len(response_ids) > room:
            prompt_ids = prompt_ids[:max(room - len(response_ids), room // 2)]
            response_ids = response_ids[:room - len(prompt_ids)]

        prompt_ids = prompt_ids + [eos_id]
        response_ids = response_ids + [eos_id]

        sequence = prompt_ids + response_ids
        pad_len = self.max_length - len(sequence)

        input_ids = sequence + [pad_id] * pad_len
        # Pad and EOS share an id, so the mask is the only way to tell real
        # tokens from padding.
        attention_mask = [1] * len(sequence) + [0] * pad_len
        # -100 excludes prompt and padding positions from the loss.
        labels = [-100] * len(prompt_ids) + response_ids + [-100] * pad_len

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

In [None]:
# Wrap each split in a DialoGPTDataset (train, test, validation — in that order).
train_dataset, test_dataset, validation_dataset = (
    DialoGPTDataset(frame)
    for frame in (training_data, testing_data, validation_data)
)

# One size per line, in the same order as above.
print(*map(len, (train_dataset, test_dataset, validation_dataset)), sep='\n')

# Show one encoded example as a sanity check.
print(train_dataset[0])

In [None]:
# Load the pretrained DialoGPT weights. The access token is read from the
# HF_TOKEN environment variable rather than being written into the notebook;
# with the variable unset this passes None, i.e. anonymous access.
dialogpt_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/DialoGPT-medium",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Plain Adam over every model parameter.
optimizer = Adam(dialogpt_model.parameters(), lr=1e-5)

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoints written by save_strategy="epoch" go here. Older transformers
    # releases require this argument; the value is believed to match the
    # default of newer releases — TODO confirm against the installed version.
    output_dir='trainer_output',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

In [None]:
# Assemble the Trainer from the pieces built in the previous cells.
# NOTE(review): a ready-made optimizer is supplied, so `learning_rate` and
# `weight_decay` in `training_args` presumably do not reach it — confirm
# against the Trainer documentation. The None in the tuple leaves the
# learning-rate scheduler for Trainer to choose.
trainer = Trainer(
    args=training_args,
    model=dialogpt_model,
    optimizers=(optimizer, None),
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

In [None]:
# Run fine-tuning; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Score the model on the held-out test split and print the returned metrics.
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)