In [None]:
import json
import os
import random
import re

import numpy as np
import pandas as pd
import spacy
import torch
from datasets import Dataset, concatenate_datasets, load_dataset
from nltk import download
from nltk.corpus import stopwords, wordnet
from scipy.special import softmax
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          BertForSequenceClassification, BertTokenizer,
                          DataCollatorWithPadding, EarlyStoppingCallback,
                          Trainer, TrainingArguments)

# Fetch the NLTK resources this notebook relies on.
for nltk_resource in ('wordnet', 'omw-1.4', 'stopwords'):
    download(nltk_resource)


# Hate Speech and Counter Speech prediction models

## Hate Speech

In [None]:
# Load the raw CSV; every row is read from the 'train' split below.
dataset = load_dataset('csv', data_files='...')
labels = dataset['train']['label']  # Assuming 'label' is the column name for labels

# In this step you should also preprocess your data as desired

# First stratified split: 80% train, 20% held out for validation + test.
# The split is derived from the labels alone, so a zero placeholder of the
# right length stands in for the feature matrix.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_indices, temp_indices = next(splitter.split(np.zeros(len(labels)), labels))
train_set = dataset['train'].select(train_indices)
temp_set = dataset['train'].select(temp_indices)

# Second stratified split: halve the held-out part into validation and test.
# `random_state` (previously misspelled, which raised a TypeError) pins the
# shuffle so the split is reproducible.
temp_labels = temp_set['label']
splitter_temp = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
validation_indices, test_indices = next(
    splitter_temp.split(np.zeros(len(temp_labels)), temp_labels)
)
validation_set = temp_set.select(validation_indices)
test_set = temp_set.select(test_indices)

In [None]:
# Load the pretrained checkpoint twice: once as a tokenizer, once as a
# sequence-classification model with two output labels.
HATEBERT_CHECKPOINT = "GroNLP/hateBERT"
tokenizer = AutoTokenizer.from_pretrained(HATEBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    HATEBERT_CHECKPOINT,
    num_labels=2,
)

# Batch tokenizer used with `Dataset.map`
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: mapping with a 'text' key holding the raw strings.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    encoded = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Apply the tokenizer to each split in batched mode
tokenized_train, tokenized_validation, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_set, validation_set, test_set)
)


In [None]:
# Fine-tuning configuration for the hate-speech classifier.
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=1,
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate every 500 steps and reload the checkpoint with the lowest
    # validation loss once training finishes
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Train on the tokenized training split and evaluate on the validation split,
# with early stopping at a patience of 3.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

## Counter Speech

In [None]:
def prepare_data(df):
    """Merge consecutive rows of ``df`` into sentence-pair examples.

    Rows are consumed two at a time by position: (0, 1), (2, 3), ... When the
    frame has an odd number of rows the trailing unpaired row is dropped.

    Args:
        df: DataFrame providing a 'text' and a 'label' column.

    Returns:
        DataFrame with one row per pair and two columns:
        'text'  -- "[CLS] <first> [SEP] <second>", each text passed through
                   ``replace_emojis`` first;
        'label' -- two-element list ``[first_label, second_label]``.
    """
    # Extract each column once; indexing plain lists avoids building a row
    # Series through `df.iloc[i]` four times per pair.
    texts = df['text'].tolist()
    row_labels = df['label'].tolist()

    pairs, labels = [], []
    for i in range(0, len(df) - 1, 2):
        # NOTE(review): the markers are inserted as literal text; if the
        # downstream tokenizer also adds special tokens they may end up
        # duplicated -- confirm this is intended.
        pairs.append(
            "[CLS] " + replace_emojis(texts[i]) + " [SEP] " + replace_emojis(texts[i + 1])
        )
        labels.append([row_labels[i], row_labels[i + 1]])
    return pd.DataFrame({'text': pairs, 'label': labels})


def split_data(df, test_size=0.2, random_state=42):
    """Split paired examples into train and test portions.

    Args:
        df: DataFrame with a 'text' column and a 'label' column whose entries
            are two-element sequences.
        test_size: forwarded to ``train_test_split``; defaults to 0.2.
        random_state: forwarded to ``train_test_split``; defaults to 42.

    Returns:
        The value of ``train_test_split(X, y, ...)``, unpacked by the caller
        as ``X_train, X_test, y_train, y_test``. ``X`` is the 'text' column
        and ``y`` a two-column frame ('label_1', 'label_2').
    """
    X = df['text']
    # Expand the per-row label pairs into two columns; reusing df's index
    # keeps X and y aligned by label as well as by position.
    y = pd.DataFrame(
        df['label'].tolist(),
        columns=['label_1', 'label_2'],
        index=df.index,
    )
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Read the dialogue turns; consecutive rows are expected to form a pair.
# NOTE(review): `load_data` is not defined in the visible cells -- presumably
# provided elsewhere; confirm before running.
subdialogues_df = load_data("...")

# Combine consecutive rows into pair examples (apply any extra pre-processing here).
pairs_df = prepare_data(subdialogues_df)

# Hold out part of the pairs for evaluation.
(X_train, X_test,
 y_train, y_test) = split_data(pairs_df)

In [None]:
# Model and Training
class DialogueDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that tokenizes one dialogue pair per lookup.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` in this notebook is imported from Hugging Face
    ``datasets``, which is a different class.

    Args:
        texts: pandas Series of input strings (read positionally via ``.iloc``).
        labels: pandas object with one row of numeric labels per text (read
            positionally via ``.iloc``).
        tokenizer: callable tokenizer returning 'input_ids' and
            'attention_mask' tensors with a leading batch dimension of 1.
        max_len: length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors and a
            float32 'labels' tensor.
        """
        text = self.texts.iloc[idx]
        row_labels = self.labels.iloc[idx]
        # Call the tokenizer directly (as `tokenize_function` does) rather
        # than going through the deprecated `encode_plus`.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # Drop the batch dimension added by return_tensors='pt'.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            # Going through numpy accepts either a label row or a scalar.
            'labels': torch.tensor(np.asarray(row_labels, dtype=np.float32))
        }

# Tokenizer for the counter-speech model (rebinds the notebook-level `tokenizer`).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split in a DialogueDataset sharing that tokenizer.
train_dataset, test_dataset = (
    DialogueDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)

# Batches of 8 examples for each split.
train_loader, test_loader = (
    DataLoader(split_dataset, batch_size=8)
    for split_dataset in (train_dataset, test_dataset)
)


In [None]:
# Counter-speech classifier: BERT with two outputs, matching the two labels
# carried by every dialogue pair.
# NOTE(review): the datasets yield float label vectors; the model presumably
# treats that as a multi-label problem -- confirm this is the intended loss.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
training_args = TrainingArguments(
    # NOTE(review): same output/log directories as the hate-speech run above,
    # so checkpoints from the two runs share a folder.
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # EarlyStoppingCallback needs periodic evaluation, a metric to monitor and
    # best-model tracking; without these settings it cannot operate. The
    # values mirror the hate-speech configuration, and eval_steps matches the
    # default checkpoint interval (500 steps) so the two stay in sync.
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# NOTE(review): the held-out test split doubles as the early-stopping set here
# because no separate validation split is created for this task.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()