# Adversarial Training
This is our core work. We adversarially train with three different strategies:
1. Adversarial Training with Custom Attack from Pre-Trained Model
2. Adversarial Finetuning with Custom Attack from the Initial Hate Speech Model
3. Adversarial Training with Textfooler Attack from Pre-Trained Model

In the file 'adversarial_attacks.ipynb' these trained hate speech models will be attacked to see if we can achieve any improvements.

The following naming is used below:
- <strong>Pre-Trained Model:</strong> This is the [RoBERTa model](https://huggingface.co/docs/transformers/model_doc/roberta) from Hugging Face
- <strong>Initial Hate Speech Model:</strong> This is our RoBERTa model, which we trained on the Hate speech data set.
- <strong>Trained Hate Speech Model:</strong> RoBERTa model, which was trained using adversarial training
    - <strong>Custom Trained:</strong> RoBERTa model, which was trained using the custom attack
    - <strong>Custom Finetuned:</strong> Initial Hate Speech Model, which was finetuned using the custom attack
    - <strong>Textfooler Trained:</strong> RoBERTa model, which was trained using the Textfooler attack


## Install

In [None]:
# Install the third-party packages used by this notebook.
# The kernel needs to be restarted after the pip installs.
# It appears that textattack has dependency conflicts if not run on an ARM chip.
!pip3 install transformers
!pip3 install textattack
!pip3 install --force-reinstall textattack # force is often needed due to dependency conflicts
!pip3 install --upgrade tensorflow
!pip3 install sentence_transformers

In [None]:
# Workaround for nltk downloads that fail because of SSL certificate problems.
# Adapted from https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed
import nltk
import ssl

# Only patch the default HTTPS context factory when this Python build exposes
# the unverified-context factory; otherwise leave the ssl module untouched.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    # NOTE: from here on, HTTPS contexts created through this hook are built by
    # the unverified factory, for the remainder of the session.
    ssl._create_default_https_context = _create_unverified_https_context

# Called without arguments, i.e. no specific resource is requested here.
nltk.download()

## Import

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.wsd import lesk
nltk.download('stopwords')
nltk.download('punkt')

# textattack packages
import textattack
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics import WordEmbeddingDistance

# transformers packages
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from transformers import RobertaTokenizer, RobertaForSequenceClassification


from trainer import Trainer

## Load Dataset


In [None]:
# Preprocessing
# Adapted from https://www.kaggle.com/code/soumyakushwaha/ethicalcommunicationai
# ----------------------------------------
# English stopwords as a set, so the membership test in clean_text is O(1).
stopword = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw tweet into a lower-cased, stopword-free string.

    The cleaning steps are applied in this order: lower-casing, removal of
    square-bracketed spans, URLs, HTML-like tags, @mentions and '#' signs,
    every remaining non-word/non-space character, ASCII punctuation
    (this also drops underscores, which ``\\w`` keeps), newlines, and any
    token containing a digit. The result is tokenised with ``word_tokenize``
    and stopwords are dropped.

    Args:
        text: The raw text; any value is accepted and converted with ``str``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # The pattern was previously "\@w+", which only matched '@' followed by
    # literal 'w' characters, so user handles survived as ordinary words.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space so the words on both sides stay separate.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    tweet_tokens = word_tokenize(text)
    filtered_tweets = [w for w in tweet_tokens if w not in stopword]  # removing stopwords
    return " ".join(filtered_tweets)
#--------------------------------------------------------------------------------------

In [None]:
# Constants
# NOTE(review): BATCH_SIZE, LEARNING_RATE, EPOCHS and MODEL_PATH are not
# referenced by the cells visible in this notebook (the TrainingArgs cell uses
# its own literals) - confirm whether they are still needed.
SEED = 42
BATCH_SIZE = 32
LEARNING_RATE = 1e-5
MAX_TEXT_LENGTH = 512
EPOCHS = 10
MODEL_PATH = 'roberta_model.bin'
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Set seeds for numpy and torch (and all CUDA devices when available)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


labeled_data = pd.read_csv('./datasets/hate_speech_data.csv')
# Hate Speech and Offensive Language Data: 25.3k total entries.
# - Class 0: 1,430 entries (hate speech)
# - Class 1: 19,190 entries (offensive language)
# - Class 2: 4,163 entries (neither)

# Processing labeled hate speech dataset: keep classes 0 and 1 only and give
# both the category 1.
# NOTE(review): hate_offensive_data is not used again in the cells shown here.
hate_offensive_data = labeled_data[labeled_data['class'] != 2].copy()
hate_offensive_data.loc[:, 'category'] = hate_offensive_data['class'].replace([0, 1], 1)
hate_offensive_data = hate_offensive_data.rename(columns={'tweet': 'text'})

# Test 1 ---
# Select data for each class
hate_speech_data = labeled_data[labeled_data['class'] == 0].copy()
offensive_data = labeled_data[labeled_data['class'] == 1].copy()
neither_data = labeled_data[labeled_data['class'] == 2].copy()
# Balance the classes: sample as many offensive / neither rows as there are
# hate speech rows (seeded, so the sample is reproducible).
sample_size = len(hate_speech_data)
offensive_sample = offensive_data.sample(n=sample_size, random_state=SEED)
neither_sample = neither_data.sample(n=sample_size, random_state=SEED)
# Binary target: hate speech and offensive language -> 1, neither -> 0.
hate_speech_data['category'] = 1
offensive_sample['category'] = 1
neither_sample['category'] = 0
# Keep only the two columns needed downstream, in (text, label) order.
sampled_data = pd.concat([hate_speech_data, offensive_sample, neither_sample], ignore_index=True)[['tweet', 'category']]
sampled_data.rename(columns={'tweet': 'text', 'category': 'label'}, inplace=True)
sampled_data['text'] = sampled_data['text'].apply(clean_text)  # clean_text is defined in the preprocessing cell
# 70% train; the remaining 30% is halved into validation and test.
train_data, intermediate_data = train_test_split(sampled_data, test_size=0.3, random_state=SEED)
validation_data, test_data = train_test_split(intermediate_data, test_size=0.5, random_state=SEED)
# Tokenise each split, padded and truncated to at most MAX_TEXT_LENGTH tokens.
# NOTE(review): the *_tokens tensors are not referenced by the later cells
# shown here; the textattack datasets are built from the data frames instead.
train_tokens = tokenizer(train_data['text'].tolist(), padding=True, truncation=True, max_length=MAX_TEXT_LENGTH, return_tensors='pt')
validation_tokens = tokenizer(validation_data['text'].tolist(), padding=True, truncation=True, max_length=MAX_TEXT_LENGTH, return_tensors='pt')
test_tokens = tokenizer(test_data['text'].tolist(), padding=True, truncation=True, max_length=MAX_TEXT_LENGTH, return_tensors='pt')
print(f"New Train data shape: {train_data.shape}")
print(f"New Validation data shape: {validation_data.shape}")
print(f"New Test data shape: {test_data.shape}")


#### Function to load models

In [None]:
def load_model(file_name):
    """Rebuild the RoBERTa classifier and load saved weights into it.

    A ``RobertaConfig`` with two labels is filled with the settings listed
    below, a ``RobertaForSequenceClassification`` is created from it, and the
    state dict read from ``file_name`` is loaded into that model. The model is
    switched to evaluation mode and placed on the CPU.

    Args:
        file_name: Path passed to ``torch.load``; expected to contain a state
            dict matching the model built here.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``tokenizer`` is the pretrained
        "roberta-base" ``RobertaTokenizer``.
    """
    cpu_device = torch.device('cpu')

    config = RobertaConfig()
    config.num_labels = 2

    # Settings copied onto the config one attribute at a time.
    base_settings = {
        "architectures": ["RobertaForMaskedLM"],
        "attention_probs_dropout_prob": 0.1,
        "bos_token_id": 0,
        "eos_token_id": 2,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "layer_norm_eps": 1e-05,
        "max_position_embeddings": 514,
        "model_type": "roberta",
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "pad_token_id": 1,
        "type_vocab_size": 1,
        "vocab_size": 50265,
    }
    for setting_name, setting_value in base_settings.items():
        setattr(config, setting_name, setting_value)

    model = RobertaForSequenceClassification(config)
    # map_location keeps the loaded tensors on the CPU.
    saved_weights = torch.load(file_name, map_location=cpu_device)
    model.load_state_dict(saved_weights)

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    model.eval()
    model.to(cpu_device)
    return model, tokenizer

## Attack Setup
Now we prepare the following two attacks for training:

- a custom attack
- textfooler attack from textattack


### Custom Attack

In [None]:
# NOTE(review): ATTACK_SEED is not referenced by the cells shown here.
ATTACK_SEED = 71

def create_custom_attack(model_wrapper):
    """Assemble the custom word-swap attack used in the training loop.

    Built following https://textattack.readthedocs.io/en/latest/api/attack.html
    from four parts: an untargeted classification goal, three constraints, an
    embedding-based word swap transformation and a greedy word-importance
    search.

    Args:
        model_wrapper: The wrapped model the goal function is created for.

    Returns:
        The configured ``textattack.Attack``.
    """
    return textattack.Attack(
        # Untargeted: lower the score of the correct label until it is no
        # longer the predicted label.
        textattack.goal_functions.UntargetedClassification(model_wrapper),
        [
            # Do not modify the same word more than once.
            RepeatModification(),
            # Controls the modification of stopwords such as "the", "is", "and".
            StopwordModification(),
            # Replacement words must stay close to the original word in
            # embedding space (cosine similarity of at least 0.9).
            WordEmbeddingDistance(min_cos_sim=0.9),
        ],
        # 50 candidates per word (the default, per the original comment).
        textattack.transformations.word_swaps.word_swap_embedding.WordSwapEmbedding(max_candidates=50),
        textattack.search_methods.GreedyWordSwapWIR(wir_method="delete"),
    )


#### Textfooler Attack from textattack

In [None]:
# Use the TextFooler recipe shipped with textattack, see
# https://textattack.readthedocs.io/en/latest/3recipes/attack_recipes.html

def create_textfooler_attack(model_wrapper):
    """Return the ``TextFoolerJin2019`` attack recipe built for ``model_wrapper``.

    The attack is only constructed here; it is not run.
    """
    recipe = textattack.attack_recipes.textfooler_jin_2019.TextFoolerJin2019
    return recipe.build(model_wrapper)

## Train Model on the Attacked Data
We now use the attacked data to train our model again. For the training we use the trainer of the textattack library.
First we set up the evaluation and training datasets as well as the training arguments.

In [None]:
## Define training based on https://textattack.readthedocs.io/en/latest/api/trainer.html
# Turn each data frame row into a plain tuple; the frames hold the columns
# ('text', 'label') in that order, so each tuple is (text, label).
temp = list(validation_data.itertuples(index=False, name=None))
eval_dataset = textattack.datasets.Dataset(temp)

temp_train = list(train_data.itertuples(index=False, name=None))
train_dataset = textattack.datasets.Dataset(temp_train)
# Shared by all three training runs below.
# With a per-device batch size of 8 and 4 gradient accumulation steps, weights
# are updated once per 32 examples per device.
training_args = textattack.TrainingArgs(
    num_epochs=3,
    num_clean_epochs=1,
    num_train_adv_examples=1000,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    log_to_tb=True,
)

#### Run Custom Attack Trainer

In [None]:
# Strategy 1: adversarial training with the custom attack, starting from the
# pretrained "roberta-base" checkpoint with a two-label classification head.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
pretrained_roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
pretrained_roberta_model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(pretrained_roberta_model, tokenizer)

custom_attack = create_custom_attack(pretrained_roberta_model_wrapper)

# Trainer here is the class imported from the local `trainer` module, not
# textattack.Trainer.
custom_attack_trainer = Trainer(
    pretrained_roberta_model_wrapper,
    "classification",
    custom_attack,
    train_dataset,
    eval_dataset,
    training_args
)
custom_attack_trainer.train()

custom_attack_trainer.evaluate()

#### Run Custom Attack Trainer to finetune Initial Hate Speech Model

The model was trained according to 'inital_hate_speech_model_training.ipynb' and is reloaded to be finetuned with attacked data.

In [None]:
# Strategy 2: finetune the Initial Hate Speech Model with the custom attack.
# load_model returns a (model, tokenizer) tuple, so it has to be unpacked;
# binding the whole tuple to one name handed a tuple to
# HuggingFaceModelWrapper where a model is expected.
initial_hate_speech_model, tokenizer = load_model('initial_hate_speech_model.bin')
initial_hate_speech_model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(initial_hate_speech_model, tokenizer)

custom_attack = create_custom_attack(initial_hate_speech_model_wrapper)

# Trainer here is the class imported from the local `trainer` module.
custom_attack_trainer_finetune = Trainer(
    initial_hate_speech_model_wrapper,
    "classification",
    custom_attack,
    train_dataset,
    eval_dataset,
    training_args
)
custom_attack_trainer_finetune.train()

custom_attack_trainer_finetune.evaluate()

#### Run textfooler Attack Trainer

In [None]:
# Strategy 3: adversarial training with the Textfooler attack, starting again
# from a fresh pretrained "roberta-base" checkpoint with two labels.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
pretrained_roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
pretrained_roberta_model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(pretrained_roberta_model, tokenizer)

textfooler_attack = create_textfooler_attack(pretrained_roberta_model_wrapper)


# This run uses textattack.Trainer, whereas the two custom attack runs use the
# Trainer imported from the local `trainer` module.
textfooler_attack_trainer = textattack.Trainer(
    pretrained_roberta_model_wrapper,
    "classification",
    textfooler_attack,
    train_dataset,
    eval_dataset,
    training_args
)
textfooler_attack_trainer.train()

textfooler_attack_trainer.evaluate()