# **Fine-Tuning GPT-2 for Counter Speech Generation** 

# Set Up

In [1]:
# Mount Google Drive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!nvidia-smi

Sat Jun 10 20:58:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# wrap outputs cells
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [4]:
# Install Libraries
%%capture
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install tweet-preprocessor
!pip install accelerate
!pip install optuna

In [5]:
import os
import pandas as pd
import sys
from datasets import Dataset
import numpy as np
from transformers import AutoConfig, AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, AdamW, get_cosine_schedule_with_warmup

In [6]:
# Data Path
root_dir = "gdrive/My Drive/Master_Thesis/"
train_dir = os.path.join(root_dir, 'data/Custom/CONAN_train.csv')

# Define model name
model_name = "gpt2-medium"

# this is for saving models later
my_model_name = "gpt2_CONAN"
save_directory = os.path.join(root_dir, 'models/')

In [7]:
training_args = TrainingArguments(
    output_dir=my_model_name,
    num_train_epochs=10.0,
    learning_rate=3.800568576836524e-05,
    weight_decay=0.050977894796868116,
    warmup_ratio=0.10816909354342182,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    auto_find_batch_size=True,
    report_to="none",
    logging_dir=root_dir+"logs",
)
  
def optuna_hp_space(trial):
  return {
      "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
      "warmup_ratio": trial.suggest_float("warmup_ratio", 0.1, 0.3, log=True),
      #"num_train_epochs":trial.suggest_int('num_train_epochs', low = 3, high = 8),
      #"per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]),
      "weight_decay":trial.suggest_float('weight_decay', 0.01, 0.3),
      }

# Data

For our training, we have a few corpora to work with.
1. the CONAN datasets
2. the QIAN Benchmark dataset

---

For fine-tuning GPT models (Causal Language Modeling), we concatenate the hate speech and the counter speech in one string with each begins with their prefix:
### **Hate-speech: "Text" Counter-speech: "Text"**


In [116]:
def concat_hs_cs(df):
  # concatenate hate-speech and counter-speech
  ### TODO: Concate category tokens to the inputs
  df["text"] = '<|endoftext|>' + df["Target"] + "Hate-speech: " + df['Hate_Speech'] + " " + "Counter-speech: " + df["Counter_Speech"] + '<|endoftext|>'

  return df

def extract_labels(df, targets):
  df_test = df.copy()
  types = targets
  def format_multiclass(item):
    return item.split("/")
  
  def format_multiclass_2(item):
    if item != item:
      return []
    return item.upper().split(",")

  def format_class(item):
    for ele in item:
      if ele.strip() in types:
        continue
      else:
        item.remove(ele)
    return item

  def format_tokenize(item):
    temp = ''
    for ele in item:
      temp = temp + '<' + ele.strip() + '>'
    return temp

  df_test["Target"] = np.where(df_test["Target"] == "Islamophobia", "MUSLIMS", df_test["Target"])
  df_test["Target"] = df_test["Target"].map(format_multiclass)
  df_test["Target_2"] = df_test["Target_2"].map(format_multiclass_2)
  df_test["Target"] = df_test["Target"] + df_test["Target_2"]

  df_test["Target"] = df_test["Target"].map(format_class)
  df_test["Target"] = df_test["Target"].map(format_class)
  df_test["Target"] = df_test["Target"].map(format_tokenize)

  return df_test

In [125]:
# Read csv file into dataframe
df = pd.read_csv(train_dir)
types = ["MIGRANTS", "POC", "LGBT+", "MUSLIMS", "WOMEN", "JEWS", "other", "DISABLED"]
df = extract_labels(df, types)
df = concat_hs_cs(df)

### Convert **Dataframe** to Huggingface **Dataset**

In [128]:
# train validation split
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2) # since we already have a seperate test set, we use the [test] split to run validation

### Tokenizer and Model

In [129]:
# initiate pretrained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name) 
config = AutoConfig.from_pretrained(
    model_name
    )  
AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    )

model = AutoModelForCausalLM.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [130]:
# Add new tokens to tokenizer

# new tokens
new_tokens = []
for target in types:
  new_tokens.append('<' + target + '>')

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 1024)

## Data Pre-processing

preparation for batching - tokenize data and chunking into blocks

In [131]:
def preprocess_function(examples):
  return tokenizer(examples["text"], truncation=True)

def group_texts(examples):
  concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  total_length = (total_length // block_size) * block_size
  result = {
      k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
      for k, t in concatenated_examples.items()
      }
  result["labels"] = result["input_ids"].copy()
  return result

In [132]:
# tokenize dataset
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

# Chunking texts for batching
block_size = 512
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

# prepare tokenizer for data pre-processing
tokenizer.pad_token = tokenizer.eos_token

# set up data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map (num_proc=4):   0%|          | 0/7632 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1908 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/7632 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1908 [00:00<?, ? examples/s]

# Training

In [133]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)
setattr(trainer.args, 'num_train_epochs', 20)

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,2.096154
2,6.453600,1.803846
3,1.829000,1.655716
4,1.506200,1.618213
5,1.336200,1.612391
6,1.297100,1.614896
7,1.285900,1.585366
8,1.285900,1.538993
9,1.233100,1.499188
10,1.105900,1.459211


Epoch,Training Loss,Validation Loss
1,No log,2.096154
2,6.453600,1.803846
3,1.829000,1.655716
4,1.506200,1.618213
5,1.336200,1.612391
6,1.297100,1.614896
7,1.285900,1.585366
8,1.285900,1.538993
9,1.233100,1.499188
10,1.105900,1.459211


TrainOutput(global_step=8660, training_loss=1.2712722073526361, metrics={'train_runtime': 9698.0366, 'train_samples_per_second': 1.786, 'train_steps_per_second': 0.893, 'total_flos': 1.6088810832003072e+16, 'train_loss': 1.2712722073526361, 'epoch': 20.0})

In [134]:
# remove saved checkpoints
!rm -rf {my_model_name}

### Quick Evaluation for sanity check

In [135]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 4.30


## Save Model

In [None]:
# import utilities
sys.path.append(os.path.join("/content/", root_dir))

from utilities import save_model

In [138]:
# Save model option
save_model(tokenizer, model, save_directory, my_model_name, save_option=True)