# **Fine-Tuning GPT-2 for Counter Speech Generation** 

# Set Up

In [1]:
# Mount Google Drive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!nvidia-smi

Tue May 16 19:16:05 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     8W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# wrap outputs cells
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
# Install Libraries
%%capture
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install tweet-preprocessor
!pip install accelerate
!pip install optuna

In [4]:
import os
import pandas as pd
import sys
from datasets import Dataset
from transformers import AutoConfig, AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, AdamW, get_cosine_schedule_with_warmup

In [5]:
# Data Path
root_dir = "gdrive/My Drive/Master_Thesis/"
train_dir = os.path.join(root_dir, 'data/Custom/CONAN_train.csv')

# Define model name
model_name = "gpt2-medium"

# this is for saving models later
my_model_name = "gpt2_CONAN"
save_directory = os.path.join(root_dir, 'models/')

In [6]:
training_args = TrainingArguments(
    output_dir=my_model_name,
    num_train_epochs=10.0,
    learning_rate=3.800568576836524e-05,
    weight_decay=0.050977894796868116,
    warmup_ratio=0.10816909354342182,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    auto_find_batch_size=True,
    report_to="none",
    logging_dir=root_dir+"logs",
)
  
def optuna_hp_space(trial):
  return {
      "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
      "warmup_ratio": trial.suggest_float("warmup_ratio", 0.1, 0.3, log=True),
      #"num_train_epochs":trial.suggest_int('num_train_epochs', low = 3, high = 8),
      #"per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]),
      "weight_decay":trial.suggest_float('weight_decay', 0.01, 0.3),
      }

# Data

For our training, we have a few corpora to work with.
1. the CONAN datasets
2. the QIAN Benchmark dataset

---

For fine-tuning GPT models (Causal Language Modeling), we concatenate the hate speech and the counter speech in one string with each begins with their prefix:
### **Hate-speech: "Text" Counter-speech: "Text"**


In [7]:
def concat_hs_cs(df):
  # concatenate hate-speech and counter-speech
  df["text"] = '<|endoftext|>' + "Hate-speech: " + df['Hate_Speech'] + " " + "Counter-speech: " + df["Counter_Speech"] + '<|endoftext|>'

  return df

In [8]:
# Read csv file into dataframe
df = pd.read_csv(train_dir)
df = concat_hs_cs(df)

### Convert **Dataframe** to Huggingface **Dataset**

In [9]:
# train validation split
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2) # since we already have a seperate test set, we use the [test] split to run validation

### Tokenizer and Model

In [10]:
# initiate pretrained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name) 
config = AutoConfig.from_pretrained(
    model_name
    )  
AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    )

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

## Data Pre-processing

preparation for batching - tokenize data and chunking into blocks

In [11]:
def preprocess_function(examples):
  return tokenizer(examples["text"], truncation=True)

def group_texts(examples):
  concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  total_length = (total_length // block_size) * block_size
  result = {
      k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
      for k, t in concatenated_examples.items()
      }
  result["labels"] = result["input_ids"].copy()
  return result

In [12]:
# tokenize dataset
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

# Chunking texts for batching
block_size = 512
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

# prepare tokenizer for data pre-processing
tokenizer.pad_token = tokenizer.eos_token

# set up data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map (num_proc=4):   0%|          | 0/7632 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1908 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/7632 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1908 [00:00<?, ? examples/s]

# Hyperparameter Tuning

In [None]:
def model_init(trial):
  return AutoModelForCausalLM.from_pretrained(
      model_name,
      from_tf=bool(".ckpt" in model_name),
      config=config,
      )

In [None]:
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

In [None]:
best_trial = trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=10,
)

In [None]:
best_trial

BestRun(run_id='0', objective=1.4545097351074219, hyperparameters={'learning_rate': 3.800568576836524e-05, 'warmup_ratio': 0.10816909354342182, 'weight_decay': 0.050977894796868116}, run_summary=None)

In [None]:
best_p = best_trial.hyperparameters
setattr(trainer.args, 'learning_rate', best_p['learning_rate'])
setattr(trainer.args, 'weight_decay', best_p['weight_decay'])
setattr(trainer.args, 'warmup_ratio', best_p['warmup_ratio'])

# Training

In [13]:
model = AutoModelForCausalLM.from_pretrained(model_name)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)
setattr(trainer.args, 'num_train_epochs', 20)

trainer.train()

In [None]:
# remove saved checkpoints
!rm -rf {my_model_name}

### Quick Evaluation for sanity check

In [None]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 4.58


## Save Model

In [None]:
# import utilities
sys.path.append(os.path.join("/content/", root_dir))

from utilities import save_model

In [None]:
# Save model option
save_model(tokenizer, model, save_directory, my_model_name, save_option=True)