# **Fine-Tuning BART for Counter Speech Generation** 

# Set Up

In [1]:
# Mount Google Drive
# The training CSV and the project folder referenced by root_dir live on Drive,
# so the drive has to be mounted before any path below is used.
from google.colab import drive

# Mount point is /content/gdrive; root_dir ("gdrive/My Drive/...") is relative to /content.
# NOTE(review): the relative root_dir assumes the working directory is /content — confirm.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Wrap long lines in output cells
from IPython.display import HTML, display

def set_css():
  """Display a small stylesheet that sets `white-space: pre-wrap` on <pre> elements."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))
# Register set_css on IPython's 'pre_run_cell' event so the style is re-emitted for every executed cell.
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
# Install Libraries
%%capture
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install tweet-preprocessor
!pip install accelerate
!pip install optuna

In [4]:
import math
import os
import shutil
import sys

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AdamW, get_cosine_schedule_with_warmup, AutoConfig

In [5]:
# Pretrained checkpoint to fine-tune (alternative: 'facebook/bart-base')
# model_name = 'facebook/bart-base'
model_name = 'facebook/bart-large'

# Project folder on the mounted Google Drive and the training CSV inside it
root_dir = "gdrive/My Drive/Master_Thesis/"
train_dir = os.path.join(root_dir, 'data/Custom/CONAN_train.csv')

# Name of the fine-tuned model (also used as the Trainer output_dir) and the
# folder passed to save_model at the end of the notebook
my_model_name = "bart_CONAN"
save_directory = os.path.join(root_dir, 'models/')

In [6]:
# Baseline training configuration shared by the hyperparameter search and the final run.
# learning_rate, weight_decay, warmup_ratio (and num_train_epochs for the final run)
# are overwritten later on trainer.args.
training_args = Seq2SeqTrainingArguments(
    output_dir=my_model_name,
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=3e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    num_train_epochs=10.0,
    # --- evaluation and checkpointing, both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    predict_with_generate=True,
    # --- hardware / memory ---
    fp16=True,
    auto_find_batch_size=True,
)

def optuna_hp_space(trial):
  """Sample one hyperparameter configuration from the given Optuna trial.

  Returns a dict with 'learning_rate' and 'warmup_ratio' (both sampled on a
  log scale) and 'weight_decay' (sampled on a linear scale). The number of
  epochs and the batch size are intentionally not part of the search space.
  """
  learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
  warmup_ratio = trial.suggest_float("warmup_ratio", 0.1, 0.3, log=True)
  weight_decay = trial.suggest_float('weight_decay', 0.01, 0.3)
  return {
      "learning_rate": learning_rate,
      "warmup_ratio": warmup_ratio,
      "weight_decay": weight_decay,
      }

# Data

For our training, we have a few corpora to work with.
1. the CONAN datasets
2. the QIAN Benchmark dataset

---

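For fine-tuning GPT models (Causal Language Modeling), we concatenate the hate speech and the counter speech into one string, each beginning with its own prefix:
### **Hate-speech: "Text" Counter-speech: "Text"**

For BART (sequence-to-sequence), as used in this notebook, no concatenation is needed: the hate speech is the model input and the counter speech is the target.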


In [7]:
# Read csv file into dataframe
# The preprocessing step further down reads the 'Hate_Speech' and 'Counter_Speech'
# columns, so the CSV at train_dir has to provide both.
df = pd.read_csv(train_dir)

### Convert **Dataframe** to Huggingface **Dataset**

In [8]:
# train validation split
dataset = Dataset.from_pandas(df)
# Since we already have a separate test set, the [test] split is used for validation.
# The shuffle is seeded (with the same seed the Trainer uses) so that the split,
# and therefore every validation loss, is reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=training_args.seed)

### Tokenizer and Model

In [9]:
# initiate pretrained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

# The collator is handed a model instance; the checkpoint is loaded exactly once here
# instead of once for the collator and a second time just to print the architecture.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config),
)

# Last expression of the cell: display the architecture of the loaded model.
data_collator.model

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

## Data Pre-processing

Preparation for batching: tokenize the hate speech (inputs) and the counter speech (targets), truncating each to at most 512 tokens.

In [10]:
def preprocess_function(examples):
  """Tokenize a batch of hate speech / counter speech pairs.

  `examples` is a batch exposing the 'Hate_Speech' and 'Counter_Speech' columns.
  The hate speech texts are passed as the tokenizer input and the counter speech
  texts as `text_target`; both are truncated to 512 tokens. The tokenizer's
  result is returned unchanged.
  """
  sources = list(examples['Hate_Speech'])
  references = list(examples['Counter_Speech'])
  return tokenizer(sources, text_target=references, max_length=512, truncation=True)

In [11]:
# Tokenize both splits in batches using 4 worker processes; the original columns
# are dropped so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/7632 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1908 [00:00<?, ? examples/s]

# Hyperparameter Tuning

In [None]:
def model_init(trial):
  """Load a fresh copy of the pretrained checkpoint `model_name` with the shared `config`.

  `trial` is accepted but not used. `from_tf` is only True when the checkpoint
  name contains ".ckpt".
  """
  is_tf_checkpoint = ".ckpt" in model_name
  return AutoModelForSeq2SeqLM.from_pretrained(model_name, from_tf=is_tf_checkpoint, config=config)

In [None]:
# Trainer for the hyperparameter search: it receives model_init instead of a fixed
# model, so the model is built by calling model_init. No compute_metrics is set.
trainer = Seq2SeqTrainer(
    args=training_args,
    model_init=model_init,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #compute_metrics=compute_metrics
)

In [None]:
# Run 10 Optuna trials over the space defined by optuna_hp_space and minimise the objective.
# The trial values in the log output equal the validation loss of the last epoch
# (e.g. trial 0: 2.3670547 == epoch 10), i.e. the objective is the final evaluation
# loss of a trial, not the loss of its best epoch.
best_trial = trainer.hyperparameter_search(
    direction="minimize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=10,
)

[32m[I 2023-05-18 04:57:24,759][0m A new study created in memory with name: no-name-969729d9-19ad-466c-916f-5b40cc7c04cb[0m
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,5.246,2.708601
2,2.9102,2.593333
3,2.7244,2.517387
4,2.5838,2.465034
5,2.4648,2.420606
6,2.3976,2.397617
7,2.3457,2.379739
8,2.2979,2.369459
9,2.2847,2.366482
10,2.2803,2.367055


[32m[I 2023-05-18 05:58:07,278][0m Trial 0 finished with value: 2.3670547008514404 and parameters: {'learning_rate': 3.5755524504506446e-06, 'warmup_ratio': 0.1939669890374079, 'weight_decay': 0.012205597300066309}. Best is trial 0 with value: 2.3670547008514404.[0m


Epoch,Training Loss,Validation Loss
1,3.6715,2.5601
2,2.5963,2.412105
3,2.2127,2.25065
4,1.787,2.102692
5,1.3589,2.016563
6,1.0326,1.988066
7,0.7787,2.076373
8,0.5897,2.146431
9,0.4771,2.216847
10,0.4301,2.237401


[32m[I 2023-05-18 06:58:44,182][0m Trial 1 finished with value: 2.237401247024536 and parameters: {'learning_rate': 5.030639760950113e-05, 'warmup_ratio': 0.26263705636584506, 'weight_decay': 0.03969533958199956}. Best is trial 1 with value: 2.237401247024536.[0m


Epoch,Training Loss,Validation Loss
1,4.4254,2.648682
2,2.7945,2.514085
3,2.5564,2.428645
4,2.3796,2.364466
5,2.2214,2.309668
6,2.1265,2.279738
7,2.0495,2.256585
8,1.9898,2.245041
9,1.9625,2.244636
10,1.9556,2.24559


[32m[I 2023-05-18 08:00:36,714][0m Trial 2 finished with value: 2.2455902099609375 and parameters: {'learning_rate': 6.607897771260128e-06, 'warmup_ratio': 0.14391961284153518, 'weight_decay': 0.04161970433478308}. Best is trial 1 with value: 2.237401247024536.[0m


Epoch,Training Loss,Validation Loss
1,5.2931,2.717038
2,2.9277,2.62157
3,2.7925,2.576748
4,2.7043,2.539963
5,2.6284,2.514612
6,2.5937,2.499728
7,2.5706,2.495577
8,2.5424,2.486003
9,2.5421,2.483348
10,2.5427,2.483135


[32m[I 2023-05-18 09:01:19,549][0m Trial 3 finished with value: 2.4831349849700928 and parameters: {'learning_rate': 1.7159209823698023e-06, 'warmup_ratio': 0.10510918237791124, 'weight_decay': 0.10874756703074243}. Best is trial 1 with value: 2.237401247024536.[0m


Epoch,Training Loss,Validation Loss
1,4.1979,2.625213
2,2.7455,2.488663
3,2.4755,2.387891
4,2.2713,2.322192
5,2.0871,2.252231
6,1.9728,2.216737
7,1.879,2.193183
8,1.8092,2.179631
9,1.7719,2.18145
10,1.7634,2.182661


[32m[I 2023-05-18 10:01:54,382][0m Trial 4 finished with value: 2.1826605796813965 and parameters: {'learning_rate': 8.777595074568718e-06, 'warmup_ratio': 0.13348459159449302, 'weight_decay': 0.016677659015418556}. Best is trial 4 with value: 2.1826605796813965.[0m


Epoch,Training Loss,Validation Loss
1,3.6639,2.563744
2,2.5861,2.40605
3,2.1446,2.206249
4,1.7874,2.096989
5,1.4762,2.020329
6,1.2515,1.984923
7,1.0868,2.005079
8,0.9576,2.013144
9,0.8863,2.041623
10,0.8601,2.049186


[32m[I 2023-05-18 11:02:48,170][0m Trial 5 finished with value: 2.0491859912872314 and parameters: {'learning_rate': 2.5574655007629766e-05, 'warmup_ratio': 0.1321825827978752, 'weight_decay': 0.09065495627977234}. Best is trial 5 with value: 2.0491859912872314.[0m


Epoch,Training Loss,Validation Loss
1,4.9499,2.695854


[32m[I 2023-05-18 11:08:31,902][0m Trial 6 pruned. [0m


Epoch,Training Loss,Validation Loss
1,5.1817,2.710953


[32m[I 2023-05-18 11:14:10,273][0m Trial 7 pruned. [0m


Epoch,Training Loss,Validation Loss
1,3.3681,2.641568


[32m[I 2023-05-18 11:19:49,301][0m Trial 8 pruned. [0m


Epoch,Training Loss,Validation Loss
1,4.3902,2.644359


[32m[I 2023-05-18 11:25:29,438][0m Trial 9 pruned. [0m


In [None]:
# Show the best run of the search (run id, objective value and sampled hyperparameters).
best_trial

BestRun(run_id='5', objective=2.0491859912872314, hyperparameters={'learning_rate': 2.5574655007629766e-05, 'warmup_ratio': 0.1321825827978752, 'weight_decay': 0.09065495627977234}, run_summary=None)

In [None]:
# Copy the three searched hyperparameters of the best run onto the trainer's arguments.
best_p = best_trial.hyperparameters
trainer.args.learning_rate = best_p['learning_rate']
trainer.args.weight_decay = best_p['weight_decay']
trainer.args.warmup_ratio = best_p['warmup_ratio']

# Training

In [12]:
# Fresh pretrained checkpoint for the final fine-tuning run.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    #compute_metrics=compute_metrics
)

# Hyperparameters of the best search run (run_id '5'), hard-coded so this cell does
# not depend on re-running the search, plus a longer schedule of 20 epochs.
trainer.args.learning_rate = 2.5574655007629766e-05
trainer.args.weight_decay = 0.09065495627977234
trainer.args.warmup_ratio = 0.1321825827978752
trainer.args.num_train_epochs = 20

trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,4.0472,2.618528
2,2.6842,2.437178
3,2.3598,2.286148
4,1.994,2.133574
5,1.6792,2.059788
6,1.3893,2.002633
7,1.1667,1.990357
8,0.9969,2.021491
9,0.8401,2.085091
10,0.7061,2.075227


TrainOutput(global_step=19080, training_loss=1.0488255352843983, metrics={'train_runtime': 7093.8589, 'train_samples_per_second': 21.517, 'train_steps_per_second': 2.69, 'total_flos': 1.2615376178577408e+16, 'train_loss': 1.0488255352843983, 'epoch': 20.0})

In [13]:
# remove saved checkpoints
# Pure-Python equivalent of `rm -rf`: deletes the Trainer output directory and, like
# the -f flag, stays silent if it does not exist. This avoids interpolating a
# variable into a shell command.
shutil.rmtree(my_model_name, ignore_errors=True)

### Quick Evaluation for sanity check

In [14]:
# Evaluate on the validation split and report perplexity = exp(evaluation loss).
# `math` is imported in the import cell at the top of the notebook.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 7.32


## Save Model

In [15]:
# import utilities
# root_dir is a relative path, so it is anchored at /content/ (the parent of the
# Drive mount point) before being added to the module search path.
sys.path.append(os.path.join("/content/", root_dir))

# NOTE(review): `utilities` is a project-local module expected in that folder; it is not shown here.
from utilities import save_model

In [16]:
# Save model option
# `model` is the object that was passed to the trainer for the final run.
# NOTE(review): save_model comes from the external utilities module; presumably it
# writes tokenizer and model under save_directory/my_model_name when save_option
# is True — confirm against utilities.py.
save_model(tokenizer, model, save_directory, my_model_name, save_option=True)