# **Fine-Tuning DialoGPT (GPT-2) for Counter Speech Generation**

# Set Up

In [None]:
# Make Google Drive available inside the Colab runtime
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# wrap outputs cells
from IPython.display import HTML, display

def set_css(*_event_args):
  """Emit a CSS rule that makes `<pre>` output wrap instead of overflowing.

  Registered below as a `pre_run_cell` callback so the style is re-emitted
  before every cell execution. IPython hands such callbacks an
  execution-info object; any positional arguments are accepted and ignored
  so the hook works whether or not that argument is supplied.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Install Libraries
%%capture
!pip install transformers
!pip install transformers[sentencepiece]
!pip install datasets
!pip install tweet-preprocessor
!pip install accelerate
!pip install ekphrasis

In [None]:
import glob
import os
import shutil
import sys
from itertools import chain

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer

In [None]:
# Path
root_dir = "gdrive/My Drive/CounterGEDI/"

# Set Training arguments
params = dict(
    # locations
    save_path=os.path.join(root_dir, 'MODELs/Saved_models/Generator/'),
    model_path='microsoft/DialoGPT-medium',
    cache_path=os.path.join(root_dir, 'Cache/Saved_models/'),
    # Task name -> name of the task for which the model is trained,
    # takes values like: CONAN, Reddit, Gab
    task_name='CONAN',
    topic=True,
    max_length=256,
    train=True,
    # optimisation settings
    batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=5e-6,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    num_train_epochs=50,
    max_steps=-1,
    warmup_steps=0,
    # run settings
    seed=42,
    device='cuda',
    logging='local',
    freeze_layer_count=0,
    block_size=512,
)

In [None]:
# Build the Trainer configuration from `params` so that the values declared
# in the config cell are the ones actually used for the run.
training_args = TrainingArguments(
    output_dir=params['save_path'],
    num_train_epochs=params['num_train_epochs'],
    max_steps=params['max_steps'],  # -1 keeps epoch-based training
    per_device_train_batch_size=params['batch_size'],
    per_device_eval_batch_size=params['batch_size'],
    gradient_accumulation_steps=params['gradient_accumulation_steps'],
    learning_rate=params['learning_rate'],
    weight_decay=params['weight_decay'],
    adam_epsilon=params['adam_epsilon'],
    max_grad_norm=params['max_grad_norm'],
    warmup_ratio=0.2,
    # NOTE(review): per the TrainingArguments docs a positive warmup_steps
    # takes precedence over warmup_ratio; 0 leaves the ratio in effect.
    warmup_steps=params['warmup_steps'],
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # batch_size above is only the starting point when this is enabled
    auto_find_batch_size=True,
    seed=params['seed'],
)

# Fall back to CPU when CUDA is unavailable or not requested in `params`
device = "cuda:0" if torch.cuda.is_available() and params['device']=='cuda' else "cpu"

### Tokenizer and Model

In [None]:
# initiate pretrained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(params['model_path'])
model = AutoModelForCausalLM.from_pretrained(params['model_path']).to(device)

# set up data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## Data Pre-processing

Preparation for batching: tokenize the data and chunk it into fixed-size blocks.

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re

# ekphrasis pre-processing pipeline used by `clean_text`
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time'],

    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'},

    # fix HTML tokens
    fix_html=True,

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # spell correction corpus is left disabled:
    #corrector="twitter",

    # perform word segmentation on hashtags
    unpack_hashtags=True,
    # unpack contractions (can't -> can not)
    unpack_contractions=True,
    # no spell correction for elongated words
    spell_correct_elong=False,

    # tokenizer: a callable taking a string and returning a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # dictionaries used to replace tokens extracted from the text with
    # other expressions; more than one may be passed
    dicts=[emoticons],
)

def load_data_own_gen(data_path='Davidson'):
  '''Load and clean the train / validation / test splits of a dataset.

  data_path: The folder path of the dataset in csv. The folder must contain
    `Train.csv`, `Val.csv` and `Test.csv`; a trailing path separator is
    optional.

  Returns a `(df_train, df_val, df_test)` tuple of DataFrames, each one
  passed through `clean`.
  '''
  # os.path.join only inserts a separator when it is missing, so both
  # 'folder' and 'folder/' resolve to the same files.
  df_train, df_val, df_test = (
      clean(pd.read_csv(os.path.join(data_path, split + '.csv')))
      for split in ('Train', 'Val', 'Test')
  )

  return df_train, df_val, df_test

def clean(df):
  '''Clean both speech columns and build the training text column.

  df: DataFrame with `Hate_Speech` and `Counter_Speech` string columns.

  Returns a copy of `df` in which both columns have been passed through
  `clean_text`, plus a `text` column of the form
  `<hate speech><eos><counter speech><eos>` using the tokenizer's EOS token.
  '''
  # Work on a copy so the caller's frame is left untouched.
  df = df.copy()
  # Series.map returns a new Series; it has to be assigned back, otherwise
  # the cleaning is silently thrown away.
  for column in ("Hate_Speech", "Counter_Speech"):
    df[column] = df[column].map(clean_text)
  df["text"] = df['Hate_Speech'] + tokenizer.eos_token + df["Counter_Speech"] + tokenizer.eos_token

  return df


def clean_text(text):
  """Normalise one raw string with `text_processor` and return plain text.

  The text is tokenized by `text_processor.pre_process_doc`, tokens listed
  in `dropped_tokens` are discarded, the remainder is joined with single
  spaces, and every `<`, `*` or `>` character is replaced by a space.
  """
  dropped_tokens = ['<allcaps>','</allcaps>','<hashtag>','</hashtag>','<elongated>','<emphasis>','<repeated>','\'','s']

  kept_tokens = [tok for tok in text_processor.pre_process_doc(text) if tok not in dropped_tokens]
  joined = " ".join(kept_tokens)

  return re.sub(r"[<\*>]", " ", joined)
        
def preprocess_function(examples):
  """Tokenize the `text` field of a batch with the module-level tokenizer.

  Truncation is enabled; the tokenizer's output is returned unchanged.
  """
  batch_texts = examples["text"]
  encoded = tokenizer(batch_texts, truncation=True)
  return encoded

def group_texts(examples):
  """Concatenate a batch of tokenized examples and re-split it into blocks.

  examples: mapping from column name to a list of per-example lists.

  Every column is flattened into one long list and cut into consecutive
  chunks of `block_size` items (`block_size` is read from the module-level
  variable, so it must be defined before this function is called). The
  trailing remainder shorter than `block_size` is dropped. A `labels`
  column is added as a copy of `input_ids`.
  """
  # chain.from_iterable flattens in linear time, whereas sum(lists, [])
  # re-copies the growing accumulator for every example.
  concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  # round down to a whole number of blocks
  total_length = (total_length // block_size) * block_size
  result = {
      k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
      for k, t in concatenated_examples.items()
      }
  result["labels"] = result["input_ids"].copy()

  return result

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading english - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/english/counts_1grams.txt


In [None]:
# Folder holding Train.csv / Val.csv / Test.csv for the selected task
dataset_path = f"{root_dir}Datasets/{params['task_name']}/"

train_data, valid_data, test_data = load_data_own_gen(data_path=dataset_path)

# Wrap each cleaned DataFrame in a Hugging Face Dataset
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, valid_data, test_data)
)

In [None]:
# tokenize dataset
train_tokenized = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=train_dataset.column_names,
)
eval_tokenized = eval_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eval_dataset.column_names,
)

Map (num_proc=4):   0%|          | 0/7632 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1908 [00:00<?, ? examples/s]

In [None]:
# Chunking texts for batching
block_size = params['block_size']
train_batched = train_tokenized.map(group_texts, batched=True, num_proc=4)
eval_batched = eval_tokenized.map(group_texts, batched=True, num_proc=4)

# prepare tokenizer for data pre-processing
tokenizer.pad_token = tokenizer.eos_token

Map (num_proc=4):   0%|          | 0/7632 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1908 [00:00<?, ? examples/s]

# Training

In [None]:
# Assemble the Trainer from the objects prepared in the cells above
trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=train_batched,
    eval_dataset=eval_batched,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_inputs)

# Run fine-tuning; the returned TrainOutput is shown as the cell result
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,4.618474
2,5.476900,4.012087
3,4.178400,3.475358
4,4.178400,3.167502
5,3.582200,3.024085
6,3.252300,2.933497
7,3.068100,2.874885
8,3.068100,2.834181
9,2.964000,2.804716
10,2.895200,2.788702


TrainOutput(global_step=18100, training_loss=2.5564760652826637, metrics={'train_runtime': 18569.7421, 'train_samples_per_second': 1.947, 'train_steps_per_second': 0.975, 'total_flos': 3.357624490996531e+16, 'train_loss': 2.5564760652826637, 'epoch': 50.0})

In [None]:
# remove saved checkpoints
# The Trainer's output_dir is params['save_path'], so its intermediate
# `checkpoint-*` folders are looked up there. (The previous shell command
# interpolated `my_model_name`, which is never defined in this notebook.)
# glob.escape protects any special characters in the path prefix.
checkpoint_pattern = os.path.join(glob.escape(params['save_path']), 'checkpoint-*')
for checkpoint_dir in glob.glob(checkpoint_pattern):
  if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir, ignore_errors=True)

### Quick Evaluation for sanity check

In [None]:
import math

# Perplexity is the exponential of the mean evaluation loss
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 8.46


## Save Model

In [None]:
# Use only the last component of the model id for the folder name
# (e.g. "org/name" -> "name"). This is kept in a local variable rather than
# written back to params['model_path'], so the model-loading cell can still
# be re-run with the original id and the saved config records the full id.
model_name = os.path.basename(os.path.normpath(params['model_path']))

# The trailing '' keeps the path ending in a separator, as before
output_dir = os.path.join(params['save_path'], params['task_name'] + '_' + model_name, '')

tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

# Good practice: save the run configuration (`params`) together with the trained model
torch.save(params, os.path.join(output_dir, "training_args.bin"))