In [None]:
from tqdm import tqdm
import torch
import argparse
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json
import time
import os
import pickle as pkl
# AdamW is the standard optimizer choice for transformer models
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
# Adafactor is the default optimizer setting for T5
from transformers.optimization import Adafactor
from openprompt.data_utils.utils import InputExample
from openprompt import PromptDataLoader
from openprompt.prompts import ManualVerbalizer
from openprompt.prompts import SoftTemplate
from openprompt import PromptForClassification
from openprompt.utils.reproduciblity import set_seed
from openprompt.plms import load_plm
from openprompt.data_utils.data_processor import DataProcessor

In [None]:
# Command-line configuration for the soft-prompt tuning run.
parser = argparse.ArgumentParser("")
# few-shot learning; -1 means full-shot (not read anywhere in the visible code)
parser.add_argument("--shot", type=int, default=-1)
parser.add_argument("--seed", type=int, default=144)
parser.add_argument(
    "--plm_eval_mode",
    action="store_true",
    help="whether to turn off the dropout in the freezed model. Set to true to turn off.")
# turning the dropout off lets different methods be compared under the same setting
parser.add_argument("--tune_plm", action="store_true")
parser.add_argument(
    "--model",
    type=str,
    default='t5',
    help="We test both t5 and t5-lm in this scripts, the corresponding tokenizerwrapper will be automatically loaded.")  # the model to be tested
parser.add_argument("--model_name_or_path", default='t5-small')
parser.add_argument("--template_id", type=int, default=0)
# the verbalizer connects the label space and the prompt space
parser.add_argument("--verbalizer_id", type=int, default=0)
parser.add_argument("--dataset", type=str, default='HateXplain')
parser.add_argument("--result_file", type=str, default="sfs_out/results.txt")
parser.add_argument("--max_steps", default=2000, type=int)
parser.add_argument("--prompt_lr", type=float, default=0.3)
parser.add_argument("--warmup_step_prompt", type=int, default=100)
# NOTE: store_false means this defaults to True; passing the flag turns it OFF.
parser.add_argument("--init_from_vocab", action="store_false")
parser.add_argument("--eval_every_steps", type=int, default=200)
# number of soft tokens in the soft template
parser.add_argument("--soft_token_num", type=int, default=20)
parser.add_argument("--optimizer", type=str, default="Adafactor")

# parse_known_args (instead of parse_args) keeps this cell runnable inside a
# Jupyter kernel, whose own launcher arguments (e.g. "-f <connection file>")
# would otherwise make argparse abort with "unrecognized arguments".
args, unknown_args = parser.parse_known_args()
if unknown_args:
    # Surface ignored arguments so a mistyped flag does not go unnoticed.
    print("Ignoring unrecognized command-line arguments: {}".format(unknown_args), flush=True)

# Apply the seed: it is recorded in every result file further down, so it
# must actually drive the random number generators.
set_seed(args.seed)

In [None]:
class MyDataProcessor(DataProcessor):
    """Read a binary ("No"/"Yes") classification dataset from a single JSON file.

    As indexed by the code below, the JSON file maps a split name
    ("train", "validation", ...) to a dict holding three parallel lists
    under the keys "id", "text" and "label".
    """

    def __init__(self):
        super().__init__()
        self.labels = ["No", "Yes"]

    def random_sampling(self, dataset, sample_num):
        """Return a reproducible random subset of ``dataset``.

        Args:
            dataset: dict with parallel "id", "text" and "label" lists.
            sample_num: number of rows to keep; capped at the dataset size.

        Returns:
            A new dict with the same three keys restricted to the sampled rows.
        """
        import random

        # A private generator yields the same permutation as seeding the
        # module-level RNG with 42, without resetting the random state of
        # the rest of the program.
        rng = random.Random(42)
        total = len(dataset["text"])
        # never ask for more rows than the split contains
        sample_num = min(sample_num, total)
        index_list = list(range(total))
        rng.shuffle(index_list)
        selected_index_list = index_list[:sample_num]
        new_dataset = {
            "id": [dataset["id"][i] for i in selected_index_list],
            "text": [dataset["text"][i] for i in selected_index_list],
            "label": [dataset["label"][i] for i in selected_index_list]
        }
        return new_dataset

    def get_examples(self, data_dir, split, sample_num=-1):
        """Load one split of the JSON file at ``data_dir`` as InputExamples.

        Args:
            data_dir: path of the JSON file (a file, despite the name).
            split: split name; "valid" and "dev" are aliases of "validation".
            sample_num: when not -1, the "train" split is randomly
                sub-sampled to this many rows. The default keeps every row.

        Returns:
            List of InputExample objects built by ``transform``.
        """
        if split in ("valid", "dev"):  # unify the name of the validation set
            split = "validation"

        # Context manager closes the file handle deterministically.
        with open(data_dir, encoding="utf-8") as f:
            dataset = json.load(f)[split]

        if sample_num != -1 and split == "train":
            dataset = self.random_sampling(dataset, sample_num)
            print("%s, sample %d data." % (split, len(dataset["id"])))
        return self.transform(dataset)

    def transform(self, dataset):
        """Convert a column-wise dataset dict into a list of InputExample.

        Each example gets ``guid`` (the stringified id), ``text_a`` (the
        input text) and an integer ``label``.
        """
        res = []
        for i in range(len(dataset["text"])):
            text_a = dataset['text'][i]
            label = int(dataset['label'][i])
            # for HateXplain: label 2 is folded into class 1
            if label == 2:
                label = 1
            guid = "{}".format(dataset['id'][i])

            res.append(InputExample(guid=guid, text_a=text_a, label=label))
        return res



In [None]:
# Load the requested pre-trained language model together with its tokenizer,
# model configuration and tokenizer wrapper class.
plm, tokenizer, model_config, WrapperClass = load_plm(args.model, args.model_name_or_path)

dataset = {}

# Datasets this script knows how to read.
dataset_list = [
    "HateXplain",
    "USElectionHate20",
    "HateCheck",
    "SBIC.v2",
    "measuring-hate-speech"]

# Guard clause: anything outside the list above is unsupported.
if args.dataset not in dataset_list:
    raise NotImplementedError

data_dir = "parsed_dataset/%s_perspective_balance.json" % (args.dataset)

Processor = MyDataProcessor
# Read the training, validation and test splits from the same JSON file.
dataset['train'] = Processor().get_train_examples(data_dir)
dataset['validation'] = Processor().get_dev_examples(data_dir)
dataset['test'] = Processor().get_test_examples(data_dir)

# Class labels exposed by the processor.
class_labels = Processor().get_labels()

max_seq_l = 480

# Batch configuration. Tuning the whole language model needs more GPU memory,
# hence the smaller batches with more gradient-accumulation steps; the larger
# GPT-2 checkpoints get the same treatment.
if args.tune_plm:
    batchsize_t, batchsize_e = 4, 4
    gradient_accumulation_steps = 8
    model_parallelize = True
elif args.model_name_or_path in ("gpt2-large", "gpt2-medium"):
    batchsize_t, batchsize_e = 4, 4
    gradient_accumulation_steps = 8
    model_parallelize = False
elif args.model_name_or_path == "gpt2-xl":
    batchsize_t, batchsize_e = 2, 2
    gradient_accumulation_steps = 16
    model_parallelize = False
else:
    batchsize_t, batchsize_e = 8, 8
    gradient_accumulation_steps = 4
    model_parallelize = False


In [None]:
# Soft template: args.soft_token_num trainable soft tokens combined with the
# template text read from file. The template entry is selected with
# --template_id (default 0), mirroring how the verbalizer uses --verbalizer_id.
mytemplate = SoftTemplate(
    model=plm,
    tokenizer=tokenizer,
    num_tokens=args.soft_token_num,
    initialize_from_vocab=args.init_from_vocab).from_file(
        "experiment_scripts/soft_template/soft_template.txt",
    choice=args.template_id)

# The verbalizer connects the label space and the prompt space.
myverbalizer = ManualVerbalizer(
    tokenizer,
    classes=class_labels).from_file(
        "experiment_scripts/soft_template/manual_verbalizer.txt",
    choice=args.verbalizer_id)

# Show how the first training example looks once wrapped by the template.
wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
print(wrapped_example)

# Build the prompt model; the language model is frozen unless --tune_plm is set.
use_cuda = True
prompt_model = PromptForClassification(
    plm=plm,
    template=mytemplate,
    verbalizer=myverbalizer,
    freeze_plm=(
        not args.tune_plm),
    plm_eval_mode=args.plm_eval_mode)
if use_cuda:
    prompt_model = prompt_model.cuda()

if model_parallelize:
    prompt_model.parallelize()  # spread the model over multiple GPUs

# The training loader must exist regardless of model_parallelize. It used to
# be created only inside the branch above, which left `train_dataloader`
# undefined (NameError in the training loop) for every run without --tune_plm.
train_dataloader = PromptDataLoader(
    dataset=dataset["train"],
    template=mytemplate,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=max_seq_l,
    decoder_max_length=3,
    batch_size=batchsize_t,
    shuffle=True,
    teacher_forcing=False,
    predict_eos_token=False,
    truncate_method="tail")

test_dataloader = PromptDataLoader(
    dataset=dataset["test"],
    template=mytemplate,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=max_seq_l,
    decoder_max_length=3,
    batch_size=batchsize_e,
    shuffle=False,
    teacher_forcing=False,
    predict_eos_token=False,
    truncate_method="tail")

print(
    "truncate rate: {}".format(
        test_dataloader.tokenizer_wrapper.truncate_rate),
    flush=True)

In [None]:
# Compute accuracy, precision, recall and F1 score of a prompt model.
def evaluate(prompt_model, dataloader, desc, return_data=False):
    """Run ``prompt_model`` over ``dataloader`` and score its predictions.

    Args:
        prompt_model: model called as ``prompt_model(inputs)`` to get logits.
        dataloader: iterable of batches; each batch must support
            ``batch['label']`` (and ``.cuda()`` when the module-level
            ``use_cuda`` flag is true).
        desc: label for the evaluation run; currently unused.
        return_data: when true, also return the raw labels and predictions.

    Returns:
        ``[accuracy, precision, recall, f1]`` computed with scikit-learn's
        default arguments, or ``(that list, labels, predictions)`` when
        ``return_data`` is true.

    Side effects:
        The model is switched to eval mode for the pass and is left in
        training mode afterwards.
    """
    prompt_model.eval()
    allpreds = []
    alllabels = []

    # Inference only: disabling autograd avoids building a graph for every
    # batch, which saves memory and time during evaluation.
    with torch.no_grad():
        for inputs in dataloader:
            if use_cuda:
                inputs = inputs.cuda()
            logits = prompt_model(inputs)
            labels = inputs['label']
            alllabels.extend(labels.cpu().tolist())
            # predicted class = index of the largest logit
            allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    prompt_model.train()

    acc = accuracy_score(alllabels, allpreds)
    p = precision_score(alllabels, allpreds)
    r = recall_score(alllabels, allpreds)
    f1 = f1_score(alllabels, allpreds)
    res = [acc, p, r, f1]
    if return_data:
        return res, alllabels, allpreds
    return res


In [None]:
# Loss function for the classification logits.
loss_func = torch.nn.CrossEntropyLoss()

# Optimizer/scheduler pair 1: the language model itself, only with --tune_plm.
if args.tune_plm:
    # Two parameter groups over the language model:
    #   group 1: names without "bias" / "LayerNorm.weight" -> weight_decay 0.01
    #   group 2: names with "bias" / "LayerNorm.weight"    -> weight_decay 0.0
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters1 = [
        {'params': [p for n, p in prompt_model.plm.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in prompt_model.plm.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # AdamW with learning rate 3e-5
    optimizer1 = AdamW(optimizer_grouped_parameters1, lr=3e-5)

    # Linear schedule with 500 warm-up steps over args.max_steps training steps
    scheduler1 = get_linear_schedule_with_warmup(optimizer1, num_warmup_steps=500, num_training_steps=args.max_steps)
else:
    # The language model is not fine-tuned: nothing to optimize here.
    optimizer1 = None
    scheduler1 = None

# Optimizer/scheduler pair 2: the template parameters, excluding anything
# whose name contains "raw_embedding".
optimizer_grouped_parameters2 = [{'params': [p for name, p in prompt_model.template.named_parameters() if 'raw_embedding' not in name]}]

optimizer_name = args.optimizer.lower()
if optimizer_name == "adafactor":
    # Adafactor at the fixed prompt learning rate args.prompt_lr, with
    # relative_step, scale_parameter and warmup_init all disabled.
    optimizer2 = Adafactor(optimizer_grouped_parameters2, lr=args.prompt_lr, relative_step=False, scale_parameter=False, warmup_init=False)

    # Constant schedule after args.warmup_step_prompt warm-up steps
    scheduler2 = get_constant_schedule_with_warmup(optimizer2, num_warmup_steps=args.warmup_step_prompt)

elif optimizer_name == "adamw":
    # AdamW at the prompt learning rate args.prompt_lr
    optimizer2 = AdamW(optimizer_grouped_parameters2, lr=args.prompt_lr)

    # Linear schedule with args.warmup_step_prompt warm-up steps over
    # args.max_steps training steps
    scheduler2 = get_linear_schedule_with_warmup(optimizer2, num_warmup_steps=args.warmup_step_prompt, num_training_steps=args.max_steps)

else:
    # Fail here with a clear message instead of a NameError on `optimizer2`
    # in the middle of the training loop.
    raise ValueError(
        "Unsupported --optimizer %r; expected 'Adafactor' or 'AdamW'." % (args.optimizer,))


In [None]:
tot_loss = 0  # running sum of the per-micro-batch losses
log_loss = 0  # value of tot_loss at the last progress-bar update
best_val_acc = 0  # best validation accuracy (not updated in this cell)
val_acc_start = 0  # initial validation accuracy (not updated in this cell)
glb_step = 0  # number of optimizer updates performed
actual_step = 0  # number of micro-batches processed
leave_training = False  # set once the step budget is exhausted
tot_train_time = 0  # accumulated training time in seconds
pbar_update_freq = 10  # progress-bar refresh interval, in optimizer updates

prompt_model.train()

# Evaluate once before any training.
# NOTE(review): this (and the periodic evaluation below) runs on
# test_dataloader although it is labelled "Valid" - confirm that is intended.
val_res = evaluate(prompt_model, test_dataloader, desc="Valid")
print("before training, val_res: ", val_res)

# Progress bar counting optimizer updates up to args.max_steps.
pbar = tqdm(total=args.max_steps, desc="Train")

# Train for exactly args.max_steps optimizer updates, cycling through the
# training data as many times as needed.
while glb_step < args.max_steps:
    steps_before_epoch = actual_step
    for step, inputs in enumerate(train_dataloader):
        if use_cuda:
            inputs = inputs.cuda()

        # start timing this micro-batch
        tot_train_time -= time.time()

        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)

        # Scale the loss so that the gradients accumulated over
        # gradient_accumulation_steps micro-batches form an average.
        (loss / gradient_accumulation_steps).backward()

        tot_loss += loss.item()
        actual_step += 1

        # Gradient accumulation: parameters are updated only once every
        # gradient_accumulation_steps micro-batches. This lowers memory use
        # at the cost of more micro-batches per update.
        is_update_step = actual_step % gradient_accumulation_steps == 0
        if is_update_step:
            # clip the accumulated gradients to a maximum norm of 1.0
            torch.nn.utils.clip_grad_norm_(prompt_model.parameters(), 1.0)

            # Optimizers and schedulers advance once per update, which keeps
            # them in step with glb_step and with the max_steps horizon the
            # schedulers were built with.
            if optimizer1 is not None:
                optimizer1.step()
                optimizer1.zero_grad()
            if scheduler1 is not None:
                scheduler1.step()
            if optimizer2 is not None:
                optimizer2.step()
                optimizer2.zero_grad()
            if scheduler2 is not None:
                scheduler2.step()

            glb_step += 1

            if glb_step % pbar_update_freq == 0:
                # mean loss per micro-batch since the previous refresh
                aveloss = (tot_loss - log_loss) / (pbar_update_freq * gradient_accumulation_steps)
                pbar.update(pbar_update_freq)
                pbar.set_postfix({'loss': aveloss})
                log_loss = tot_loss

        # stop timing this micro-batch (evaluation below is not timed)
        tot_train_time += time.time()

        # Evaluate right after every args.eval_every_steps-th update.
        if is_update_step and glb_step % args.eval_every_steps == 0:
            val_res, labels, preds = evaluate(prompt_model, test_dataloader, desc="Valid", return_data=True)
            acc, p, r, f1 = val_res
            statistics = [args.dataset, args.model_name_or_path, args.seed, glb_step, acc, p, r, f1]

            print("Glb_step {}, val_acc {}, average time {}".format(glb_step, val_res, tot_train_time / actual_step), flush=True)

            # back to training mode after evaluation
            prompt_model.train()

            # Dump labels and predictions; create the target directory first
            # so a fresh checkout does not fail with FileNotFoundError.
            pkl_path = "sfs_out/task1/%s_%s_%s_%s.pkl" % (args.dataset, args.model_name_or_path, args.seed, glb_step)
            os.makedirs(os.path.dirname(pkl_path), exist_ok=True)
            with open(pkl_path, "wb") as wf:
                pkl.dump({"statistics": statistics, "labels": labels, "preds": preds}, wf)

            # Append one result row to the CSV summary.
            csv_path = "sfs_out/toxic_classification_result.csv"
            os.makedirs(os.path.dirname(csv_path), exist_ok=True)
            with open(csv_path, "a") as wf:
                wf.write("%s,%s,%s,%s,%s,%s,%s,%s\n" % (args.dataset, args.model_name_or_path, args.seed, glb_step, acc, p, r, f1))

            # Save a checkpoint of the prompt model.
            save_path = "saved_models/%s/%s/%s/" % (args.dataset, args.model, args.model_name_or_path)
            os.makedirs(save_path, exist_ok=True)
            torch.save(prompt_model.state_dict(), "%s%s.ckpt" % (save_path, glb_step))

        # Step budget reached: leave both loops.
        if glb_step >= args.max_steps:
            leave_training = True
            break

    if leave_training:
        break

    # An empty dataloader would otherwise keep this while-loop spinning forever.
    if actual_step == steps_before_epoch:
        raise RuntimeError("train_dataloader yielded no batches; cannot train.")

pbar.close()
