# mixed template的使用
mixed template是写template的一种方法
{"soft"}方法是除了mask的一种token 在训练时也需要更新 可以指定初始化值
同时{"soft"}的参数和模型的参数单独更新

In [1]:
from datasets import load_dataset,load_from_disk
import os
import jsonlines

In [2]:
data_path = '/home/wy/datasets/huggingface_datasets/'
cb_data_path = os.path.join(data_path, 'super_glue')
cb_data_path

'/home/wy/datasets/huggingface_datasets/super_glue'

In [3]:
raw_dataset = load_dataset(cb_data_path,'cb')

Reusing dataset super_glue (/home/wy/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 56
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 250
    })
})

In [5]:
from openprompt.data_utils import InputExample

In [6]:
dataset = dict()
for split in ['train', 'validation', 'test']:
    dataset[split] = list()
    for data in raw_dataset[split]:
        input_example = InputExample(text_a=data['premise'], text_b=data['hypothesis'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)

In [7]:
dataset['train'][0]

{
  "guid": 0,
  "label": 0,
  "meta": {},
  "text_a": "It was a complex language. Not written down but handed down. One might say it was peeled down.",
  "text_b": "the language was peeled down",
  "tgt_text": null
}

In [8]:
from openprompt.plms import load_plm

In [9]:
plm, tokenizer, model_config, WrapperClass = load_plm('t5', 't5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# MixedTemplate
使用自定义的template而非手动定义的template
使用{"soft"}关键词 如果指定了value值 就使用value值初始化 否则会随机初始化token
mask部分必然在训练过程中要被更新推理；而soft token也要被更新 而且是单独地更新

In [10]:
from openprompt.prompts import MixedTemplate

In [11]:
my_template1 = MixedTemplate(model=plm, tokenizer=tokenizer, text='{"placeholder":"text_a"} {"soft": "Question"} {"placeholder":"text_b"}? Is it correct? {"mask"}.')
my_template = MixedTemplate(model=plm, tokenizer=tokenizer, text='{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"} {"soft"} {"mask"}.')

In [13]:
wrapped_example = my_template.wrap_one_example(dataset['train'][0])
wrapped_example

[[{'text': 'It was a complex language. Not written down but handed down. One might say it was peeled down.',
   'soft_token_ids': 0,
   'loss_ids': 0,
   'shortenable_ids': 1},
  {'text': '', 'soft_token_ids': 1, 'loss_ids': 0, 'shortenable_ids': 0},
  {'text': '', 'soft_token_ids': 2, 'loss_ids': 0, 'shortenable_ids': 0},
  {'text': '', 'soft_token_ids': 3, 'loss_ids': 0, 'shortenable_ids': 0},
  {'text': ' the language was peeled down',
   'soft_token_ids': 0,
   'loss_ids': 0,
   'shortenable_ids': 1},
  {'text': '', 'soft_token_ids': 4, 'loss_ids': 0, 'shortenable_ids': 0},
  {'text': '<mask>', 'soft_token_ids': 0, 'loss_ids': 1, 'shortenable_ids': 0},
  {'text': '.', 'soft_token_ids': 0, 'loss_ids': 0, 'shortenable_ids': 0}],
 {'guid': 0, 'label': 0}]

In [14]:
wrapped_t5tokenizer= WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer, truncate_method="head")

In [15]:
from openprompt import  PromptDataLoader

In [16]:
train_loader = PromptDataLoader(dataset=dataset['train'], template=my_template, tokenizer=tokenizer,
                                tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
                                batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False, truncate_method="head")

tokenizing: 250it [00:00, 814.87it/s]


In [17]:
from openprompt.prompts import ManualVerbalizer
import torch

In [18]:
my_verbalizer = ManualVerbalizer(tokenizer=tokenizer, num_classes=3, label_words=[["yes"], ["no"], ["maybe"]])
# 数据集中的idx 0->蕴含(True) 1->矛盾(False) 2->中立(Maybe) 所以要与数据集的idx对应 就用"yes", "no", "maybe"进行处理

In [19]:
from openprompt import PromptForClassification
use_cuda = True
prompt_model = PromptForClassification(plm=plm, template=my_template, verbalizer=my_verbalizer, freeze_plm=False)

In [20]:
if use_cuda:
    prompt_model.cuda(device="cuda:0")

In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [22]:
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# bias和layernorm weight的weight decay为0是一个不错的trick

In [26]:
optimizer_grouped_parameters1 = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# 更新template的参数
由于template中使用了{"soft"} 需要更新参数

In [27]:
optimizer_grouped_parameters2 = [
    {'params': [p for n,p in prompt_model.template.named_parameters() if "raw_embedding" not in n]}
]

In [28]:
optimizer1 = AdamW(optimizer_grouped_parameters1, lr=1e-4)
optimizer2 = AdamW(optimizer_grouped_parameters2, lr=1e-3)



In [29]:
for epoch in range(10):
    total_loss = 0
    for step, inputs in enumerate(train_loader):
        # print(inputs)
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        total_loss = total_loss + loss.item()
        optimizer1.step()
        optimizer1.zero_grad()
        optimizer2.step()
        optimizer2.zero_grad()
        if step %100 ==1:
            print("Epoch {}, average loss: {}".format(epoch, total_loss/(step+1)), flush=True)


Epoch 0, average loss: 1.2346927225589752
Epoch 1, average loss: 0.061103944666683674
Epoch 2, average loss: 0.0021116542629897594
Epoch 3, average loss: 0.009901425335556269
Epoch 4, average loss: 0.002038600061496254
Epoch 5, average loss: 0.00015941643505357206
Epoch 6, average loss: 0.000683567437590682
Epoch 7, average loss: 0.000541612840606831
Epoch 8, average loss: 4.772607462655287e-05
Epoch 9, average loss: 0.00011519958388817031


In [30]:
validation_dataloader = PromptDataLoader(dataset=dataset['validation'], template=my_template, tokenizer=tokenizer,
                                         tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
                                         batch_size=4, shuffle=False, teacher_forcing=False, predict_eos_token=False,
                                         truncate_method="head")

tokenizing: 56it [00:00, 507.43it/s]


In [31]:
all_preds = list()
all_labels = list()

In [32]:
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    lables = inputs['label']
    all_labels.extend(lables.cpu().tolist())
    all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

In [33]:
acc = sum([int(i==j) for i,j in zip(all_preds, all_labels)])/len(all_preds)
acc

0.8928571428571429