# openprompt基本使用
1. plm,tokenizer和Wrapper的加载 分别代表预训练模型, tokenizer和包装类 --- api:load_plm
2. 构建template 即prompt learning中的template 填空或者生成 --- api ManualTemplate
3. 构建dataset 导入dataset后使用template,tokenizer和Wrapper进行dataloader构建 --- api PromptDataLoader
4. 构建Verbalizer 是word和label的映射关系 即将模型推理的结果(生成的work token)和label(分类或者token)进行匹配 --- api ManualVerbalizer
5. 构建openprompt pipeline 使用plm, template, verbalizer就可以构建pipeline进行任务 --- api PromptForXXX
6. 模型训练

In [93]:
from datasets import load_dataset,load_from_disk
import os
import jsonlines

# 加载数据
1. load_dataset加载huggingface_datasets
2. 使用openprompt的InputExample api将输入表示为InputExample对象

In [12]:
data_path = '/home/wy/datasets/huggingface_datasets/'
cb_data_path = os.path.join(data_path, 'super_glue')
cb_data_path

'/home/wy/datasets/huggingface_datasets/super_glue'

In [21]:
raw_dataset = load_dataset(cb_data_path,'cb')

Reusing dataset super_glue (/home/wy/.cache/huggingface/datasets/super_glue/cb/1.0.2/d040c658e2ddef6934fdd97deb45c777b6ff50c524781ea434e7219b56a428a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 56
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 250
    })
})

In [7]:
from openprompt.data_utils import InputExample

In [24]:
dataset = dict()
for split in ['train', 'validation', 'test']:
    dataset[split] = list()
    for data in raw_dataset[split]:
        input_example = InputExample(text_a=data['premise'], text_b=data['hypothesis'], label=int(data['label']), guid=data['idx'])
        dataset[split].append(input_example)

In [25]:
dataset['train'][0]

{
  "guid": 0,
  "label": 0,
  "meta": {},
  "text_a": "It was a complex language. Not written down but handed down. One might say it was peeled down.",
  "text_b": "the language was peeled down",
  "tgt_text": null
}

# 加载Pre-trained Language Model
使用load_plm api加载预训练模型
返回值有
1. model 预训练语言模型PLM
2. tokenizer tokenizer将word转为int表示
3. model_config 模型描述
4. WrapperClass 包装类

In [26]:
from openprompt.plms import load_plm

In [28]:
plm, tokenizer, model_config, WrapperClass = load_plm('t5', 't5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [36]:
tokenizer

PreTrainedTokenizer(name_or_path='t5-base', vocab_size=32100, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_

In [37]:
model_config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translatio

In [38]:
WrapperClass

openprompt.plms.seq2seq.T5TokenizerWrapper

# 构建template
1. 使用template将文本输入构建为自定义或预定义模板(需要模板文本和tokenizer,模板文本的使用类似于"placeholder":"key") ManualTemplate api
2. 实例化一个TokenizerWrapper 将模板化后的输入进行tokenize(需要输入序列长度，输出序列长度, tokenizer) TokenizerWrapper大类
3. 最终将输入文本模板化为prompt希望的方式 之后tokenize为plm的输入

In [29]:
from openprompt.prompts import  ManualTemplate

In [40]:
template_text = '{"placeholder":"text_a"} Question: {"placeholder":"text_b"}? Is it correct? {"mask"}.'
my_template = ManualTemplate(tokenizer=tokenizer, text=template_text)

In [41]:
wrapped_example = my_template.wrap_one_example(dataset['train'][0])
wrapped_example

[[{'text': 'It was a complex language. Not written down but handed down. One might say it was peeled down.',
   'loss_ids': 0,
   'shortenable_ids': 1},
  {'text': ' Question:', 'loss_ids': 0, 'shortenable_ids': 0},
  {'text': ' the language was peeled down',
   'loss_ids': 0,
   'shortenable_ids': 1},
  {'text': '? Is it correct?', 'loss_ids': 0, 'shortenable_ids': 0},
  {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0},
  {'text': '.', 'loss_ids': 0, 'shortenable_ids': 0}],
 {'guid': 0, 'label': 0}]

In [42]:
from openprompt.plms import T5TokenizerWrapper

In [51]:
wrapped_t5tokenizer = T5TokenizerWrapper(max_seq_length=128, tokenizer=tokenizer, truncate_method='head', decoder_max_length=3)
# decoder_max_length=3的原因是 本任务是分类任务 仅需要向decoder输入<sos> <extra_id_0> <eos>即可

In [52]:
tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)

In [53]:
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))

{'input_ids': [94, 47, 3, 9, 1561, 1612, 5, 933, 1545, 323, 68, 14014, 323, 5, 555, 429, 497, 34, 47, 158, 400, 26, 323, 5, 11860, 10, 8, 1612, 47, 158, 400, 26, 323, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'decoder_input_ids': [0, 32099, 0], 'loss_ids': [0, 1, 0]}
['▁It', '▁was', '▁', 'a', '▁complex', '▁language', '.', '▁Not', '▁written

In [54]:
model_inputs = {}
for split in ['train', 'validation', 'test']:
    model_inputs[split] = list()
    for sample in dataset[split]:
        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example=my_template.wrap_one_example(sample),teacher_forcing=False)
        model_inputs[split].append(tokenized_example)

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


# 构建数据集
PromptDataLoader对象提供了一种torch dataloader风格的api
提供template tokenizer之后就自动将输入转化为了prompt需要的输入

In [55]:
from openprompt import  PromptDataLoader

In [56]:
train_loader = PromptDataLoader(dataset=dataset['train'], template=my_template, tokenizer=tokenizer,
                                tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
                                batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False, truncate_method="head")

tokenizing: 250it [00:00, 701.81it/s]


# 构建Verbalizer
在分类任务中 plm输出是token 需要将token映射到分类目标上
Verbalizer对象的作用是定义label_words和label的关系 即哪些word代表了哪些class

In [57]:
from openprompt.prompts import ManualVerbalizer
import torch

In [66]:
my_verbalizer = ManualVerbalizer(tokenizer=tokenizer, num_classes=3, label_words=[["yes"], ["no"], ["maybe"]])
# 数据集中的idx 0->蕴含(True) 1->矛盾(False) 2->中立(Maybe) 所以要与数据集的idx对应 就用"yes", "no", "maybe"进行处理

In [67]:
print(my_verbalizer.label_words_ids)

Parameter containing:
tensor([[[4273]],

        [[ 150]],

        [[2087]]])


In [90]:
logits = torch.randn(2, len(tokenizer))
logits.shape

torch.Size([2, 32100])

In [69]:
my_verbalizer.process_logits(logits)

tensor([[-1.1131, -1.0481, -1.1368],
        [-0.6659, -2.6968, -0.8704]])

# prompt learning
1. 使用下载的plm，自定义的template，自定义的verbalizer进行初始化openprompt对象
2. 初始化优化器 损失函数等基本元素
3. 进行训练

In [70]:
from openprompt import PromptForClassification
use_cuda = True
prompt_model = PromptForClassification(plm=plm, template=my_template, verbalizer=my_verbalizer, freeze_plm=False)

In [80]:
if use_cuda:
    prompt_model.cuda(device="cuda:0")

In [72]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [73]:
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# bias和layernorm weight的weight decay为0是一个不错的trick

In [74]:
optimizer_grouped_parameters = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [75]:
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)



In [87]:
for step, inputs in enumerate(train_loader):
    print(inputs)
    break

{"input_ids": [[71, 10, 328, 43, 12, 21, 1038, 1668, 5, 272, 10, 11475, 5, 71, 10, 299, 6, 27, 3382, 34, 31, 7, 1842, 12, 3615, 223, 11, 7444, 145, 34, 261, 12, 36, 6, 3, 76, 107, 6, 250, 3, 76, 107, 6, 13, 7827, 1107, 139, 762, 5, 272, 10, 412, 107, 18, 28848, 5, 11475, 6, 27, 278, 31, 17, 317, 14569, 223, 11, 7444, 19, 24, 600, 3, 9, 1154, 5, 11860, 10, 14569, 223, 11, 7444, 19, 24, 600, 3, 9, 1154, 3, 58, 27, 7, 34, 2024, 58, 32099, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [71, 10, 11, 258, 728, 79, 103, 129, 8160, 6, 79, 278, 31, 17, 43, 8, 579

In [92]:
for epoch in range(10):
    total_loss = 0
    for step, inputs in enumerate(train_loader):
        # print(inputs)
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        total_loss = total_loss + loss.item()
        optimizer.step()
        optimizer.zero_grad()
        if step %100 ==1:
            print("Epoch {}, average loss: {}".format(epoch, total_loss/(step+1)), flush=True)


Epoch 0, average loss: 0.0002695450384635478
Epoch 1, average loss: 7.067079150147038e-05
Epoch 2, average loss: 3.0098863135208376e-05
Epoch 3, average loss: 4.559551780403126e-05
Epoch 4, average loss: 3.117186588497134e-05
Epoch 5, average loss: 3.692354084705585e-05
Epoch 6, average loss: 8.30306344141718e-05
Epoch 7, average loss: 7.490108782803873e-05
Epoch 8, average loss: 4.3490618736541364e-05
Epoch 9, average loss: 7.101133451214992e-05


In [82]:
validation_dataloader = PromptDataLoader(dataset=dataset['validation'], template=my_template, tokenizer=tokenizer,
                                         tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
                                         batch_size=4, shuffle=False, teacher_forcing=False, predict_eos_token=False,
                                         truncate_method="head")

tokenizing: 56it [00:00, 650.39it/s]


In [83]:
all_preds = list()
all_labels = list()

In [84]:
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    lables = inputs['label']
    all_labels.extend(lables.cpu().tolist())
    all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

In [85]:
all_preds[0]

1

In [86]:
all_labels[0]

1

In [88]:
acc = sum([int(i==j) for i,j in zip(all_preds, all_labels)])/len(all_preds)
acc

0.9285714285714286