# SoftVerbalizer的使用
SoftVerbalizer api不同于ManualVerbalizer方法 其是可以更新的映射关系的一种
可以使用label_words进行初始化 也可以不初始化
最终注意要更新Verbalizer部分的参数

In [44]:
from openprompt.data_utils.text_classification_dataset import AgnewsProcessor

In [46]:
data_path = '/home/wy/OpenPrompt/datasets/TextClassification/agnews'
data_path

'/home/wy/OpenPrompt/datasets/TextClassification/agnews'

In [47]:
dataset = dict()
dataset['train'] = AgnewsProcessor().get_train_examples(data_dir=data_path)
# ag_news数据集输入一段文本(text_a:题目 text_b:内容) 输出为新闻的分类 0->world 1->sports 2->business 3->sci/tech

In [48]:
len(dataset['train'])

120000

In [49]:
dataset['train'][0]

{
  "guid": "0",
  "label": 2,
  "meta": {},
  "text_a": "Wall St. Bears Claw Back Into the Black (Reuters)",
  "text_b": "Reuters - Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again.",
  "tgt_text": null
}

In [50]:
from openprompt.data_utils.data_sampler import FewShotSampler

In [51]:
sampler = FewShotSampler(num_examples_per_label=16, num_examples_per_label_dev=16, also_sample_dev=True)
dataset['train'], dataset['validation'] = sampler(dataset['train'])

In [52]:
len(dataset['train'])

64

In [53]:
len(dataset['validation'])

64

In [54]:
dataset['test'] = AgnewsProcessor().get_test_examples(data_dir=data_path)

In [55]:
len(dataset['test'])

7600

In [56]:
from openprompt.plms import load_plm

In [57]:
plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [58]:
from openprompt.prompts import ManualTemplate

In [59]:
my_template = ManualTemplate(tokenizer=tokenizer, text='{"placeholder":"text_a"} {"placeholder":"text_b"} this is a kind of {"mask"} news.')

In [60]:
wrapped_example = my_template.wrap_one_example(dataset['train'][0])
wrapped_example

[[{'text': 'Japanese Leader Reshuffles Cabinet (AP)',
   'loss_ids': 0,
   'shortenable_ids': 1},
  {'text': ' AP - Prime Minister Junichiro Koizumi replaced key ministers and ruling party leaders in a shuffle Monday aimed at solidifying his power and building momentum for his troubled reforms program.',
   'loss_ids': 0,
   'shortenable_ids': 1},
  {'text': ' this is a kind of', 'loss_ids': 0, 'shortenable_ids': 0},
  {'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0},
  {'text': ' news.', 'loss_ids': 0, 'shortenable_ids': 0}],
 {'guid': '40948', 'label': 0}]

In [61]:
from openprompt import  PromptDataLoader

In [62]:
train_loader = PromptDataLoader(dataset=dataset['train'], template=my_template, tokenizer=tokenizer,
                                tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
                                batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False, truncate_method="head")

tokenizing: 64it [00:00, 818.14it/s]


# 自定义verbalizer
在新闻文本分类任务中，输入的目标需要自定义 即 0->politics 1->sports 2->business 3->sci/tech
可以提供label_words也可以不提供
注意verbalizer部分的参数也需要更新

In [63]:
from openprompt.prompts import SoftVerbalizer
import torch

In [64]:
my_verbalizer = SoftVerbalizer(tokenizer=tokenizer, model=plm, num_classes=4, label_words=[["politics", "world"], ['sports'], ["business"], ["technology", "scientific"]])

Label word for a class is a list, only use the first word.
Label word for a class is a list, only use the first word.
Label word for a class is a list, only use the first word.
Label word for a class is a list, only use the first word.


In [65]:
from openprompt import PromptForClassification
use_cuda = True
prompt_model = PromptForClassification(plm=plm, template=my_template, verbalizer=my_verbalizer, freeze_plm=False)

In [66]:
if use_cuda:
    prompt_model.cuda(device="cuda:0")

In [67]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [68]:
loss_func = torch.nn.CrossEntropyLoss()
no_decay = ['bias', 'LayerNorm.weight']
# bias和layernorm weight的weight decay为0是一个不错的trick

In [69]:
optimizer_grouped_parameters1 = [
    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [70]:
optimizer_grouped_parameters2 = [
    {'params': prompt_model.verbalizer.group_parameters_1, "lr":3e-5},
    {'params': prompt_model.verbalizer.group_parameters_2, "lr":3e-4}
]

In [71]:
optimizer1 = AdamW(optimizer_grouped_parameters1, lr=3e-5)
optimizer2 = AdamW(optimizer_grouped_parameters2)



In [72]:
for epoch in range(5):
    tot_loss = 0
    for step, inputs in enumerate(train_loader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer1.step()
        optimizer1.zero_grad()
        optimizer2.step()
        optimizer2.zero_grad()
        print(tot_loss/(step+1))

16.560626983642578
27.742273330688477
22.962292671203613
21.80924916267395
17.44739933013916
15.008693099021912
13.449524777276176
14.033050328493118
12.563024255964491
12.50866894721985
11.63405028256503
10.664546092351278
9.844750748696523
9.144311500879537
8.534802080357137
8.015780330788402
5.521378993988037
3.0807735323905945
2.6750882466634116
2.0063161924481387
1.6588308870792385
2.0093927433093386
1.7230449751950798
1.5076643532956948
1.388514939850817
1.2496634458657352
1.1368506084704262
1.7088437795561429
2.0403250840922387
1.8966455596299576
1.7702025223212938
2.0603637814929243
0.0
0.0
5.015214749922355e-05
0.00013770697842119262
0.00011889690504176542
0.15037505621997602
0.6275872230128569
0.5491388201362497
0.6485014676735672
0.5842461179381644
0.7499025798824732
0.6875236996390109
0.9157829439069386
0.8503698764850144
0.7936785513860135
0.7440736419243876
0.0
2.9802318834981634e-08
1.9868212556654424e-08
2.5331928377170243e-07
2.0861589113962964e-07
1.7384657594969136e-

In [73]:
validation_dataloader = PromptDataLoader(dataset=dataset['validation'], template=my_template, tokenizer=tokenizer,
                                         tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
                                         batch_size=4, shuffle=False, teacher_forcing=False, predict_eos_token=False,
                                         truncate_method="head")

tokenizing: 64it [00:00, 774.04it/s]


In [74]:
prompt_model.eval()

PromptForClassification(
  (prompt_model): PromptModel(
    (plm): T5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(in_features=768, out_features=768, bias=False)
                  (k): Linear(in_features=768, out_features=768, bias=False)
                  (v): Linear(in_features=768, out_features=768, bias=False)
                  (o): Linear(in_features=768, out_features=768, bias=False)
                  (relative_attention_bias): Embedding(32, 12)
                )
                (layer_norm): T5LayerNorm()
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (1): T5LayerFF(
                (DenseReluDense): T5DenseReluDense(
                  (wi): Linear(in_

In [75]:
val_all_preds = list()
val_all_labels = list()

In [76]:
for step, inputs in enumerate(validation_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    val_all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    # [batch, (1)]
    val_all_labels.extend(labels.cpu().tolist())
    # [batch, (1)]

In [77]:
val_acc = sum([int(i==j) for i,j in zip(val_all_preds, val_all_labels)])/len(val_all_preds)
val_acc

0.84375

In [78]:
test_dataloader = PromptDataLoader(dataset=dataset["test"], template=my_template, tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
    batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head")

tokenizing: 7600it [00:08, 874.64it/s]


In [79]:
test_all_preds = list()
test_all_labels = list()

In [80]:
for step, inputs in enumerate(test_dataloader):
    if use_cuda:
        inputs = inputs.cuda()
    logits = prompt_model(inputs)
    labels = inputs['label']
    test_all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    # [batch, (1)]
    test_all_labels.extend(labels.cpu().tolist())
    # [batch, (1)]

In [81]:
test_acc = sum([int(i==j) for i,j in zip(test_all_preds, test_all_labels)])/len(test_all_preds)
test_acc

0.8189473684210526