# Final - T5_base

In [1]:
!nvidia-smi

Fri Apr 28 20:49:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.08    Driver Version: 510.73.08    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:61:00.0  On |                  N/A |
| 56%   36C    P8    20W / 350W |     10MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
dataset = ["""{
    "content": "世锦赛的整体水平远高于亚洲杯，要如同亚洲杯那样“鱼与熊掌兼得”，就需要各方面密切配合、#idiom#。作为主帅的俞觉敏，除了得打破保守思想，敢于破格用人，还得巧于用兵、#idiom#、灵活排阵，指挥得当，力争通过比赛推新人、出佳绩、出新的战斗力。",
    "realCount": 2,
    "groundTruth": ["通力合作", "有的放矢"],
    "candidates": [
        ["凭空捏造", "高头大马", "通力合作", "同舟共济", "和衷共济", "蓬头垢面", "紧锣密鼓"],
        ["叫苦连天", "量体裁衣", "金榜题名", "百战不殆", "知彼知己", "有的放矢", "风流才子"]
    ]
}"""]
s = "世锦赛的整体水平远高于亚洲杯，要如同亚洲杯那样“鱼与熊掌兼得”，就需要各方面密切配合、通力合作。作为主帅的俞觉敏，除了得打破保守思想，敢于破格用人，还得巧于用兵、</s>、灵活排阵，指挥得当，力争通过比赛推新人、出佳绩、出新的战斗力。可选成语有：叫苦连天 | 量体裁衣 | 金榜题名 | 百战不殆 | 知彼知己 | 风流才子，请选出最佳的一项。"

In [3]:
# !pip install transformers
# !pip install sentencepiece

In [4]:
import json
import pandas as pd
import os
import torch
from torch import cuda, nn, optim
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np

# from google.colab import drive
# drive.mount('/content/drive/')
path = './'

  from .autonotebook import tqdm as notebook_tqdm


# Load model

In [5]:
tokenizer = AutoTokenizer.from_pretrained("mxmax/Chinese_Chat_T5_Base")
model = AutoModelForSeq2SeqLM.from_pretrained("mxmax/Chinese_Chat_T5_Base") 

device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)


Downloading (…)okenizer_config.json: 100%|██████████| 2.36k/2.36k [00:00<00:00, 370kB/s]
Downloading spiece.model: 100%|██████████| 742k/742k [00:02<00:00, 336kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 1.22MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 767/767 [00:00<00:00, 93.9kB/s]
Downloading pytorch_model.bin: 100%|██████████| 990M/990M [08:40<00:00, 1.90MB/s] 


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

# Fine-tuning model 

## Data preprocessing

In [6]:
def prompt1(text, candidates, choice):
    input_text = "请从（）里选择出最合适的成语: " + text
    for i in range(len(choice)):
        candidates_str = '|'.join([c for c in candidates[i]])
        input_text = input_text.replace('#idiom#', "（" + candidates_str + "）", 1) 
    return input_text

In [7]:
def prompt2(text, candidates, choice):
    input_text = text.replace('#idiom#', "_")
    input_text = "请依次从（）选择出最合适的成语填入_: " + input_text
    for i in range(len(choice)):
        candidates_str = '|'.join([c for c in candidates[i]])
        input_text = input_text.replace('（）', "（" + candidates_str + "）（）", 1)
    input_text = input_text.replace("（）", "")
    return input_text

In [8]:
def prompt3(text, candidates, choice):
    ex = '''选择：[[“凭空捏造“, “高头大马“, “通力合作“, “同舟共济“, “和衷共济“, “蓬头垢面“, “紧锣密鼓“],[“叫苦连天“, “量体裁衣“, “金榜题名“, “百战不殆“, “知彼知己“, “有的放矢“, “风流才子“]]\n输入：“世锦赛的整体水平远高于亚洲杯，要如同亚洲杯那样“鱼与熊掌兼得“，就需要各方面密切配合、#idiom#。作为主帅的俞觉敏，除了得打破保守思想，敢于破格用人，还得巧于用兵、#idiom#、灵活排阵，指挥得当，力争通过比赛推新人、出佳绩、出新的战斗力。”\n输出：通力合作, 有的放矢\n'''
    input_text = ex
    input_text = input_text + f"选择：{candidates}\n输入：{text}\n输出："
    return input_text

In [9]:
def preprocess_data(data, prompt):
    input_texts = []
    labels = []

    for example in data:
        example = json.loads(example)
        
        input_text = example['content']
        ground_truth = example['groundTruth']
        candidates = example['candidates']
        
        input_text = prompt(input_text, candidates, ground_truth)
        input_texts.append(input_text)
        labels.append('、'.join(ground_truth))

    inputs = tokenizer(text=input_texts, return_token_type_ids=False)
    labels = tokenizer(labels, return_token_type_ids=False)
    return inputs, labels

In [10]:
# Load the Chinese Idioms dataset
train_data_file = path+'data/train_20000.txt'
val_data_file = path+'data/dev_3000.txt'



with open(train_data_file, encoding='utf-8', errors='ignore') as f:
    train_data = f.readlines()

with open(val_data_file, encoding='utf-8', errors='ignore') as f:
    val_data = f.readlines()

train_inputs, train_labels = preprocess_data(train_data, prompt2)
val_inputs, val_labels = preprocess_data(val_data, prompt2)

# train_inputs = preprocess_data(train_data)
# val_inputs = preprocess_data(val_data)


In [11]:
print(len(train_inputs))
print(train_inputs[0])
print(train_inputs[0].ids)
print(train_inputs[0].type_ids)
print(train_inputs[0].tokens)

# print(train_inputs[0].offsets)
# print(train_inputs[0].attention_mask)
# print(train_inputs[0].special_tokens_mask)
# print(train_inputs[0].overflowing)

# print("---------------------------------")



2
Encoding(num_tokens=151, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[12, 480, 9434, 57, 21, 25846, 291, 891, 4800, 1013, 2778, 146, 5684, 4800, 503, 237, 30846, 1954, 4800, 1676, 1313, 2903, 2330, 4800, 4453, 10765, 4800, 2050, 151, 36, 1744, 4800, 241, 38, 1546, 2862, 19, 367, 100, 134, 6064, 10581, 4034, 291, 3336, 20, 12, 3336, 10, 2936, 3441, 346, 991, 4115, 11793, 9, 5488, 24574, 94, 6, 1580, 1195, 31561, 5349, 6523, 145, 203, 364, 10, 222, 76, 1368, 9, 169, 123, 3132, 9, 70, 1335, 5488, 12963, 2847, 6, 107, 175, 683, 512, 3762, 38, 13028, 10, 5517, 9, 70, 8164, 7043, 10, 241, 362, 21258, 501, 4659, 43, 13758, 15779, 9, 295, 634, 43, 18, 23562, 1102, 2257, 9, 5406, 3733, 35, 7344, 27860, 5151, 6, 12549, 2810, 1193, 6014, 1003, 80, 22, 19188, 21236, 9, 14, 1010, 3145, 932, 596, 1359, 10328, 113, 2337, 2625, 27, 29527, 5635, 716, 1639, 5635, 1204, 69, 6, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
class IdiomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]
        # target_ids = self.inputs['input_ids'][idx]

        target_ids = self.labels['input_ids'][idx]
        target_attention_mask = self.labels['attention_mask'][idx]
        return {"input_ids": input_ids, "attention_mask":attention_mask, "label_ids":target_ids}


def collate_fn(batch):
    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_label = [torch.LongTensor(example['label_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_label = pad_sequence(batch_label, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=0)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "labels": padded_batch_label}

def to_device(data, device):
    new_data = {}
    for k in data:
        new_data[k] = data[k].to(device)
    return new_data

In [13]:
train_dataset = IdiomDataset(train_inputs, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, collate_fn=collate_fn, shuffle=True)

val_dataset = IdiomDataset(val_inputs, val_labels)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=collate_fn, shuffle=False)


## Train model

In [14]:
def train(model:nn.Module, train_loader:DataLoader, optimizer:optim.Optimizer, log_step=100):
    model.train()
    epoch_loss = 0.0
    log_loss = 0.0
    for idx, batch in enumerate(train_loader):
        # print("idx:", idx, ", log_step: ", log_step)
        optimizer.zero_grad()
        batch = to_device(batch, device)
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        log_loss += loss.item()

        wandb.log({"batch": idx, "train loss": loss.item()})
        wandb.log({"batch": idx, "acc train loss": log_loss})
        
        if (idx+1) % log_step == 0:
            print(f"Train Step: {idx} Loss: {log_loss / log_step}")
            log_loss = 0.0
    return epoch_loss / len(train_loader)
        

@torch.no_grad()
def evaluate(model:nn.Module, eval_loader:DataLoader):
    eval_loss = 0.0
    correct = 0
    total = 0
    model.eval()
    print("eval_loader len:", len(eval_loader))
    for batch in eval_loader:
        batch = to_device(batch, device)
        output = model(**batch)
        loss = output.loss
        eval_loss += loss.item()
        pred = output.logits.argmax(-1)
        label = batch["labels"]
        correct += torch.where(label!=0, pred==label, 0).sum().item()
        total += torch.sum(label!=0).item()

    eval_acc = correct / total
    eval_loss = eval_loss / len(eval_loader) 
    print(total, correct)
    return eval_acc, eval_loss

In [15]:
# !pip3 install wandb
import wandb
wandb.login()
wandb.init(
    # set the wandb project where this run will be logged
    project="zootopia",
    
    # track hyperparameters and run metadata
    config={
        "epochs": 5,
        # "learning_rate": lr,
    }
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[34m[1mwandb[0m: Currently logged in as: [33msherryw000701[0m ([33mzootopia[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
epoches = 5
optimizer = optim.Adam(model.parameters(), lr=2e-5)
model.train()

for epoch in range(1, epoches+1):
    print(f"Training Epoch {epoch}")
    train_loss = train(model, train_loader, optimizer)
    print(f"Epoch {epoch} Training Loss: {train_loss}")
    eval_acc, eval_loss = evaluate(model, val_loader)
    # wandb.log({"epoch": epoch, "Eval Acc:": eval_acc, "Eval Loss:": eval_loss})
    print(f"Epoch {epoch} Eval Acc: {eval_acc}; Eval Loss: {eval_loss}")
wandb.finish()
torch.save(model.state_dict(), path+"T5_base_prompt2_ckpt.pt")

Training Epoch 1
Train Step: 99 Loss: 4.878033311367035
Train Step: 199 Loss: 1.4283921557664871
Train Step: 299 Loss: 0.7630102127790451
Train Step: 399 Loss: 0.6469499254226685
Train Step: 499 Loss: 0.5818793734908104
Train Step: 599 Loss: 0.5220415878295899
Train Step: 699 Loss: 0.5061475309729576
Train Step: 799 Loss: 0.4799381259083748
Train Step: 899 Loss: 0.4746051618456841
Train Step: 999 Loss: 0.4387000244855881
Train Step: 1099 Loss: 0.45523271188139913
Train Step: 1199 Loss: 0.43313272431492805
Train Step: 1299 Loss: 0.42965507090091704
Train Step: 1399 Loss: 0.4259946548938751
Train Step: 1499 Loss: 0.41849071711301805
Train Step: 1599 Loss: 0.3899089935421944
Train Step: 1699 Loss: 0.40661712050437926
Train Step: 1799 Loss: 0.39001445546746255
Train Step: 1899 Loss: 0.39182354673743247
Train Step: 1999 Loss: 0.37811650335788727
Train Step: 2099 Loss: 0.3912125040590763
Train Step: 2199 Loss: 0.36547838494181634
Train Step: 2299 Loss: 0.386495693475008
Train Step: 2399 Loss

0,1
acc train loss,██▃▂▅▄▂▁▃▃▅▄▂▂▄▂▁▃▃▁▄▂▂▄▃▂▁▃▂▁▃▂▁▃▂▂▃▃▂▁
batch,▁▂▃▄▅▆▇█▁▂▃▄▅▆▇▇▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█
train loss,█▄▃▂▃▄▂▂▂▂▂▂▁▂▁▂▂▂▂▁▂▂▁▁▁▁▂▁▂▁▂▁▁▁▁▁▂▁▁▂

0,1
acc train loss,18.62672
batch,2499.0
train loss,0.1988


## Evaluation

In [17]:
@torch.no_grad()
def fill_idiom(model, loader):

    all_preds = []
    all_labels = []
    model.eval()
    for batch in loader:
        batch = to_device(batch, device)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, return_dict_in_generate=True, pad_token_id=0, max_length=512, top_k=15)
        truncated_outputs = []

        decode_texts = tokenizer.batch_decode([l[l != 0] for l in outputs["sequences"]])
        gold_texts = tokenizer.batch_decode([l[l != 0] for l in labels])

        for gold, decode in zip(gold_texts, decode_texts):
            l = set(gold.replace(" ", "").replace("</s>", "").split("、"))
            p = set(decode.replace(" ", "").replace("</s>", "").split("、"))
            all_labels.append(l)
            all_preds.append(p)
        # print(decode_texts)
        # print(gold_texts)
        # break
    
    return all_preds, all_labels

def f1_score(sys, gold):
    tp = 0
    t = 0
    p = 0
    for s, g in zip(sys, gold):
        t += len(g)
        p += len(s)
        tp += len(g & s)
    precision = tp / p if p != 0 else 0
    recall = tp / t if t != 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    return precision, recall, f1, tp

In [18]:
# validation set
# model.load_state_dict(torch.load(path+"T5_base_prompt2_ckpt.pt", map_location=device))
# sys, gold = fill_idiom(model, val_loader)
# p, r, f1, tp = f1_score(sys, gold)

# print(f"Accuracy for Validation set is {f1}")
# print(f"Accuracy for Validation set is {tp}")

In [19]:
# Load the Chinese Idioms dataset For Test set
test_data_file = path+'data/test_3000.txt'

with open(test_data_file, encoding='utf-8', errors='ignore') as f:
    test_data = f.readlines()

test_inputs, test_labels  = preprocess_data(test_data,prompt2)
test_dataset = IdiomDataset(test_inputs, test_labels)
test_loader = DataLoader(test_dataset, batch_size=256, collate_fn=collate_fn, shuffle=True)

In [20]:
# test set
model.load_state_dict(torch.load(path+"T5_base_prompt2_ckpt.pt", map_location=device))
sys, gold = fill_idiom(model, test_loader)
p, r, f1, tp = f1_score(sys, gold)

print((f"F1 score for Test set is {f1}"))
print(f"Accuracy for Test set is {tp}")

F1 score for Test set is 0.5811868507186566
Accuracy for Test set is 2042


In [21]:
print(tokenizer.decode([12]))
tokenizer.decode([12, 31200, 2057, 1498, 1278, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])




'嗤恶痛绝</s><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

# Prompt engineering

## Load model

In [22]:
tokenizer = AutoTokenizer.from_pretrained("mxmax/Chinese_Chat_T5_Base")
model = AutoModelForSeq2SeqLM.from_pretrained("mxmax/Chinese_Chat_T5_Base") 

device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

## Data processing

In [23]:
def prompt1(text, candidates, choice):
    input_text = "请从（）里选择出最合适的成语: " + text
    for i in range(len(choice)):
        candidates_str = '|'.join([c for c in candidates[i]])
        input_text = input_text.replace('#idiom#', "（" + candidates_str + "）", 1) 
    return input_text

In [24]:
def prompt2(text, candidates, choice):
    input_text = "请依次从（）选择出最合适的成语填入#idiom#: " + text
    for i in range(len(choice)):
        candidates_str = '|'.join([c for c in candidates[i]])
        input_text = input_text.replace('（）', "（" + candidates_str + "）（）", 1)
    input_text = input_text.replace("（）", "")
    return input_text

In [25]:
def prompt3(text, candidates, choice):
    ex = '''选择：[[“凭空捏造“, “高头大马“, “通力合作“, “同舟共济“, “和衷共济“, “蓬头垢面“, “紧锣密鼓“],[“叫苦连天“, “量体裁衣“, “金榜题名“, “百战不殆“, “知彼知己“, “有的放矢“, “风流才子“]]\n输入：“世锦赛的整体水平远高于亚洲杯，要如同亚洲杯那样“鱼与熊掌兼得“，就需要各方面密切配合、#idiom#。作为主帅的俞觉敏，除了得打破保守思想，敢于破格用人，还得巧于用兵、#idiom#、灵活排阵，指挥得当，力争通过比赛推新人、出佳绩、出新的战斗力。”\n输出：通力合作, 有的放矢\n'''
    input_text = ex
    input_text = input_text + f"选择：{candidates}\n输入：{text}\n输出："
    return input_text

In [26]:
def preprocess_data(data, prompt):
    input_texts = []
    labels = []

    for example in data:
        example = json.loads(example)
        
        input_text = example['content']
        ground_truth = example['groundTruth']
        candidates = example['candidates']
        
        input_text = prompt(input_text, candidates, ground_truth)
        input_texts.append(input_text)
        labels.append('、'.join(ground_truth))

    inputs = tokenizer(text=input_texts, return_token_type_ids=False)
    labels = tokenizer(labels, return_token_type_ids=False)
    return inputs, labels

In [27]:
# Load the Chinese Idioms dataset For Test set
test_data_file = path+'data/test_3000.txt'
with open(test_data_file, encoding='utf-8', errors='ignore') as f:
    test_data = f.readlines()

In [28]:
test_inputs1, test_labels1 = preprocess_data(test_data, prompt1)
test_dataset1 = IdiomDataset(test_inputs1, test_labels1)
test_loader1 = DataLoader(test_dataset1, batch_size=8, collate_fn=collate_fn, shuffle=True)

In [29]:
test_inputs2, test_labels2 = preprocess_data(test_data, prompt2)
test_dataset2 = IdiomDataset(test_inputs2, test_labels2)
test_loader2 = DataLoader(test_dataset2, batch_size=8, collate_fn=collate_fn, shuffle=True)

In [30]:
test_inputs3, test_labels3  = preprocess_data(test_data, prompt3)
test_dataset3 = IdiomDataset(test_inputs3, test_labels3)
test_loader3 = DataLoader(test_dataset3, batch_size=8, collate_fn=collate_fn, shuffle=True)

In [31]:
print(test_inputs2[0].tokens)
print(test_inputs3[0].tokens)

['▁', '请', '依次', '从', '(', '旷', '日', '持久', '|', '公正', '廉洁', '|', '苦', '口', '婆', '心', '|', '现身', '说法', '|', '白', '日', '做梦', '|', '深入浅出', '|', '肺', '腑', '之', '言', ')', '选择', '出', '最', '合适的', '成语', '填', '入', '#', 'id', 'io', 'm', '#', ':', '▁', '只要', '路过', '的', '旅客', '稍有', '迟', '疑', ',', '或者', '对他们', '的宣传', '单', '多', '看', '几', '眼', ',', '基本上', '这个', '旅客', '就别', '想', '轻松', '脱', '身', '了', ',', '记者', '就在', '9', '月', '3', '日', '接', '站', '时', '目睹', '了', '这样', '一幕', ':', '一个', '学生', '接', '过', '招生', '人员', '递', '来', '的宣传', '单', ',', '只是', '问', '了一下', '“', '你们', '学校', '有没有', '分数', '要求', '?”', '两个', '招生', '人员', '就', '“', '白话', '”', '开了', ',', '一个', '表示', '分数', '都', '好', '说', ',', '只要有', '好', '学', '的精神', ';', '另一个', '则', '#', 'id', 'io', 'm', '#', ',', '大', '讲', '自己', '选择', '的专业', '现在', '收获', '颇', '丰', ';', '最后', '在', '招生', '人员', '“', '我们学校', '毕业后', '可以', '完全', '解决', '就业', '”', '的', '忽悠', '下', ',', '这个', '学生', '旅客', '被', '他们', '拉', '上了', '到', '校', '参观', '的', '班车', '。', '</s>']
['▁', '选择', ':', '[', 

### use the pre-trianed model as based model

In [32]:
# prompt engineering for test set (prompt1)
# model.load_state_dict(torch.load(path+"T5_base_prompt2_ckpt.pt.pt", map_location=device))
sys1, gold1 = fill_idiom(model, test_loader1)
p1, r1, f11, tp1 = f1_score(sys1, gold1)
print((f"F1 score for Test set is {f11}"))
print(f"Accuracy for Test set is {tp1}")

# prompt engineering for test set (prompt2)
sys2, gold2 = fill_idiom(model, test_loader2)
p2, r2, f12, tp2 = f1_score(sys2, gold2)
print((f"F1 score for Test set is {f12}"))
print(f"Accuracy for Test set is {tp2}")

# prompt engineering for test set (prompt3)
sys3, gold3 = fill_idiom(model, test_loader3)
p3, r3, f13, tp3 = f1_score(sys3, gold3)
print((f"F1 score for Test set is {f13}"))
print(f"Accuracy for Test set is {tp3}")

F1 score for Test set is 0.03068306343601607
Accuracy for Test set is 126
F1 score for Test set is 0.08442812982998456
Accuracy for Test set is 437
F1 score for Test set is 0
Accuracy for Test set is 0


### use the fine-tune model as based model

In [33]:
model.load_state_dict(torch.load(path+"T5_base_prompt2_ckpt.pt", map_location=device))
# prompt engineering for test set (prompt1)
sys1, gold1 = fill_idiom(model, test_loader1)
p1, r1, f11, tp1 = f1_score(sys1, gold1)
print((f"F1 score for Test set is {f11}"))
print(f"Accuracy for Test set is {tp1}")

# prompt engineering for test set (prompt2)
sys2, gold2 = fill_idiom(model, test_loader2)
p2, r2, f12, tp2 = f1_score(sys2, gold2)
print((f"F1 score for Test set is {f12}"))
print(f"Accuracy for Test set is {tp2}")

# prompt engineering for test set (prompt3)
sys3, gold3 = fill_idiom(model, test_loader3)
p3, r3, f13, tp3 = f1_score(sys3, gold3)
print((f"F1 score for Test set is {f13}"))
print(f"Accuracy for Test set is {tp3}")

F1 score for Test set is 0.3640873015873016
Accuracy for Test set is 1468
F1 score for Test set is 0.4637722419928826
Accuracy for Test set is 1629
F1 score for Test set is 0.001328374070138151
Accuracy for Test set is 5


In [34]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
# import torch
# from torch import cuda
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# tokenizer = AutoTokenizer.from_pretrained("mxmax/Chinese_Chat_T5_Base")
# model = AutoModelForSeq2SeqLM.from_pretrained("mxmax/Chinese_Chat_T5_Base") 
# device = 'cuda' if cuda.is_available() else 'cpu'
# model.to(device)
def postprocess(text):
    return text.replace(".", "").replace('</>','')

def answer_fn(text, top_k=50):
    encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=256, return_tensors="pt").to(device) 
    out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, max_length=512,temperature=0.5,do_sample=True,repetition_penalty=3.0 ,top_k=top_k)
    result = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
    return postprocess(result[0])

x1 = """世锦赛的整体水平远高于亚洲杯，要如同亚洲杯那样“鱼与熊掌兼得”，就需要各方面密切配合、（凭空捏造|高头大马|通力合作|同舟共济|和衷共济|蓬头垢面|紧锣密鼓）。作为主帅的俞觉敏，除了得打破保守思想，敢于破格用人，还得巧于用兵、(叫苦连天|量体裁衣|金榜题名|百战不殆|知彼知己|风流才子)、
灵活排阵，指挥得当，力争通过比赛推新人、出佳绩、出新的战斗力。"""

# y1 = ["高头大马", "叫苦连天"]

result=answer_fn(x1, top_k=50)
print("模型生成:",result)
print('*'*100)


模型生成: 通力合作、紧锣密鼓
****************************************************************************************************
