In [1]:
!pip install --upgrade paddlenlp

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import os
import paddle
import paddlenlp

In [5]:
# 自定义数据集
import re

from paddlenlp.datasets import load_dataset

# Clean up invalid characters
def clean_text(text):
    """Normalise line-break artefacts inside a raw sentence.

    A literal backslash-n that is immediately followed by a real newline is
    replaced by ".", then every remaining carriage return / newline character
    is removed.

    Args:
        text: the raw sentence string.

    Returns:
        The cleaned string.
    """
    # This substitution has to run BEFORE the newlines are stripped: its
    # pattern contains a real newline, so it could never match afterwards.
    text = re.sub(r"\\n\n", ".", text)
    return text.replace("\r", "").replace("\n", "")

# Reader function for the labelled dataset files
def read_custom_data(filepath):
    """Yield one example dict per data row of a labelled TSV file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on tabs: column 0 is the argument id,
    column 1 the sentence, and all remaining columns are parsed as floats
    and collected into the label vector.

    Args:
        filepath: path of the TSV file to read.

    Yields:
        dict with the keys "Argument ID" (str), "sentence" (str, passed
        through clean_text) and "labels" (list of float).
    """
    # `with` guarantees the handle is closed even when the generator is
    # abandoned before exhaustion or a row fails to parse.
    with open(filepath) as f:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration, which would surface as RuntimeError in a generator.
        next(f, None)
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. trailing newlines) carry no example.
                continue
            data = stripped.split('\t')
            labels = [float(d) for d in data[2:]]
            yield {"Argument ID": data[0], "sentence":clean_text(data[1]),"labels": labels}

def read_custom_data_test(filepath):
    """Yield one example dict per data row of an unlabelled (test) TSV file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on tabs: column 0 is the argument id and
    column 1 the sentence. No labels are read; "labels" is always an empty
    list so the examples have the same keys as the labelled reader's.

    Args:
        filepath: path of the TSV file to read.

    Yields:
        dict with the keys "Argument ID" (str), "sentence" (str, passed
        through clean_text) and "labels" (empty list).
    """
    # `with` guarantees the handle is closed even when the generator is
    # abandoned before exhaustion or a row fails to parse.
    with open(filepath) as f:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration, which would surface as RuntimeError in a generator.
        next(f, None)
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. trailing newlines) carry no example.
                continue
            data = stripped.split('\t')
            yield {"Argument ID": data[0], "sentence":clean_text(data[1]),"labels":[]}

In [6]:
# Work from the data directory so that the relative *.tsv file names used
# when loading the datasets resolve against it.
%cd /home/aistudio/data

/home/aistudio/data


In [7]:
# Build the datasets with load_dataset().
# With lazy=False each dataset is returned as a MapDataset.
# The three labelled splits share one reader; the test split uses the reader
# that yields empty label lists.
train_ds, valid_ds, validZhihu_ds = (
    load_dataset(read_custom_data, filepath=tsv_name, lazy=False)
    for tsv_name in ('train.tsv', 'validation.tsv', 'zhihu_validation.tsv')
)
test_ds = load_dataset(read_custom_data_test, filepath='test.tsv', lazy=False)

In [8]:
# Load the pretrained model and the matching tokenizer.
# NOTE: the checkpoint selected below is the English ERNIE 2.0 large model
# ("ernie-2.0-large-en"), not a Chinese ERNIE 3.0 model.
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "ernie-2.0-large-en"   # ERNIE 2.0 model
num_classes = 20  # size of the classification head: one output per label (20 labels)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=num_classes)
tokenizer = AutoTokenizer.from_pretrained(model_name)

[2022-12-29 14:07:11,962] [    INFO] - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-2.0-large-en'.
[2022-12-29 14:07:11,966] [    INFO] - Model config ErnieConfig {
  "attention_probs_dropout_prob": 0.1,
  "enable_recompute": false,
  "fuse": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "

In [9]:
import functools
import numpy as np

from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# Preprocessing function: uses the tokenizer to turn text into integer sequences
def preprocess_function(examples, tokenizer, max_seq_length):
    """Tokenise one example and attach its labels to the encoded features.

    Args:
        examples: mapping providing a "sentence" entry (text to encode) and a
            "labels" entry (copied through unchanged).
        tokenizer: callable invoked as ``tokenizer(text=..., max_seq_len=...)``.
        max_seq_length: value forwarded to the tokenizer as ``max_seq_len``.

    Returns:
        The object returned by the tokenizer, with an added "labels" entry.
    """
    encoded = tokenizer(text=examples["sentence"], max_seq_len=max_seq_length)
    encoded["labels"] = examples["labels"]
    return encoded

# Bind the tokenizer and the maximum sequence length, then tokenise every split.
trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128)
train_ds, valid_ds, validZhihu_ds, test_ds = (
    split.map(trans_func) for split in (train_ds, valid_ds, validZhihu_ds, test_ds)
)

# Build the collate_fn: pads sequences of different lengths to the longest
# one in the batch, then stacks the data.
collate_fn = DataCollatorWithPadding(tokenizer)

# Define the BatchSamplers (batch size and whether to shuffle); only the
# training split is shuffled.
train_batch_sampler, valid_batch_sampler, validZhihu_batch_sampler, test_batch_sampler = (
    BatchSampler(split, batch_size=size, shuffle=shuffled)
    for split, size, shuffled in (
        (train_ds, 32, True),
        (valid_ds, 16, False),
        (validZhihu_ds, 16, False),
        (test_ds, 16, False),
    )
)

# Wrap every split and its sampler in a DataLoader sharing the same collator.
train_data_loader, valid_data_loader, validZhihu_data_loader, test_data_loader = (
    DataLoader(dataset=split, batch_sampler=sampler, collate_fn=collate_fn)
    for split, sampler in (
        (train_ds, train_batch_sampler),
        (valid_ds, valid_batch_sampler),
        (validZhihu_ds, validZhihu_batch_sampler),
        (test_ds, test_batch_sampler),
    )
)

In [10]:
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from paddle.metric import Metric

# Custom MultiLabelReport evaluation metric
class MultiLabelReport(Metric):
    """
    F1 score, precision and recall for a multi-label text classification task.

    Scores and labels passed to ``update`` are accumulated until ``reset`` is
    called. ``accumulate`` binarises the accumulated scores with one fixed
    threshold (no threshold search is performed) and computes the metrics
    with scikit-learn.

    Args:
        name: value returned by ``name()``.
        average: averaging mode forwarded to the scikit-learn metric functions.
        threshold: a label counts as predicted when its score is strictly
            greater than this value. Defaults to 0.
    """

    def __init__(self, name='MultiLabelReport', average='macro', threshold=0):
        super(MultiLabelReport, self).__init__()
        self.average = average
        self.threshold = threshold
        self._name = name
        self.reset()

    def f1_score(self, y_prob):
        """
        Binarise ``y_prob`` with ``self.threshold`` and score it against the
        accumulated ``self.y_true``.

        The binarised predictions are also stored on ``self.y_pred``.

        Returns:
            Tuple ``(f1, precision, recall)``.
        """
        self.y_pred = y_prob > self.threshold
        # f1_score is resolved through the sklearn module on purpose: a
        # notebook-level variable named `f1_score` would shadow a bare import.
        score = sklearn.metrics.f1_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        precison = precision_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        recall = recall_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        return score, precison, recall

    def reset(self):
        """
        Resets all of the metric state.
        """
        self.y_prob = None
        self.y_true = None
        self.y_pred = None

    def update(self, probs, labels):
        """
        Append one batch of scores and labels to the accumulated state.

        Both arguments must provide a ``.numpy()`` method; the resulting
        arrays are concatenated along axis 0.
        """
        batch_prob = probs.numpy()
        batch_true = labels.numpy()
        if self.y_prob is not None:
            self.y_prob = np.concatenate((self.y_prob, batch_prob), axis=0)
        else:
            self.y_prob = batch_prob
        if self.y_true is not None:
            self.y_true = np.concatenate((self.y_true, batch_true), axis=0)
        else:
            self.y_true = batch_true

    def accumulate(self):
        """
        Compute the metrics over everything accumulated since the last reset.

        Returns:
            Tuple ``(f1, precision, recall)``; ``(0.0, 0.0, 0.0)`` when
            nothing has been accumulated yet.
        """
        if self.y_prob is None or self.y_true is None:
            # Nothing to score; avoid comparing None against the threshold.
            return 0.0, 0.0, 0.0
        f1, precison, recall = self.f1_score(y_prob=self.y_prob)
        return f1, precison, recall

    def name(self):
        """
        Returns metric name
        """
        return self._name

In [11]:
import numpy as np
def multilabel_categorical_crossentropy(y_true, y_pred):
    """Cross-entropy for multi-label classification.

    Notes: y_true and y_pred must have the same shape, and the elements of
    y_true are either 0 or 1 (1 marks a target class, 0 a non-target class).

    Warning: y_pred must range over all real numbers, i.e. it should normally
    NOT go through an activation function, and in particular not through
    sigmoid or softmax. At prediction time, output the classes whose y_pred
    is greater than 0.

    Example: with 10 classes, the label [0,1,0,0,0,0,0,0,0,1] marks a sample
    that belongs to both the 2nd and the 10th class; the model output has the
    same 10 dimensions.

    A zero score is appended along the last axis to act as the threshold:
    target classes are pushed above it and non-target classes below it.

    Returns:
        The sum of the negative-class and positive-class log-sum-exp terms,
        reduced over the last axis.
    """
    # Flip the sign of the target-class scores so both terms use logsumexp.
    signed_scores = (1 - 2 * y_true) * y_pred
    # Push the entries that do not belong to a term far down (-1e12).
    non_target_scores = signed_scores - y_true * 1e12
    target_scores = signed_scores - (1 - y_true) * 1e12

    # The extra zero column is the threshold score.
    threshold_col = paddle.zeros_like(signed_scores[..., :1])

    neg_loss = paddle.logsumexp(
        paddle.concat((non_target_scores, threshold_col), axis=-1), axis=-1)
    pos_loss = paddle.logsumexp(
        paddle.concat((target_scores, threshold_col), axis=-1), axis=-1)
    return neg_loss + pos_loss

In [12]:
import time
import paddle.nn.functional as F

# AdamW optimizer, multi-label cross-entropy loss function and the custom
# MultiLabelReport evaluation metric
optimizer = paddle.optimizer.AdamW(
    learning_rate=4e-5,
    parameters=model.parameters(),
    weight_decay=0.01,
)
criterion = multilabel_categorical_crossentropy
metric = MultiLabelReport()

In [13]:
# Label names; the position in this list is the class index used when
# decoding predictions.
label_vocab = [
    "Self-direction: thought",
    "Self-direction: action",
    "Stimulation",
    "Hedonism",
    "Achievement",
    "Power: dominance",
    "Power: resources",
    "Face",
    "Security: personal",
    "Security: societal",
    "Tradition",
    "Conformity: rules",
    "Conformity: interpersonal",
    "Humility",
    "Benevolence: caring",
    "Benevolence: dependability",
    "Universalism: concern",
    "Universalism: nature",
    "Universalism: tolerance",
    "Universalism: objectivity",
]

In [14]:
import paddle
import numpy as np
import paddle.nn.functional as F

# Evaluation function for the validation sets
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    """Run the model over ``data_loader`` and report loss, F1, precision and recall.

    The model is put in eval mode for the pass and switched back to train
    mode afterwards. ``metric`` is reset before and after the pass. A summary
    line is printed.

    Args:
        model: called as ``model(input_ids, token_type_ids)``; returns the scores.
        criterion: called as ``criterion(labels, logits)``.
        metric: object providing ``reset``, ``update`` and ``accumulate``.
        data_loader: iterable of batches with the keys 'input_ids',
            'token_type_ids' and 'labels'.
        label_vocab: label names indexed by class position.
        if_return_results: selects the return value (see below).

    Returns:
        If ``if_return_results`` is true: one string per example holding the
        comma-joined names of all labels whose score is greater than 0.
        Otherwise: the F1 score reported by ``metric.accumulate()``.
    """
    model.eval()
    metric.reset()
    batch_losses = []
    decoded = []
    for batch in data_loader:
        input_ids = batch['input_ids']
        token_type_ids = batch['token_type_ids']
        labels = batch['labels']
        logits = model(input_ids, token_type_ids)
        batch_losses.append(criterion(labels, logits).mean().numpy())
        # The raw scores are used directly (no sigmoid); 0 is the threshold.
        metric.update(logits, labels)
        if not if_return_results:
            continue
        for scores in logits.tolist():
            decoded.append(','.join(
                label_vocab[idx] for idx, score in enumerate(scores) if score > 0))

    f1, precison, recall = metric.accumulate()
    print("eval loss: %.5f, f1 score: %.5f, precison: %.5f, recall: %.5f" %
          (np.mean(batch_losses), f1, precison, recall))
    model.train()
    metric.reset()
    return decoded if if_return_results else f1

In [15]:
# Work from the model directory so that the relative checkpoint folder names
# used by the training loop resolve against it.
%cd /home/aistudio/model

/home/aistudio/model


In [16]:
epochs = 20 # number of training epochs
ckpt_dir = "ernie2.0_ckpt" # folder for the checkpoint with the best F1 on valid_data_loader
save_dir2 = "ernie2.0_ckpt_zhihu" # folder for the checkpoint with the best F1 on validZhihu_data_loader
log_every = 10   # print a training log line every this many steps
eval_every = 40  # evaluate (and possibly checkpoint) every this many steps

global_step = 0  # number of optimisation steps performed so far
tic_train = time.time()
best_f1_score = 0
best_f1_score2 = 0

# Model training
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # Forward pass and loss. The raw scores are fed to the metric as-is
        # (no sigmoid); 0 is the decision threshold.
        logits = model(input_ids, token_type_ids)
        loss = criterion(labels, logits).mean()
        probs = logits
        metric.update(probs, labels)

        # Every `log_every` steps, print the loss, training F1 and speed.
        global_step += 1
        if global_step % log_every == 0:
            # The F1 is only computed when it is printed, because it rescans
            # everything accumulated in the metric. evaluate() resets the
            # shared metric, so this covers the steps since the last evaluation.
            # A dedicated name keeps the imported sklearn `f1_score` intact.
            train_f1, _, _ = metric.accumulate()
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, f1 score: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, train_f1,
                    log_every / (time.time() - tic_train)))
            tic_train = time.time()

        # Backpropagate and update the parameters
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # Every `eval_every` steps, evaluate on both validation sets and save
        # the model parameters and tokenizer files whenever a set's best F1
        # improves (one checkpoint folder per validation set).
        if global_step % eval_every == 0:
            save_dir = ckpt_dir
            os.makedirs(save_dir, exist_ok=True)
            eval_f1_score = evaluate(model, criterion, metric, valid_data_loader, label_vocab, if_return_results=False)
            eval_f1_score2 = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab, if_return_results=False)
            if eval_f1_score > best_f1_score:
                best_f1_score = eval_f1_score
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
            if eval_f1_score2 > best_f1_score2:
                best_f1_score2 = eval_f1_score2
                model.save_pretrained(save_dir2)
                tokenizer.save_pretrained(save_dir2)

global step 10, epoch: 1, batch: 10, loss: 4.10663, f1 score: 0.21181, speed: 1.63 step/s
global step 20, epoch: 1, batch: 20, loss: 4.07766, f1 score: 0.19203, speed: 1.72 step/s
global step 30, epoch: 1, batch: 30, loss: 4.06239, f1 score: 0.17248, speed: 1.53 step/s
global step 40, epoch: 1, batch: 40, loss: 3.94547, f1 score: 0.16890, speed: 1.61 step/s
eval loss: 3.91192, f1 score: 0.10681, precison: 0.23412, recall: 0.11292
eval loss: 3.69470, f1 score: 0.06602, precison: 0.07659, recall: 0.07246


[2022-12-29 11:16:36,317] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:16:39,506] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:16:39,509] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:16:39,515] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:16:42,619] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:16:42,623] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 50, epoch: 1, batch: 50, loss: 3.82056, f1 score: 0.22150, speed: 0.42 step/s
global step 60, epoch: 1, batch: 60, loss: 3.73289, f1 score: 0.23713, speed: 1.67 step/s
global step 70, epoch: 1, batch: 70, loss: 3.80777, f1 score: 0.26480, speed: 1.54 step/s
global step 80, epoch: 1, batch: 80, loss: 3.58494, f1 score: 0.27662, speed: 1.81 step/s
eval loss: 3.68114, f1 score: 0.26435, precison: 0.42329, recall: 0.22789
eval loss: 3.47785, f1 score: 0.18896, precison: 0.24290, recall: 0.19614


[2022-12-29 11:17:18,011] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:17:34,000] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:17:34,004] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:17:34,009] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:17:46,169] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:17:46,173] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 90, epoch: 1, batch: 90, loss: 3.74324, f1 score: 0.29425, speed: 0.22 step/s
global step 100, epoch: 1, batch: 100, loss: 3.58679, f1 score: 0.30519, speed: 1.65 step/s
global step 110, epoch: 1, batch: 110, loss: 3.37050, f1 score: 0.31331, speed: 1.62 step/s
global step 120, epoch: 1, batch: 120, loss: 3.63459, f1 score: 0.31910, speed: 1.72 step/s
eval loss: 3.61151, f1 score: 0.31005, precison: 0.54439, recall: 0.26087
eval loss: 3.40661, f1 score: 0.23042, precison: 0.30289, recall: 0.24682


[2022-12-29 11:18:21,655] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:18:34,808] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:18:34,890] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:18:35,039] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:18:49,928] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:18:49,932] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 130, epoch: 1, batch: 130, loss: 3.74028, f1 score: 0.36366, speed: 0.22 step/s
global step 140, epoch: 1, batch: 140, loss: 3.46223, f1 score: 0.36220, speed: 1.64 step/s
global step 150, epoch: 1, batch: 150, loss: 3.43084, f1 score: 0.37515, speed: 1.53 step/s
global step 160, epoch: 1, batch: 160, loss: 3.59865, f1 score: 0.37822, speed: 1.66 step/s
eval loss: 3.58387, f1 score: 0.34142, precison: 0.57111, recall: 0.28338
eval loss: 3.44472, f1 score: 0.22652, precison: 0.29378, recall: 0.22960


[2022-12-29 11:19:26,160] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:19:38,059] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:19:38,062] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 170, epoch: 2, batch: 1, loss: 3.39209, f1 score: 0.38175, speed: 0.34 step/s
global step 180, epoch: 2, batch: 11, loss: 3.42437, f1 score: 0.40102, speed: 1.58 step/s
global step 190, epoch: 2, batch: 21, loss: 3.32769, f1 score: 0.42292, speed: 1.74 step/s
global step 200, epoch: 2, batch: 31, loss: 3.31444, f1 score: 0.42379, speed: 1.61 step/s
eval loss: 3.53636, f1 score: 0.35086, precison: 0.59108, recall: 0.29830
eval loss: 3.32879, f1 score: 0.21791, precison: 0.25955, recall: 0.21178


[2022-12-29 11:20:13,969] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:20:28,571] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:20:28,655] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 210, epoch: 2, batch: 41, loss: 3.30145, f1 score: 0.45610, speed: 0.31 step/s
global step 220, epoch: 2, batch: 51, loss: 3.26393, f1 score: 0.45691, speed: 1.64 step/s
global step 230, epoch: 2, batch: 61, loss: 3.26927, f1 score: 0.47441, speed: 1.72 step/s
global step 240, epoch: 2, batch: 71, loss: 3.24475, f1 score: 0.47133, speed: 1.65 step/s
eval loss: 3.55279, f1 score: 0.34391, precison: 0.63858, recall: 0.28220
eval loss: 3.32988, f1 score: 0.22984, precison: 0.25984, recall: 0.22239
global step 250, epoch: 2, batch: 81, loss: 3.28239, f1 score: 0.46843, speed: 0.56 step/s
global step 260, epoch: 2, batch: 91, loss: 3.38605, f1 score: 0.46053, speed: 1.64 step/s
global step 270, epoch: 2, batch: 101, loss: 3.25487, f1 score: 0.46809, speed: 1.63 step/s
global step 280, epoch: 2, batch: 111, loss: 3.41354, f1 score: 0.46623, speed: 1.52 step/s
eval loss: 3.52249, f1 score: 0.35630, precison: 0.57311, recall: 0.29760
eval loss: 3.27932, f1 score: 0.24117

[2022-12-29 11:21:41,040] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:21:52,960] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:21:52,964] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:21:52,969] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:22:05,728] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:22:05,731] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 290, epoch: 2, batch: 121, loss: 3.25182, f1 score: 0.44148, speed: 0.24 step/s
global step 300, epoch: 2, batch: 131, loss: 3.34432, f1 score: 0.47066, speed: 1.69 step/s
global step 310, epoch: 2, batch: 141, loss: 3.14139, f1 score: 0.46936, speed: 1.63 step/s
global step 320, epoch: 2, batch: 151, loss: 3.41236, f1 score: 0.46819, speed: 1.66 step/s
eval loss: 3.50289, f1 score: 0.37304, precison: 0.55918, recall: 0.31240
eval loss: 3.29145, f1 score: 0.24562, precison: 0.26543, recall: 0.25643


[2022-12-29 11:22:41,218] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:22:53,315] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:22:53,319] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:22:53,324] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:23:05,357] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:23:05,361] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 330, epoch: 2, batch: 161, loss: 3.32557, f1 score: 0.48373, speed: 0.24 step/s
global step 340, epoch: 3, batch: 2, loss: 3.07756, f1 score: 0.50958, speed: 1.58 step/s
global step 350, epoch: 3, batch: 12, loss: 2.88560, f1 score: 0.52097, speed: 1.78 step/s
global step 360, epoch: 3, batch: 22, loss: 3.35589, f1 score: 0.52934, speed: 1.70 step/s
eval loss: 3.49279, f1 score: 0.38646, precison: 0.57101, recall: 0.33731
eval loss: 3.36636, f1 score: 0.27316, precison: 0.26696, recall: 0.29261


[2022-12-29 11:23:41,227] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:23:53,258] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:23:53,261] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:23:53,267] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:24:05,343] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:24:05,346] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 370, epoch: 3, batch: 32, loss: 3.06634, f1 score: 0.57185, speed: 0.24 step/s
global step 380, epoch: 3, batch: 42, loss: 2.92001, f1 score: 0.55952, speed: 1.65 step/s
global step 390, epoch: 3, batch: 52, loss: 3.09460, f1 score: 0.56114, speed: 1.69 step/s
global step 400, epoch: 3, batch: 62, loss: 3.19478, f1 score: 0.56589, speed: 1.62 step/s
eval loss: 3.53920, f1 score: 0.38484, precison: 0.59443, recall: 0.33223
eval loss: 3.28543, f1 score: 0.27413, precison: 0.28424, recall: 0.30275


[2022-12-29 11:24:41,216] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:24:53,225] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:24:53,228] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 410, epoch: 3, batch: 72, loss: 2.86062, f1 score: 0.55426, speed: 0.33 step/s
global step 420, epoch: 3, batch: 82, loss: 2.94631, f1 score: 0.56009, speed: 1.67 step/s
global step 430, epoch: 3, batch: 92, loss: 2.91413, f1 score: 0.55659, speed: 1.55 step/s
global step 440, epoch: 3, batch: 102, loss: 3.28858, f1 score: 0.55620, speed: 1.76 step/s
eval loss: 3.53899, f1 score: 0.40394, precison: 0.61883, recall: 0.33932
eval loss: 3.41231, f1 score: 0.27738, precison: 0.28513, recall: 0.30259


[2022-12-29 11:25:29,612] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:25:41,747] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:25:41,750] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:25:41,755] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:25:53,808] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:25:53,811] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 450, epoch: 3, batch: 112, loss: 2.85530, f1 score: 0.53837, speed: 0.24 step/s
global step 460, epoch: 3, batch: 122, loss: 2.95775, f1 score: 0.54486, speed: 1.62 step/s
global step 470, epoch: 3, batch: 132, loss: 2.84000, f1 score: 0.55160, speed: 1.65 step/s
global step 480, epoch: 3, batch: 142, loss: 2.91449, f1 score: 0.55662, speed: 1.56 step/s
eval loss: 3.53916, f1 score: 0.40542, precison: 0.59738, recall: 0.34766
eval loss: 3.36845, f1 score: 0.27820, precison: 0.30936, recall: 0.29785


[2022-12-29 11:26:29,463] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:26:41,478] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:26:41,481] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:26:41,487] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:26:53,711] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:26:53,714] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 490, epoch: 3, batch: 152, loss: 2.96441, f1 score: 0.53566, speed: 0.24 step/s
global step 500, epoch: 3, batch: 162, loss: 3.12547, f1 score: 0.54230, speed: 1.64 step/s
global step 510, epoch: 4, batch: 3, loss: 2.94903, f1 score: 0.54588, speed: 1.87 step/s
global step 520, epoch: 4, batch: 13, loss: 2.83676, f1 score: 0.58563, speed: 1.53 step/s
eval loss: 3.57097, f1 score: 0.45381, precison: 0.55110, recall: 0.41208
eval loss: 3.28349, f1 score: 0.33727, precison: 0.36510, recall: 0.36480


[2022-12-29 11:27:29,641] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:27:43,025] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:27:43,095] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json
[2022-12-29 11:27:43,233] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:27:55,173] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:27:55,177] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 530, epoch: 4, batch: 23, loss: 2.97219, f1 score: 0.65159, speed: 0.23 step/s
global step 540, epoch: 4, batch: 33, loss: 2.52479, f1 score: 0.65269, speed: 1.69 step/s
global step 550, epoch: 4, batch: 43, loss: 2.91261, f1 score: 0.64124, speed: 1.72 step/s
global step 560, epoch: 4, batch: 53, loss: 2.61222, f1 score: 0.63546, speed: 1.58 step/s
eval loss: 3.57695, f1 score: 0.44555, precison: 0.57609, recall: 0.39516
eval loss: 3.28545, f1 score: 0.34214, precison: 0.33663, recall: 0.36806


[2022-12-29 11:28:30,481] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:28:44,548] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:28:44,629] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 570, epoch: 4, batch: 63, loss: 2.74849, f1 score: 0.64587, speed: 0.31 step/s
global step 580, epoch: 4, batch: 73, loss: 2.91046, f1 score: 0.63517, speed: 1.71 step/s
global step 590, epoch: 4, batch: 83, loss: 2.51367, f1 score: 0.64195, speed: 1.63 step/s
global step 600, epoch: 4, batch: 93, loss: 2.63432, f1 score: 0.64232, speed: 1.66 step/s
eval loss: 3.64839, f1 score: 0.42212, precison: 0.59919, recall: 0.36252
eval loss: 3.44445, f1 score: 0.30499, precison: 0.31580, recall: 0.33516
global step 610, epoch: 4, batch: 103, loss: 2.95672, f1 score: 0.61929, speed: 0.55 step/s
global step 620, epoch: 4, batch: 113, loss: 2.71923, f1 score: 0.63795, speed: 1.76 step/s
global step 630, epoch: 4, batch: 123, loss: 3.11496, f1 score: 0.63265, speed: 1.78 step/s
global step 640, epoch: 4, batch: 133, loss: 2.54275, f1 score: 0.62257, speed: 1.72 step/s
eval loss: 3.65717, f1 score: 0.42960, precison: 0.56714, recall: 0.38490
eval loss: 3.42747, f1 score: 0.366

[2022-12-29 11:29:55,973] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:30:07,910] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:30:07,913] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 650, epoch: 4, batch: 143, loss: 2.74396, f1 score: 0.59712, speed: 0.33 step/s
global step 660, epoch: 4, batch: 153, loss: 2.81148, f1 score: 0.63334, speed: 1.67 step/s
global step 670, epoch: 4, batch: 163, loss: 2.40235, f1 score: 0.62504, speed: 1.74 step/s
global step 680, epoch: 5, batch: 4, loss: 2.36740, f1 score: 0.63585, speed: 1.80 step/s
eval loss: 3.70600, f1 score: 0.44188, precison: 0.59140, recall: 0.38229
eval loss: 3.40700, f1 score: 0.35074, precison: 0.40971, recall: 0.36437
global step 690, epoch: 5, batch: 14, loss: 2.56684, f1 score: 0.69799, speed: 0.57 step/s
global step 700, epoch: 5, batch: 24, loss: 2.60512, f1 score: 0.69892, speed: 1.53 step/s
global step 710, epoch: 5, batch: 34, loss: 2.28034, f1 score: 0.70241, speed: 1.67 step/s
global step 720, epoch: 5, batch: 44, loss: 2.42489, f1 score: 0.70359, speed: 1.59 step/s
eval loss: 3.72019, f1 score: 0.44775, precison: 0.58497, recall: 0.40264
eval loss: 3.52435, f1 score: 0.34283

[2022-12-29 11:33:08,018] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 11:33:19,888] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 11:33:19,892] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 850, epoch: 6, batch: 5, loss: 2.09816, f1 score: 0.72687, speed: 0.34 step/s
global step 860, epoch: 6, batch: 15, loss: 2.01062, f1 score: 0.73261, speed: 1.67 step/s
global step 870, epoch: 6, batch: 25, loss: 2.11164, f1 score: 0.74504, speed: 1.63 step/s
global step 880, epoch: 6, batch: 35, loss: 2.13907, f1 score: 0.74744, speed: 1.54 step/s
eval loss: 3.94304, f1 score: 0.45414, precison: 0.54619, recall: 0.41846
eval loss: 3.76684, f1 score: 0.37933, precison: 0.35281, recall: 0.43824


[2022-12-29 11:33:55,814] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:34:07,690] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:34:07,694] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 890, epoch: 6, batch: 45, loss: 2.37319, f1 score: 0.75855, speed: 0.34 step/s
global step 900, epoch: 6, batch: 55, loss: 2.15354, f1 score: 0.76278, speed: 1.64 step/s
global step 910, epoch: 6, batch: 65, loss: 1.94582, f1 score: 0.75853, speed: 1.53 step/s
global step 920, epoch: 6, batch: 75, loss: 2.08132, f1 score: 0.75755, speed: 1.66 step/s
eval loss: 3.98186, f1 score: 0.43696, precison: 0.55007, recall: 0.39378
eval loss: 3.67943, f1 score: 0.35867, precison: 0.34253, recall: 0.41830
global step 930, epoch: 6, batch: 85, loss: 2.34456, f1 score: 0.75767, speed: 0.56 step/s
global step 940, epoch: 6, batch: 95, loss: 2.20435, f1 score: 0.75558, speed: 1.53 step/s
global step 950, epoch: 6, batch: 105, loss: 1.70536, f1 score: 0.76145, speed: 1.54 step/s
global step 960, epoch: 6, batch: 115, loss: 2.10169, f1 score: 0.75604, speed: 1.68 step/s
eval loss: 4.07300, f1 score: 0.44969, precison: 0.55665, recall: 0.40540
eval loss: 3.93687, f1 score: 0.37695

[2022-12-29 11:36:33,703] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:36:45,587] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:36:45,591] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 1050, epoch: 7, batch: 36, loss: 1.93574, f1 score: 0.81284, speed: 0.33 step/s
global step 1060, epoch: 7, batch: 46, loss: 1.72673, f1 score: 0.82815, speed: 1.69 step/s
global step 1070, epoch: 7, batch: 56, loss: 1.96732, f1 score: 0.82687, speed: 1.67 step/s
global step 1080, epoch: 7, batch: 66, loss: 1.88227, f1 score: 0.82175, speed: 1.65 step/s
eval loss: 4.30330, f1 score: 0.46673, precison: 0.55485, recall: 0.43791
eval loss: 4.12265, f1 score: 0.36877, precison: 0.32435, recall: 0.49042


[2022-12-29 11:37:21,683] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:37:33,544] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:37:33,548] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 1090, epoch: 7, batch: 76, loss: 1.81379, f1 score: 0.80404, speed: 0.35 step/s
global step 1100, epoch: 7, batch: 86, loss: 1.79007, f1 score: 0.81880, speed: 1.59 step/s
global step 1110, epoch: 7, batch: 96, loss: 1.78335, f1 score: 0.82127, speed: 1.72 step/s
global step 1120, epoch: 7, batch: 106, loss: 1.74477, f1 score: 0.81748, speed: 1.72 step/s
eval loss: 4.35737, f1 score: 0.46576, precison: 0.55348, recall: 0.43966
eval loss: 4.25647, f1 score: 0.38368, precison: 0.35104, recall: 0.45909
global step 1130, epoch: 7, batch: 116, loss: 1.92818, f1 score: 0.83927, speed: 0.57 step/s
global step 1140, epoch: 7, batch: 126, loss: 1.94844, f1 score: 0.82442, speed: 1.71 step/s
global step 1150, epoch: 7, batch: 136, loss: 1.82009, f1 score: 0.82484, speed: 1.63 step/s
global step 1160, epoch: 7, batch: 146, loss: 1.76436, f1 score: 0.81649, speed: 1.62 step/s
eval loss: 4.44882, f1 score: 0.44971, precison: 0.54753, recall: 0.42220
eval loss: 4.13499, f1 sco

[2022-12-29 11:45:24,100] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:45:36,013] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:45:36,016] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 1610, epoch: 10, batch: 89, loss: 0.95544, f1 score: 0.93246, speed: 0.33 step/s
global step 1620, epoch: 10, batch: 99, loss: 0.80955, f1 score: 0.92934, speed: 1.54 step/s
global step 1630, epoch: 10, batch: 109, loss: 1.19103, f1 score: 0.92790, speed: 1.69 step/s
global step 1640, epoch: 10, batch: 119, loss: 1.11744, f1 score: 0.92783, speed: 1.68 step/s
eval loss: 5.18518, f1 score: 0.46724, precison: 0.54385, recall: 0.43090
eval loss: 5.05233, f1 score: 0.33715, precison: 0.29596, recall: 0.41985
global step 1650, epoch: 10, batch: 129, loss: 1.15165, f1 score: 0.92492, speed: 0.56 step/s
global step 1660, epoch: 10, batch: 139, loss: 0.78620, f1 score: 0.93335, speed: 1.56 step/s
global step 1670, epoch: 10, batch: 149, loss: 1.08533, f1 score: 0.93662, speed: 1.66 step/s
global step 1680, epoch: 10, batch: 159, loss: 1.02232, f1 score: 0.93319, speed: 1.65 step/s
eval loss: 5.31440, f1 score: 0.45747, precison: 0.52270, recall: 0.43690
eval loss: 5.0767

[2022-12-29 11:47:24,782] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:47:36,634] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:47:36,639] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 1730, epoch: 11, batch: 40, loss: 0.93112, f1 score: 0.95230, speed: 0.33 step/s
global step 1740, epoch: 11, batch: 50, loss: 1.06776, f1 score: 0.95014, speed: 1.51 step/s
global step 1750, epoch: 11, batch: 60, loss: 0.95632, f1 score: 0.95348, speed: 1.57 step/s
global step 1760, epoch: 11, batch: 70, loss: 0.75492, f1 score: 0.95209, speed: 1.58 step/s
eval loss: 5.46707, f1 score: 0.47733, precison: 0.53434, recall: 0.45592
eval loss: 5.29777, f1 score: 0.34214, precison: 0.28484, recall: 0.46253


[2022-12-29 11:48:14,158] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:48:26,202] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:48:26,205] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 1770, epoch: 11, batch: 80, loss: 0.72860, f1 score: 0.96174, speed: 0.34 step/s
global step 1780, epoch: 11, batch: 90, loss: 0.71436, f1 score: 0.95529, speed: 1.71 step/s
global step 1790, epoch: 11, batch: 100, loss: 0.96701, f1 score: 0.95019, speed: 1.64 step/s
global step 1800, epoch: 11, batch: 110, loss: 1.05396, f1 score: 0.94980, speed: 1.72 step/s
eval loss: 5.53526, f1 score: 0.46358, precison: 0.52260, recall: 0.44914
eval loss: 5.30072, f1 score: 0.35648, precison: 0.29662, recall: 0.46644
global step 1810, epoch: 11, batch: 120, loss: 0.90577, f1 score: 0.93361, speed: 0.54 step/s
global step 1820, epoch: 11, batch: 130, loss: 0.92675, f1 score: 0.93351, speed: 1.54 step/s
global step 1830, epoch: 11, batch: 140, loss: 0.84533, f1 score: 0.94376, speed: 1.69 step/s
global step 1840, epoch: 11, batch: 150, loss: 0.99986, f1 score: 0.94068, speed: 1.59 step/s
eval loss: 5.67292, f1 score: 0.45862, precison: 0.52236, recall: 0.44063
eval loss: 5.7896

[2022-12-29 11:52:03,372] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:52:15,281] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:52:15,284] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 2010, epoch: 12, batch: 151, loss: 0.68306, f1 score: 0.96516, speed: 0.33 step/s
global step 2020, epoch: 12, batch: 161, loss: 0.66456, f1 score: 0.95941, speed: 1.64 step/s
global step 2030, epoch: 13, batch: 2, loss: 0.56720, f1 score: 0.96128, speed: 1.70 step/s
global step 2040, epoch: 13, batch: 12, loss: 0.67984, f1 score: 0.96397, speed: 1.74 step/s
eval loss: 5.95194, f1 score: 0.47661, precison: 0.51294, recall: 0.46535
eval loss: 5.71082, f1 score: 0.31050, precison: 0.26142, recall: 0.41428
global step 2050, epoch: 13, batch: 22, loss: 0.65817, f1 score: 0.97747, speed: 0.55 step/s
global step 2060, epoch: 13, batch: 32, loss: 0.61720, f1 score: 0.97286, speed: 1.60 step/s
global step 2070, epoch: 13, batch: 42, loss: 0.54500, f1 score: 0.97631, speed: 1.57 step/s
global step 2080, epoch: 13, batch: 52, loss: 0.35696, f1 score: 0.97668, speed: 1.63 step/s
eval loss: 5.96524, f1 score: 0.48327, precison: 0.52876, recall: 0.46728
eval loss: 5.78968, f1

[2022-12-29 11:53:27,997] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:53:40,098] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:53:40,102] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 2090, epoch: 13, batch: 62, loss: 0.55511, f1 score: 0.97676, speed: 0.33 step/s
global step 2100, epoch: 13, batch: 72, loss: 0.45450, f1 score: 0.97401, speed: 1.55 step/s
global step 2110, epoch: 13, batch: 82, loss: 0.32420, f1 score: 0.97072, speed: 1.44 step/s
global step 2120, epoch: 13, batch: 92, loss: 0.64047, f1 score: 0.97172, speed: 1.67 step/s
eval loss: 5.98810, f1 score: 0.47577, precison: 0.51675, recall: 0.45952
eval loss: 5.68259, f1 score: 0.35168, precison: 0.29640, recall: 0.47164
global step 2130, epoch: 13, batch: 102, loss: 0.60634, f1 score: 0.97172, speed: 0.59 step/s
global step 2140, epoch: 13, batch: 112, loss: 0.40812, f1 score: 0.96856, speed: 1.69 step/s
global step 2150, epoch: 13, batch: 122, loss: 0.75256, f1 score: 0.96762, speed: 1.66 step/s
global step 2160, epoch: 13, batch: 132, loss: 0.56520, f1 score: 0.96900, speed: 1.67 step/s
eval loss: 6.20888, f1 score: 0.45975, precison: 0.52850, recall: 0.44908
eval loss: 5.90526,

[2022-12-29 11:56:41,561] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 11:56:53,434] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 11:56:53,438] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 2290, epoch: 14, batch: 93, loss: 0.31857, f1 score: 0.97364, speed: 0.33 step/s
global step 2300, epoch: 14, batch: 103, loss: 0.75649, f1 score: 0.97637, speed: 1.70 step/s
global step 2310, epoch: 14, batch: 113, loss: 0.21889, f1 score: 0.98061, speed: 1.59 step/s
global step 2320, epoch: 14, batch: 123, loss: 0.63312, f1 score: 0.97709, speed: 1.60 step/s
eval loss: 6.34990, f1 score: 0.48252, precison: 0.50874, recall: 0.47856
eval loss: 6.15652, f1 score: 0.35332, precison: 0.32605, recall: 0.48284
global step 2330, epoch: 14, batch: 133, loss: 0.34404, f1 score: 0.98643, speed: 0.56 step/s
global step 2340, epoch: 14, batch: 143, loss: 0.40079, f1 score: 0.97711, speed: 1.74 step/s
global step 2350, epoch: 14, batch: 153, loss: 0.37670, f1 score: 0.97682, speed: 1.65 step/s
global step 2360, epoch: 14, batch: 163, loss: 0.59220, f1 score: 0.97439, speed: 1.69 step/s
eval loss: 6.24704, f1 score: 0.46625, precison: 0.52541, recall: 0.43655
eval loss: 5.723

[2022-12-29 11:59:54,559] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu/config.json
[2022-12-29 12:00:19,202] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu/tokenizer_config.json
[2022-12-29 12:00:19,221] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu/special_tokens_map.json


global step 2490, epoch: 15, batch: 124, loss: 0.26813, f1 score: 0.99123, speed: 0.23 step/s
global step 2500, epoch: 15, batch: 134, loss: 0.37820, f1 score: 0.98299, speed: 1.78 step/s
global step 2510, epoch: 15, batch: 144, loss: 0.29995, f1 score: 0.98606, speed: 1.59 step/s
global step 2520, epoch: 15, batch: 154, loss: 0.34441, f1 score: 0.98491, speed: 1.72 step/s
eval loss: 6.50462, f1 score: 0.48911, precison: 0.53623, recall: 0.47334
eval loss: 6.28880, f1 score: 0.34251, precison: 0.28111, recall: 0.47931


[2022-12-29 12:00:54,927] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 12:01:09,544] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 12:01:09,548] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 2530, epoch: 15, batch: 164, loss: 0.47579, f1 score: 0.98157, speed: 0.31 step/s
global step 2540, epoch: 16, batch: 5, loss: 0.43891, f1 score: 0.98233, speed: 1.66 step/s
global step 2550, epoch: 16, batch: 15, loss: 0.34689, f1 score: 0.98204, speed: 1.66 step/s
global step 2560, epoch: 16, batch: 25, loss: 0.42079, f1 score: 0.98301, speed: 1.64 step/s
eval loss: 6.65712, f1 score: 0.48505, precison: 0.51982, recall: 0.48329
eval loss: 6.39782, f1 score: 0.34724, precison: 0.28115, recall: 0.51078
global step 2570, epoch: 16, batch: 35, loss: 0.25801, f1 score: 0.98461, speed: 0.55 step/s
global step 2580, epoch: 16, batch: 45, loss: 0.24862, f1 score: 0.98766, speed: 1.62 step/s
global step 2590, epoch: 16, batch: 55, loss: 0.21864, f1 score: 0.98727, speed: 1.64 step/s
global step 2600, epoch: 16, batch: 65, loss: 0.50354, f1 score: 0.98738, speed: 1.53 step/s
eval loss: 6.60937, f1 score: 0.48753, precison: 0.51832, recall: 0.48009
eval loss: 6.32857, f1 

[2022-12-29 12:08:24,610] [    INFO] - Configuration saved in ernie2.0_ckpt/config.json
[2022-12-29 12:08:37,585] [    INFO] - tokenizer config file saved in ernie2.0_ckpt/tokenizer_config.json
[2022-12-29 12:08:37,588] [    INFO] - Special tokens file saved in ernie2.0_ckpt/special_tokens_map.json


global step 3010, epoch: 18, batch: 137, loss: 0.32662, f1 score: 0.98742, speed: 0.33 step/s
global step 3020, epoch: 18, batch: 147, loss: 0.40685, f1 score: 0.98560, speed: 1.63 step/s
global step 3030, epoch: 18, batch: 157, loss: 0.26224, f1 score: 0.98320, speed: 1.70 step/s
global step 3040, epoch: 18, batch: 167, loss: 0.26660, f1 score: 0.98207, speed: 1.59 step/s
eval loss: 7.03179, f1 score: 0.48569, precison: 0.54195, recall: 0.46862
eval loss: 6.81358, f1 score: 0.31031, precison: 0.25938, recall: 0.42364
global step 3050, epoch: 19, batch: 8, loss: 0.18939, f1 score: 0.99327, speed: 0.57 step/s
global step 3060, epoch: 19, batch: 18, loss: 0.23990, f1 score: 0.99162, speed: 1.80 step/s
global step 3070, epoch: 19, batch: 28, loss: 0.24451, f1 score: 0.99055, speed: 1.65 step/s
global step 3080, epoch: 19, batch: 38, loss: 0.23496, f1 score: 0.99075, speed: 1.60 step/s
eval loss: 7.21831, f1 score: 0.48455, precison: 0.52628, recall: 0.48090
eval loss: 6.91323, 

In [17]:
# Load model weights.
# Restore the checkpoint stored in ernie2.0_ckpt
# (presumably the best-on-validation weights saved during training, not the final step — confirm).
model.set_dict(paddle.load('ernie2.0_ckpt/model_state.pdparams'))

# Alternative: load previously trained parameters.
# model.set_dict(paddle.load('/home/aistudio/work/model_state.pdparams'))

# Evaluate on both validation sets.
# The label says "ERNIE 2.0" to match the model loaded in this notebook (ernie-2.0-large-en).
print("ERNIE 2.0 在20分类验证集的最佳表现：", end= " ")
results1 = evaluate(model, criterion, metric, valid_data_loader, label_vocab)
# NOTE(review): the weights loaded above come from ernie2.0_ckpt, not from the
# zhihu-specific checkpoint directory, so "best" on the zhihu split may be overstated — confirm.
print("ERNIE 2.0 在20分类知乎验证集的最佳表现：", end= " ")
results = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab)

ERNIE 3.0 在20分类验证集的最佳表现： eval loss: 7.01792, f1 score: 0.49189, precison: 0.52654, recall: 0.48107
ERNIE 2.0 在20分类知乎验证集的最佳表现： eval loss: 6.95169, f1 score: 0.30715, precison: 0.26447, recall: 0.40759


In [None]:
#接下来调低学习率继续训练

In [18]:
# Second fine-tuning stage: a fresh AdamW optimizer over the current model
# parameters with learning_rate=2e-5 and weight_decay=0.01.
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters(), weight_decay=0.01)
epochs = 7 # number of training epochs
ckpt_dir = "ernie2.0_ckpt_2" # checkpoint directory used when the F1 on valid_data_loader improves
save_dir2 = "ernie2.0_ckpt_zhihu_2" # checkpoint directory used when the F1 on validZhihu_data_loader improves

global_step = 0  # number of batches processed so far in this stage
tic_train = time.time()
# Scores a new evaluation has to beat before a checkpoint is written.
# NOTE(review): hard-coded values, presumably carried over from the previous training run — confirm.
best_f1_score = 0.48538
best_f1_score2 = 0.33237
# Training loop
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # Forward pass, loss, and running training metric.
        logits = model(input_ids, token_type_ids)

        # criterion is called as (labels, logits); its definition is outside this cell.
        loss = criterion(labels, logits)
        loss = loss.mean()
        # The raw logits are fed to the metric; the sigmoid below is intentionally left disabled.
        probs = logits
        #probs = F.sigmoid(logits)
        metric.update(probs, labels)

        # auc, f1_score, _,  _= metric.accumulate()
        # metric.accumulate() yields three values here; only the first is used and reported as F1.
        f1_score, _,  _= metric.accumulate()


        # Every 10 steps, print the loss, the running F1 score and the throughput.
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, f1 score: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, f1_score,
                    10 / (time.time() - tic_train)))
            # The timer is reset only here, so the 10-step window that follows an
            # evaluation also includes the evaluation/saving time.
            tic_train = time.time()

        # Backpropagate and update the parameters.
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # Every 40 steps, evaluate on both validation loaders and save the model
        # and tokenizer whenever the corresponding F1 score improves.
        if global_step % 40 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            # With if_return_results=False, evaluate is assumed to return the F1 score — confirm against its definition.
            eval_f1_score = evaluate(model, criterion, metric, valid_data_loader, label_vocab, if_return_results=False)
            eval_f1_score2 = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab, if_return_results=False)
            if eval_f1_score > best_f1_score:
                best_f1_score = eval_f1_score
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
            if eval_f1_score2 > best_f1_score2:
                best_f1_score2 = eval_f1_score2
                model.save_pretrained(save_dir2)
                tokenizer.save_pretrained(save_dir2)

global step 10, epoch: 1, batch: 10, loss: 0.21115, f1 score: 0.98385, speed: 1.66 step/s
global step 20, epoch: 1, batch: 20, loss: 0.16292, f1 score: 0.98723, speed: 1.73 step/s
global step 30, epoch: 1, batch: 30, loss: 0.21903, f1 score: 0.98882, speed: 1.63 step/s
global step 40, epoch: 1, batch: 40, loss: 0.31735, f1 score: 0.98897, speed: 1.77 step/s
eval loss: 7.06932, f1 score: 0.48287, precison: 0.52517, recall: 0.46811
eval loss: 6.89580, f1 score: 0.33898, precison: 0.28671, recall: 0.45758


[2022-12-29 12:28:17,567] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu_2/config.json
[2022-12-29 12:28:21,176] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu_2/tokenizer_config.json
[2022-12-29 12:28:21,180] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu_2/special_tokens_map.json


global step 50, epoch: 1, batch: 50, loss: 0.26855, f1 score: 0.98274, speed: 0.47 step/s
global step 60, epoch: 1, batch: 60, loss: 0.36780, f1 score: 0.98395, speed: 1.62 step/s
global step 70, epoch: 1, batch: 70, loss: 0.14001, f1 score: 0.98685, speed: 1.58 step/s
global step 80, epoch: 1, batch: 80, loss: 0.42516, f1 score: 0.98601, speed: 1.70 step/s
eval loss: 7.15546, f1 score: 0.47923, precison: 0.53627, recall: 0.45987
eval loss: 7.08550, f1 score: 0.36831, precison: 0.31563, recall: 0.48252


[2022-12-29 12:28:57,495] [    INFO] - Configuration saved in ernie2.0_ckpt_zhihu_2/config.json
[2022-12-29 12:29:10,478] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_zhihu_2/tokenizer_config.json
[2022-12-29 12:29:10,481] [    INFO] - Special tokens file saved in ernie2.0_ckpt_zhihu_2/special_tokens_map.json


global step 90, epoch: 1, batch: 90, loss: 0.24385, f1 score: 0.98606, speed: 0.33 step/s
global step 100, epoch: 1, batch: 100, loss: 0.15140, f1 score: 0.98754, speed: 1.85 step/s
global step 110, epoch: 1, batch: 110, loss: 0.11361, f1 score: 0.98634, speed: 1.65 step/s
global step 120, epoch: 1, batch: 120, loss: 0.16482, f1 score: 0.98783, speed: 1.78 step/s
eval loss: 7.11093, f1 score: 0.47989, precison: 0.52526, recall: 0.45943
eval loss: 6.98985, f1 score: 0.34697, precison: 0.33055, recall: 0.44697
global step 130, epoch: 1, batch: 130, loss: 0.23779, f1 score: 0.98184, speed: 0.55 step/s
global step 140, epoch: 1, batch: 140, loss: 0.27784, f1 score: 0.98685, speed: 1.68 step/s
global step 150, epoch: 1, batch: 150, loss: 0.15618, f1 score: 0.98787, speed: 1.78 step/s
global step 160, epoch: 1, batch: 160, loss: 0.22180, f1 score: 0.98697, speed: 1.57 step/s
eval loss: 7.04342, f1 score: 0.47451, precison: 0.53075, recall: 0.45275
eval loss: 6.96310, f1 score: 0.3

[2022-12-29 12:32:45,032] [    INFO] - Configuration saved in ernie2.0_ckpt_2/config.json
[2022-12-29 12:32:48,565] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_2/tokenizer_config.json
[2022-12-29 12:32:48,570] [    INFO] - Special tokens file saved in ernie2.0_ckpt_2/special_tokens_map.json


global step 330, epoch: 2, batch: 161, loss: 0.42477, f1 score: 0.98692, speed: 0.47 step/s
global step 340, epoch: 3, batch: 2, loss: 0.17821, f1 score: 0.98679, speed: 1.51 step/s
global step 350, epoch: 3, batch: 12, loss: 0.27965, f1 score: 0.98829, speed: 1.68 step/s
global step 360, epoch: 3, batch: 22, loss: 0.14351, f1 score: 0.98821, speed: 1.57 step/s
eval loss: 7.27132, f1 score: 0.48481, precison: 0.53005, recall: 0.46868
eval loss: 7.10334, f1 score: 0.33454, precison: 0.32406, recall: 0.42970
global step 370, epoch: 3, batch: 32, loss: 0.23289, f1 score: 0.99221, speed: 0.56 step/s
global step 380, epoch: 3, batch: 42, loss: 0.13566, f1 score: 0.99182, speed: 1.66 step/s
global step 390, epoch: 3, batch: 52, loss: 0.31035, f1 score: 0.99115, speed: 1.66 step/s
global step 400, epoch: 3, batch: 62, loss: 0.31810, f1 score: 0.98940, speed: 1.67 step/s
eval loss: 7.33294, f1 score: 0.47639, precison: 0.53946, recall: 0.45930
eval loss: 7.20191, f1 score: 0.33663, 

[2022-12-29 12:42:27,476] [    INFO] - Configuration saved in ernie2.0_ckpt_2/config.json
[2022-12-29 12:42:40,149] [    INFO] - tokenizer config file saved in ernie2.0_ckpt_2/tokenizer_config.json
[2022-12-29 12:42:40,153] [    INFO] - Special tokens file saved in ernie2.0_ckpt_2/special_tokens_map.json


global step 970, epoch: 6, batch: 125, loss: 0.06501, f1 score: 0.99287, speed: 0.33 step/s
global step 980, epoch: 6, batch: 135, loss: 0.19972, f1 score: 0.99013, speed: 1.66 step/s
global step 990, epoch: 6, batch: 145, loss: 0.11650, f1 score: 0.99009, speed: 1.61 step/s
global step 1000, epoch: 6, batch: 155, loss: 0.07342, f1 score: 0.99112, speed: 1.58 step/s
eval loss: 7.70304, f1 score: 0.48682, precison: 0.53351, recall: 0.47065
eval loss: 7.69689, f1 score: 0.35761, precison: 0.33537, recall: 0.46318
global step 1010, epoch: 6, batch: 165, loss: 0.04741, f1 score: 0.98182, speed: 0.55 step/s
global step 1020, epoch: 7, batch: 6, loss: 0.05366, f1 score: 0.98776, speed: 1.67 step/s
global step 1030, epoch: 7, batch: 16, loss: 0.10682, f1 score: 0.99035, speed: 1.73 step/s
global step 1040, epoch: 7, batch: 26, loss: 0.11000, f1 score: 0.99068, speed: 1.58 step/s
eval loss: 7.67372, f1 score: 0.48071, precison: 0.53462, recall: 0.46123
eval loss: 7.60245, f1 score: 

In [16]:
# Load model weights.
# Restore the parameters that scored best on the validation set (stored in ernie2.0_ckpt_2).
model.set_dict(paddle.load('ernie2.0_ckpt_2/model_state.pdparams'))

# Alternative: load previously trained parameters.
# model.set_dict(paddle.load('/home/aistudio/work/model_state.pdparams'))

# Evaluate on both validation sets.
# The label says "ERNIE 2.0" to match the model loaded in this notebook (ernie-2.0-large-en).
print("ERNIE 2.0 在20分类验证集的最佳表现：", end= " ")
results1 = evaluate(model, criterion, metric, valid_data_loader, label_vocab)
# NOTE(review): the weights loaded above come from ernie2.0_ckpt_2, not from
# ernie2.0_ckpt_zhihu_2, so "best" on the zhihu split may be overstated — confirm.
print("ERNIE 2.0 在20分类知乎验证集的最佳表现：", end= " ")
results = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab)

ERNIE 3.0 在20分类验证集的最佳表现： eval loss: 7.59323, f1 score: 0.49040, precison: 0.53397, recall: 0.47277
ERNIE 2.0 在20分类知乎验证集的最佳表现： eval loss: 7.68647, f1 score: 0.35710, precison: 0.31020, recall: 0.47708


In [17]:
def result2tsv(result1, filepath, labels=None):
    """Turn predicted label strings into a 0/1 table aligned with a TSV file.

    Args:
        result1: Sequence with one entry per row of ``filepath``; each entry is
            a comma-joined string of predicted label names ("" means no label).
        filepath: Tab-separated file with an "Argument ID" column, a "sentence"
            column and one column per label.
        labels: Label names to fill in. Defaults to the notebook-level
            ``label_vocab`` (assumed to iterate over label names — confirm).

    Returns:
        A DataFrame with the columns of ``filepath`` minus "sentence", where
        every label column holds 1 if that label was predicted for the row
        and 0 otherwise.

    Raises:
        ValueError: If the number of predictions differs from the number of rows.
    """
    if labels is None:
        labels = label_vocab

    frame = pd.read_csv(filepath, sep='\t')

    # Use the argument that was passed in. Reading a notebook-level variable
    # here would make every call export the same predictions regardless of input.
    if len(result1) != len(frame):
        raise ValueError(
            "got %d predictions for %d rows in %s" % (len(result1), len(frame), filepath)
        )

    # "".split(",") gives [''], which matches no label, so empty predictions
    # naturally become an all-zero row.
    predicted = [set(entry.split(",")) for entry in result1]

    for name in labels:
        # Whole-column assignment avoids chained indexing (frame[name].iloc[i] = ...),
        # which may write to a temporary copy instead of the frame.
        frame[name] = [1 if name in chosen else 0 for chosen in predicted]

    return frame.drop(columns=["sentence"])

In [18]:
%cd /home/aistudio/data

/home/aistudio/data


In [19]:
import pandas as pd

# Export the predictions for the main validation split as a tab-separated file.
valid = result2tsv(results1, "validation.tsv")
valid.to_csv(
    'validERNIE2.0.tsv',
    sep='\t',
    index=False,
    columns=list(valid.columns),
)

# Same export for the zhihu validation split.
validzhihu = result2tsv(results, "zhihu_validation.tsv")
validzhihu.to_csv(
    'validERNIE2.0zhihu.tsv',
    sep='\t',
    index=False,
    columns=list(validzhihu.columns),
)

In [57]:
# Prediction helper: produce label predictions for the test set.
def predict(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    """Run the model over ``data_loader`` and return the predicted labels.

    Args:
        model: Callable as ``model(input_ids, token_type_ids)``; must provide ``eval()``.
        criterion: Unused; kept so the signature mirrors ``evaluate``.
        metric: Unused; kept so the signature mirrors ``evaluate``.
        data_loader: Iterable of batches indexable by 'input_ids' and 'token_type_ids'.
        label_vocab: Maps a class index to its label name via ``label_vocab[index]``.
        if_return_results: When True (default), return the predictions.

    Returns:
        When ``if_return_results`` is True, a list with one string per example:
        the comma-joined names of every class whose raw score is > 0 ("" when
        no class qualifies). Otherwise None: this function never computes an
        F1 score, since the batches it reads carry no labels.
    """
    model.eval()

    if not if_return_results:
        # No score is computed in this function, so there is nothing to return.
        return None

    # NOTE(review): no gradients are needed here; consider running this under
    # paddle.no_grad() to save memory.
    results = []
    for batch in data_loader:
        logits = model(batch['input_ids'], batch['token_type_ids'])
        # Raw logits are thresholded at 0, the same scores the training loop feeds its metric.
        for scores in logits.tolist():
            picked = [label_vocab[idx] for idx, score in enumerate(scores) if score > 0]
            results.append(','.join(picked))
    return results


In [68]:
def testOutput(results2):
    testData = pd.read_csv("test.tsv",sep='\t')
    dicttestT =testData.to_dict("list")

    testPred = {}
    testPred["Argument ID"] = dicttestT["Argument ID"]
    testPred["sentence"] = dicttestT["sentence"]

    for x in label_vocab:
        testPred[x] = []

    for x in range(len(results2)):
        types = results2[x].split(",")
        if types == ['']:
            for y in label_vocab:
                testPred[y].append(0)
        else:  
            for z in label_vocab:
                if z in types:
                    testPred[z].append(1)
                else:
                    testPred[z].append(0)
    testPredD = pd.DataFrame.from_dict(testPred)
    testPredD.drop(columns=["sentence"],inplace=True)

    return testPredD

In [58]:
# Run the model over the test loader; with the default if_return_results=True,
# predict returns one comma-joined label string per test example.
results2 = predict(model, criterion, metric, test_data_loader, label_vocab)


In [69]:
# Convert the test predictions to a 0/1 table and write it as a tab-separated file.
test = testOutput(results2)
test.to_csv(
    'testErnie2.0.tsv',
    sep='\t',
    index=False,
    columns=list(test.columns),
)