In [1]:
import os
import paddle
import paddlenlp
import paddle.nn as nn

In [2]:
# 自定义数据集
import re

from paddlenlp.datasets import load_dataset

# 清洗无效字符
def clean_text(text):
    text = text.replace("\r", "").replace("\n", "")
    text = re.sub(r"\\n\n", ".", text)
    return text

# 定义读取数据集函数
def read_custom_data(filepath):
    f = open(filepath)
    next(f)
    while True:
        line = f.readline()
        if not line:
            break
        data = line.strip().split('\t')
        labels = [float(d) for d in data[2:]]
        yield {"Argument ID": data[0], "sentence":clean_text(data[1]),"labels": labels}
    f.close()

def read_custom_data_test(filepath):
    f = open(filepath)
    next(f)
    while True:
        line = f.readline()
        if not line:
            break
        data = line.strip().split('\t')
        yield {"Argument ID": data[0], "sentence":clean_text(data[1]),"labels":[]}
    f.close()

In [28]:
%cd /home/aistudio/data

/home/aistudio/data


In [29]:
# load_dataset()创建数据集
# lazy=False，数据集返回为MapDataset类型
# 对训练集和验证集进行预处理
train_ds = load_dataset(read_custom_data, filepath='train.tsv', lazy=False) 
valid_ds = load_dataset(read_custom_data, filepath='validation.tsv', lazy=False)
validZhihu_ds = load_dataset(read_custom_data, filepath='zhihu_validation.tsv', lazy=False)
test_ds = load_dataset(read_custom_data_test, filepath='test.tsv', lazy=False) 
test2_ds = load_dataset(read_custom_data_test, filepath='test2.tsv', lazy=False) 

In [5]:
# 加载中文ERNIE 3.0预训练模型和分词器
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
from paddlenlp.transformers import BertTokenizer,BertPretrainedModel
from paddlenlp.transformers import BertTokenizer,BertModel

model_name = "bert-large-uncased"   # ERNIE2.0 模型
num_classes = 20  # 20分类任务
#model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=num_classes)
tokenizer = AutoTokenizer.from_pretrained(model_name)

[2023-01-17 18:25:45,619] [    INFO] - We are using <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'> to load 'bert-large-uncased'.
[2023-01-17 18:25:45,622] [    INFO] - Downloading https://bj.bcebos.com/paddle-hapi/models/bert/bert-large-uncased-vocab.txt and saved to /home/aistudio/.paddlenlp/models/bert-large-uncased
[2023-01-17 18:25:45,625] [    INFO] - Downloading bert-large-uncased-vocab.txt from https://bj.bcebos.com/paddle-hapi/models/bert/bert-large-uncased-vocab.txt
100%|██████████| 226k/226k [00:00<00:00, 2.39MB/s]
[2023-01-17 18:25:45,903] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/bert-large-uncased/tokenizer_config.json
[2023-01-17 18:25:45,906] [    INFO] - Special tokens file saved in /home/aistudio/.paddlenlp/models/bert-large-uncased/special_tokens_map.json


In [30]:
import functools
import numpy as np

from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# 数据预处理函数，利用分词器将文本转化为整数序列
def preprocess_function(examples, tokenizer, max_seq_length):
    result = tokenizer(text=examples["sentence"], max_seq_len=max_seq_length)
    result["labels"] = examples["labels"]
    return result

trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128)
train_ds = train_ds.map(trans_func)
valid_ds = valid_ds.map(trans_func)
validZhihu_ds = validZhihu_ds.map(trans_func)
test_ds = test_ds.map(trans_func)
test2_ds = test2_ds.map(trans_func)

# collate_fn函数构造，将不同长度序列充到批中数据的最大长度，再将数据堆叠
collate_fn = DataCollatorWithPadding(tokenizer)

# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
valid_batch_sampler = BatchSampler(valid_ds, batch_size=16, shuffle=False)
validZhihu_batch_sampler = BatchSampler(validZhihu_ds, batch_size=16, shuffle=False)
test_batch_sampler = BatchSampler(test_ds, batch_size=16, shuffle=False)
test2_batch_sampler = BatchSampler(test2_ds, batch_size=16, shuffle=False)

train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
valid_data_loader = DataLoader(dataset=valid_ds, batch_sampler=valid_batch_sampler, collate_fn=collate_fn)
validZhihu_data_loader = DataLoader(dataset=validZhihu_ds, batch_sampler=validZhihu_batch_sampler, collate_fn=collate_fn)
test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=collate_fn)
test2_data_loader = DataLoader(dataset=test2_ds, batch_sampler=test2_batch_sampler, collate_fn=collate_fn)

In [7]:
#参数设置
class Config:
    def __init__(self):
        super(Config, self).__init__()

        self.SEED = 102
        self.MODEL_PATH = 'bert-large-uncased'
        self.NUM_LABELS = 20

        # data
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_PATH)
        self.MAX_LENGTH = 128
        self.BATCH_SIZE = 16

        # model
        self.FULL_FINETUNING = True
        self.LR = 3e-5
        self.OPTIMIZER = 'AdamW'
        self.N_VALIDATE_DUR_TRAIN = 3
        self.N_WARMUP = 0
        self.SAVE_BEST_ONLY = True
        self.EPOCHS = 20
        self.USE_FGM = False
        # self.LOSS_TYPE = paddle.nn.BCEWithLogitsLoss()
        # self.HIDDEN_DROPOUT_PROB = 0.2
        # self.HIDDEN_SIZE = 1024

config = Config()
import random
import numpy as np
seed = config.SEED
paddle.seed(seed)
random.seed(seed)
np.random.seed(seed)

[2023-01-17 18:25:45,931] [    INFO] - We are using <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'> to load 'bert-large-uncased'.
[2023-01-17 18:25:45,933] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/bert-large-uncased/bert-large-uncased-vocab.txt
[2023-01-17 18:25:45,954] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/bert-large-uncased/tokenizer_config.json
[2023-01-17 18:25:45,957] [    INFO] - Special tokens file saved in /home/aistudio/.paddlenlp/models/bert-large-uncased/special_tokens_map.json


In [8]:
from paddle.nn import functional as F

In [9]:
class Bert_LSTM(nn.Layer):
  def __init__(self):
    super(Bert_LSTM,self).__init__()
    self.num_labels=20
    self.dropout=nn.Dropout(0.1)
    self.bert=BertModel.from_pretrained(config.MODEL_PATH)
    for param in self.bert.parameters():
      param.requires_grad=True
    self.classifier=nn.Linear(1024,self.num_labels)
    #self.crf=CRF(num_labels,batch_first=True)

    self.bilstm=nn.LSTM(
        input_size=1024, 
        hidden_size=512, 
        time_major=False,
        num_layers=2,
        # dropout=0.3,  
        direction="bidirect")

  def forward(self,input_ids, attention_mask):

    output=self.bert(input_ids=input_ids,attention_mask=attention_mask)
    # pooler_output=output.pooler_output
    # last_hidden_state=output.last_hidden_state
    last_hidden_state=output[0]

    
    last_hidden_state=self.dropout(last_hidden_state)
    lstm_output,(hn,cn)=self.bilstm(last_hidden_state)
    lstm_output=self.dropout(lstm_output)
    out =lstm_output[:,-1,:]   #只要序列中最后一个token对应的输出，（因为lstm会记录前边token的信息）

    # 得到判别值
    logits=self.classifier(out)
    # logits=self.classifier(last_hidden_state)
    #log_probs = F.log_softmax(logits,dim=-1)
    return logits

In [10]:
model = Bert_LSTM()

[2023-01-17 18:25:45,981] [    INFO] - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/bert-large-uncased.pdparams and saved to /home/aistudio/.paddlenlp/models/bert-large-uncased
[2023-01-17 18:25:45,984] [    INFO] - Downloading bert-large-uncased.pdparams from https://bj.bcebos.com/paddlenlp/models/transformers/bert-large-uncased.pdparams
100%|██████████| 2.06G/2.06G [00:56<00:00, 39.2MB/s]
W0117 18:26:42.513813   534 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0117 18:26:42.518093   534 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.
[2023-01-17 18:26:57,025] [    INFO] - Weights from pretrained model not used in BertModel: ['cls.predictions.decoder_weight', 'cls.predictions.decoder_bias', 'cls.predictions.transform.weight', 'cls.predictions.transform.bias', 'cls.predictions.layer_norm.weight', 'cls.predictions.layer_norm.bias', 'cls.seq_relationship.weight', 'cls.seq_relatio

In [11]:
model

Bert_LSTM(
  (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, sparse=False)
      (position_embeddings): Embedding(512, 1024, sparse=False)
      (token_type_embeddings): Embedding(2, 1024, sparse=False)
      (layer_norm): LayerNorm(normalized_shape=[1024], epsilon=1e-12)
      (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)
    )
    (encoder): TransformerEncoder(
      (layers): LayerList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiHeadAttention(
            (q_proj): Linear(in_features=1024, out_features=1024, dtype=float32)
            (k_proj): Linear(in_features=1024, out_features=1024, dtype=float32)
            (v_proj): Linear(in_features=1024, out_features=1024, dtype=float32)
            (out_proj): Linear(in_features=1024, out_features=1024, dtype=float32)
          )
          (linear1): Linear(in_features=1024, out_features=

In [12]:
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from paddle.metric import Metric

# 自定义MultiLabelReport评价指标
class MultiLabelReport(Metric):
    """
    AUC and F1 Score for multi-label text classification task.
    """

    def __init__(self, name='MultiLabelReport', average='macro'):
        super(MultiLabelReport, self).__init__()
        self.average = average
        self._name = name
        self.reset()

    def f1_score(self, y_prob):
        '''
        Returns the f1 score by searching the best threshhold
        '''
        thresholds =0
        self.y_pred = y_prob > thresholds
        score = sklearn.metrics.f1_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        precison = precision_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        recall = recall_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        return score, precison, recall

    def reset(self):
        """
        Resets all of the metric state.
        """
        self.y_prob = None
        self.y_true = None

    def update(self, probs, labels):
        if self.y_prob is not None:
            self.y_prob = np.append(self.y_prob, probs.numpy(), axis=0)
        else:
            self.y_prob = probs.numpy()
        if self.y_true is not None:
            self.y_true = np.append(self.y_true, labels.numpy(), axis=0)
        else:
            self.y_true = labels.numpy()

    def accumulate(self):
        f1_score, precison, recall = self.f1_score(y_prob=self.y_prob)
        return f1_score, precison, recall
    

    def name(self):
        """
        Returns metric name
        """
        return self._name

In [13]:
import numpy as np
def multilabel_categorical_crossentropy(y_pred, y_true):
    """多标签分类的交叉熵
    说明：y_true和y_pred的shape一致，y_true的元素非0即1，
         1表示对应的类为目标类，0表示对应的类为非目标类。
    警告：请保证y_pred的值域是全体实数，换言之一般情况下y_pred
         不用加激活函数，尤其是不能加sigmoid或者softmax！预测
         阶段则输出y_pred大于0的类。如有疑问，请仔细阅读并理解
         本文。
    假如类别总数为10
    label ：[0,1,0,0,0,0,0,0,0,1]  代表条数据被标注为 2,10 属于 2类也属于10类
    输出也为10类别 输出维度也为10。
    类别从1位置开始0位置代表阈值s就是输出的维度第一个位置是阈值预测
    目标类的分数都大于s，非目标类的分数都小于s
    这里阈值s默认为0故而可忽略只要类从1开始就可
    """
    y_pred = (1 - 2 * y_true) * y_pred
    y_pred_neg = y_pred - y_true * 1e12
    y_pred_pos = y_pred - (1 - y_true) * 1e12


    zeros = paddle.zeros_like(y_pred[..., :1])

    y_pred_neg = paddle.concat((y_pred_neg, zeros), axis=-1)
    y_pred_pos = paddle.concat((y_pred_pos, zeros), axis=-1)


    neg_loss = paddle.logsumexp(y_pred_neg, axis=-1)
    pos_loss = paddle.logsumexp(y_pred_pos, axis=-1)
    sumloss = (neg_loss + pos_loss).mean()
    return sumloss

In [14]:
#定义优化器
from paddlenlp.transformers import LinearDecayWithWarmup
train_steps_per_epoch=len(train_data_loader)
num_training_steps=train_steps_per_epoch*config.EPOCHS

#定义各模块参数
bert_parameters=list(model.bert.named_parameters())
lstm_parameters=list(model.bilstm.named_parameters())
classifier_parameters=list(model.classifier.named_parameters())
no_decay=['bias','LayerNorm.weight']

lr = config.LR

#bert模型、lstm模型、nn.linear的学习率分离，后两个是bert的3倍
optimizer_grouped_parameters=[
    {'params':[p for n,p in bert_parameters if not any(nd in n for nd in no_decay)],
      'lr':lr,'weight_decay':0.01},
    {'params':[p for n,p in bert_parameters if any(nd in n for nd in no_decay)],
      'lr':lr,'weight_decay':0.0},
    {'params':[p for n,p in lstm_parameters if not any(nd in n for nd in no_decay)],
      'lr':lr*3,'weight_decay':0.01},
    {'params':[p for n,p in lstm_parameters if any(nd in n for nd in no_decay)],
      'lr':lr*3,'weight_decay': 0.0},
    {'params':[p for n,p in classifier_parameters if not any(nd in n for nd in no_decay)],
      'lr':lr*3,'weight_decay':0.01},
    {'params':[p for n,p in classifier_parameters if any(nd in n for nd in no_decay)],
      'lr':lr*3,'weight_decay':0.0}]


scheduler = LinearDecayWithWarmup(lr,
    num_training_steps,
    warmup=0
)
optimizer = paddle.optimizer.AdamW(scheduler, parameters=optimizer_grouped_parameters, weight_decay=0.01)
criterion = multilabel_categorical_crossentropy
metric = MultiLabelReport()

In [15]:
import paddle
import numpy as np
import paddle.nn.functional as F

# 构建验证集evaluate函数
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    model.eval()
    metric.reset()
    losses = []
    results = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        # probs = F.sigmoid(logits)
        probs = logits
        losses.append(loss.numpy())
        metric.update(probs, labels)
        if if_return_results:
            probs = probs.tolist()
            for prob in probs:
                result = []
                for c, pred in enumerate(prob):
                    if pred > 0:
                        result.append(label_vocab[c])
                        # result.append(str(c))
                results.append(','.join(result))

    # auc, f1_score, precison, recall = metric.accumulate()
    f1_score, precison, recall = metric.accumulate()
    print("eval loss: %.5f, f1 score: %.5f, precison: %.5f, recall: %.5f" %
          (np.mean(losses), f1_score, precison, recall))
    model.train()
    metric.reset()
    if if_return_results:
        return results
    else:
        return f1_score

In [16]:
label_vocab = ["Self-direction: thought","Self-direction: action","Stimulation","Hedonism","Achievement","Power: dominance","Power: resources","Face","Security: personal","Security: societal","Tradition","Conformity: rules","Conformity: interpersonal","Humility","Benevolence: caring","Benevolence: dependability","Universalism: concern","Universalism: nature","Universalism: tolerance","Universalism: objectivity"]

In [17]:
cd /home/aistudio/model

/home/aistudio/model


In [18]:
import time
ckpt_dir = "Bert_Bilstm_ckpt" # 训练过程中保存模型参数的文件夹

global_step = 0  # 迭代次数
tic_train = time.time()
best_f1_score = 0
best_f1_score2 = 0

# 模型训练
for epoch in range(1, config.EPOCHS + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # 计算模型输出、损失函数值、分类概率值、准确率、f1分数
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = logits
        metric.update(probs, labels)
        
        # auc, f1_score, _,  _= metric.accumulate()
        f1_score, _,  _= metric.accumulate()


        # 每迭代100次，打印损失函数值、准确率、f1分数、计算速度
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, f1 score: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, f1_score,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()
        
        # 反向梯度回传，更新参数
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        scheduler.step()
        
        #每迭代40次，评估当前训练的模型、保存当前最佳模型参数和分词器的词表等
        if global_step % 40 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            eval_f1_score = evaluate(model, criterion, metric, valid_data_loader, label_vocab, if_return_results=False)
            eval_f1_score2 = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab, if_return_results=False)
            if eval_f1_score > best_f1_score:
                best_f1_score = eval_f1_score
                paddle.save(model.state_dict(), "model_net.pdparams")
                paddle.save(optimizer.state_dict(), "optimizer.pdopt")
                tokenizer.save_pretrained(save_dir)

global step 10, epoch: 1, batch: 10, loss: 4.17505, f1 score: 0.19221, speed: 1.32 step/s
global step 20, epoch: 1, batch: 20, loss: 4.00801, f1 score: 0.17947, speed: 1.40 step/s
global step 30, epoch: 1, batch: 30, loss: 4.08815, f1 score: 0.16218, speed: 1.35 step/s
global step 40, epoch: 1, batch: 40, loss: 3.98313, f1 score: 0.14874, speed: 1.63 step/s
eval loss: 4.02354, f1 score: 0.02721, precison: 0.05562, recall: 0.05031
eval loss: 3.78912, f1 score: 0.01736, precison: 0.01050, recall: 0.05000


[2023-01-17 18:28:17,443] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:28:17,447] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 50, epoch: 1, batch: 50, loss: 4.19851, f1 score: 0.04540, speed: 0.18 step/s
global step 60, epoch: 1, batch: 60, loss: 3.91001, f1 score: 0.03824, speed: 1.60 step/s
global step 70, epoch: 1, batch: 70, loss: 4.08896, f1 score: 0.03381, speed: 1.51 step/s
global step 80, epoch: 1, batch: 80, loss: 4.05187, f1 score: 0.02902, speed: 1.40 step/s
eval loss: 4.01776, f1 score: 0.00463, precison: 0.04468, recall: 0.00258
eval loss: 3.76241, f1 score: 0.00270, precison: 0.00714, recall: 0.00167
global step 90, epoch: 1, batch: 90, loss: 4.00487, f1 score: 0.01359, speed: 0.52 step/s
global step 100, epoch: 1, batch: 100, loss: 4.07057, f1 score: 0.01656, speed: 1.48 step/s
global step 110, epoch: 1, batch: 110, loss: 3.94773, f1 score: 0.01224, speed: 1.50 step/s
global step 120, epoch: 1, batch: 120, loss: 4.01879, f1 score: 0.02513, speed: 1.60 step/s
eval loss: 4.00904, f1 score: 0.02680, precison: 0.01852, recall: 0.04847
eval loss: 3.75796, f1 score: 0.01736, pr

[2023-01-17 18:30:53,665] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:30:53,669] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 170, epoch: 2, batch: 1, loss: 3.79554, f1 score: 0.26347, speed: 0.17 step/s
global step 180, epoch: 2, batch: 11, loss: 3.74101, f1 score: 0.26199, speed: 1.51 step/s
global step 190, epoch: 2, batch: 21, loss: 3.43290, f1 score: 0.27212, speed: 1.53 step/s
global step 200, epoch: 2, batch: 31, loss: 3.76487, f1 score: 0.27829, speed: 1.62 step/s
eval loss: 3.68686, f1 score: 0.27102, precison: 0.37637, recall: 0.24121
eval loss: 3.43917, f1 score: 0.20729, precison: 0.25095, recall: 0.22117


[2023-01-17 18:32:10,743] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:32:10,747] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 210, epoch: 2, batch: 41, loss: 3.66456, f1 score: 0.31147, speed: 0.17 step/s
global step 220, epoch: 2, batch: 51, loss: 3.70721, f1 score: 0.30044, speed: 1.49 step/s
global step 230, epoch: 2, batch: 61, loss: 3.58504, f1 score: 0.30747, speed: 1.56 step/s
global step 240, epoch: 2, batch: 71, loss: 3.42028, f1 score: 0.31811, speed: 1.51 step/s
eval loss: 3.62001, f1 score: 0.28151, precison: 0.49167, recall: 0.24358
eval loss: 3.46360, f1 score: 0.19137, precison: 0.19433, recall: 0.20766


[2023-01-17 18:33:28,370] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:33:28,374] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 250, epoch: 2, batch: 81, loss: 3.47542, f1 score: 0.29749, speed: 0.17 step/s
global step 260, epoch: 2, batch: 91, loss: 3.65936, f1 score: 0.31107, speed: 1.55 step/s
global step 270, epoch: 2, batch: 101, loss: 3.65753, f1 score: 0.32041, speed: 1.42 step/s
global step 280, epoch: 2, batch: 111, loss: 3.57864, f1 score: 0.32875, speed: 1.38 step/s
eval loss: 3.59629, f1 score: 0.28774, precison: 0.50397, recall: 0.24350
eval loss: 3.40104, f1 score: 0.21154, precison: 0.24304, recall: 0.21705


[2023-01-17 18:34:47,073] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:34:47,077] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 290, epoch: 2, batch: 121, loss: 3.77228, f1 score: 0.33577, speed: 0.17 step/s
global step 300, epoch: 2, batch: 131, loss: 3.53001, f1 score: 0.35737, speed: 1.45 step/s
global step 310, epoch: 2, batch: 141, loss: 3.40752, f1 score: 0.36003, speed: 1.51 step/s
global step 320, epoch: 2, batch: 151, loss: 3.59357, f1 score: 0.35714, speed: 1.45 step/s
eval loss: 3.61059, f1 score: 0.27521, precison: 0.57437, recall: 0.22696
eval loss: 3.37940, f1 score: 0.21188, precison: 0.22992, recall: 0.20996
global step 330, epoch: 2, batch: 161, loss: 3.83676, f1 score: 0.36249, speed: 0.50 step/s
global step 340, epoch: 3, batch: 2, loss: 3.29862, f1 score: 0.36086, speed: 1.53 step/s
global step 350, epoch: 3, batch: 12, loss: 3.25089, f1 score: 0.36824, speed: 1.47 step/s
global step 360, epoch: 3, batch: 22, loss: 3.31865, f1 score: 0.38343, speed: 1.65 step/s
eval loss: 3.57599, f1 score: 0.33382, precison: 0.56835, recall: 0.27351
eval loss: 3.35298, f1 score: 0.210

[2023-01-17 18:36:45,557] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:36:45,560] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 370, epoch: 3, batch: 32, loss: 3.08145, f1 score: 0.46692, speed: 0.17 step/s
global step 380, epoch: 3, batch: 42, loss: 3.63447, f1 score: 0.46465, speed: 1.41 step/s
global step 390, epoch: 3, batch: 52, loss: 3.26290, f1 score: 0.45703, speed: 1.52 step/s
global step 400, epoch: 3, batch: 62, loss: 2.97674, f1 score: 0.45061, speed: 1.49 step/s
eval loss: 3.58062, f1 score: 0.35450, precison: 0.61422, recall: 0.29373
eval loss: 3.44076, f1 score: 0.24938, precison: 0.27475, recall: 0.25457


[2023-01-17 18:38:04,163] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:38:04,167] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 410, epoch: 3, batch: 72, loss: 3.25746, f1 score: 0.45605, speed: 0.17 step/s
global step 420, epoch: 3, batch: 82, loss: 3.23852, f1 score: 0.45543, speed: 1.41 step/s
global step 430, epoch: 3, batch: 92, loss: 3.42116, f1 score: 0.45745, speed: 1.55 step/s
global step 440, epoch: 3, batch: 102, loss: 3.33683, f1 score: 0.46554, speed: 1.65 step/s
eval loss: 3.57411, f1 score: 0.35597, precison: 0.58558, recall: 0.28827
eval loss: 3.30413, f1 score: 0.26980, precison: 0.29452, recall: 0.26257


[2023-01-17 18:39:22,808] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:39:22,812] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 450, epoch: 3, batch: 112, loss: 3.19904, f1 score: 0.46744, speed: 0.17 step/s
global step 460, epoch: 3, batch: 122, loss: 3.63720, f1 score: 0.45721, speed: 1.51 step/s
global step 470, epoch: 3, batch: 132, loss: 2.98776, f1 score: 0.46516, speed: 1.54 step/s
global step 480, epoch: 3, batch: 142, loss: 2.94023, f1 score: 0.46162, speed: 1.46 step/s
eval loss: 3.57407, f1 score: 0.35676, precison: 0.61933, recall: 0.29417
eval loss: 3.35588, f1 score: 0.25421, precison: 0.29025, recall: 0.25915


[2023-01-17 18:40:41,152] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:40:41,156] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 490, epoch: 3, batch: 152, loss: 3.27022, f1 score: 0.43475, speed: 0.17 step/s
global step 500, epoch: 3, batch: 162, loss: 3.06265, f1 score: 0.45471, speed: 1.43 step/s
global step 510, epoch: 4, batch: 3, loss: 3.17545, f1 score: 0.46781, speed: 1.63 step/s
global step 520, epoch: 4, batch: 13, loss: 2.98507, f1 score: 0.47847, speed: 1.41 step/s
eval loss: 3.59649, f1 score: 0.36806, precison: 0.58302, recall: 0.30228
eval loss: 3.30444, f1 score: 0.28010, precison: 0.28724, recall: 0.29316


[2023-01-17 18:41:59,437] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:41:59,443] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 530, epoch: 4, batch: 23, loss: 2.83050, f1 score: 0.52673, speed: 0.17 step/s
global step 540, epoch: 4, batch: 33, loss: 3.01043, f1 score: 0.53154, speed: 1.42 step/s
global step 550, epoch: 4, batch: 43, loss: 2.96290, f1 score: 0.53355, speed: 1.59 step/s
global step 560, epoch: 4, batch: 53, loss: 3.06767, f1 score: 0.53786, speed: 1.37 step/s
eval loss: 3.62657, f1 score: 0.39042, precison: 0.62888, recall: 0.32423
eval loss: 3.37683, f1 score: 0.29188, precison: 0.31715, recall: 0.29769


[2023-01-17 18:43:19,067] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:43:19,070] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 570, epoch: 4, batch: 63, loss: 3.19608, f1 score: 0.53957, speed: 0.17 step/s
global step 580, epoch: 4, batch: 73, loss: 2.80591, f1 score: 0.54612, speed: 1.39 step/s
global step 590, epoch: 4, batch: 83, loss: 2.83892, f1 score: 0.54602, speed: 1.42 step/s
global step 600, epoch: 4, batch: 93, loss: 3.25182, f1 score: 0.54583, speed: 1.43 step/s
eval loss: 3.66902, f1 score: 0.37896, precison: 0.60235, recall: 0.32606
eval loss: 3.39058, f1 score: 0.26985, precison: 0.28794, recall: 0.27480
global step 610, epoch: 4, batch: 103, loss: 2.84032, f1 score: 0.51215, speed: 0.50 step/s
global step 620, epoch: 4, batch: 113, loss: 2.79213, f1 score: 0.54032, speed: 1.38 step/s
global step 630, epoch: 4, batch: 123, loss: 2.85053, f1 score: 0.56429, speed: 1.49 step/s
global step 640, epoch: 4, batch: 133, loss: 2.92140, f1 score: 0.56413, speed: 1.52 step/s
eval loss: 3.64026, f1 score: 0.40388, precison: 0.59021, recall: 0.33830
eval loss: 3.34101, f1 score: 0.258

[2023-01-17 18:45:20,035] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:45:20,039] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 650, epoch: 4, batch: 143, loss: 2.89791, f1 score: 0.51158, speed: 0.17 step/s
global step 660, epoch: 4, batch: 153, loss: 2.86836, f1 score: 0.54498, speed: 1.52 step/s
global step 670, epoch: 4, batch: 163, loss: 3.09798, f1 score: 0.54100, speed: 1.46 step/s
global step 680, epoch: 5, batch: 4, loss: 2.81496, f1 score: 0.55548, speed: 1.44 step/s
eval loss: 3.70512, f1 score: 0.37400, precison: 0.59037, recall: 0.31290
eval loss: 3.32490, f1 score: 0.24955, precison: 0.27040, recall: 0.26541
global step 690, epoch: 5, batch: 14, loss: 3.11175, f1 score: 0.58190, speed: 0.50 step/s
global step 700, epoch: 5, batch: 24, loss: 2.74364, f1 score: 0.60080, speed: 1.54 step/s
global step 710, epoch: 5, batch: 34, loss: 2.79234, f1 score: 0.59827, speed: 1.66 step/s
global step 720, epoch: 5, batch: 44, loss: 2.64291, f1 score: 0.60896, speed: 1.38 step/s
eval loss: 3.77106, f1 score: 0.41119, precison: 0.60048, recall: 0.34550
eval loss: 3.38747, f1 score: 0.30586

[2023-01-17 18:47:17,810] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:47:17,813] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 730, epoch: 5, batch: 54, loss: 3.00727, f1 score: 0.65408, speed: 0.17 step/s
global step 740, epoch: 5, batch: 64, loss: 2.61787, f1 score: 0.63912, speed: 1.48 step/s
global step 750, epoch: 5, batch: 74, loss: 2.96572, f1 score: 0.63108, speed: 1.54 step/s
global step 760, epoch: 5, batch: 84, loss: 2.54783, f1 score: 0.63567, speed: 1.58 step/s
eval loss: 3.81669, f1 score: 0.41298, precison: 0.58019, recall: 0.35026
eval loss: 3.44590, f1 score: 0.29916, precison: 0.31489, recall: 0.31912


[2023-01-17 18:48:35,905] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:48:35,909] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 770, epoch: 5, batch: 94, loss: 2.53716, f1 score: 0.64577, speed: 0.17 step/s
global step 780, epoch: 5, batch: 104, loss: 2.57693, f1 score: 0.63297, speed: 1.45 step/s
global step 790, epoch: 5, batch: 114, loss: 2.67806, f1 score: 0.63766, speed: 1.55 step/s
global step 800, epoch: 5, batch: 124, loss: 2.91243, f1 score: 0.62033, speed: 1.41 step/s
eval loss: 3.81460, f1 score: 0.41498, precison: 0.55833, recall: 0.35860
eval loss: 3.43005, f1 score: 0.30787, precison: 0.30487, recall: 0.33101


[2023-01-17 18:49:54,261] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:49:54,265] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 810, epoch: 5, batch: 134, loss: 2.95571, f1 score: 0.63678, speed: 0.17 step/s
global step 820, epoch: 5, batch: 144, loss: 2.50355, f1 score: 0.63621, speed: 1.41 step/s
global step 830, epoch: 5, batch: 154, loss: 2.71817, f1 score: 0.61971, speed: 1.56 step/s
global step 840, epoch: 5, batch: 164, loss: 2.80428, f1 score: 0.62004, speed: 1.45 step/s
eval loss: 3.82732, f1 score: 0.42371, precison: 0.57591, recall: 0.36592
eval loss: 3.41452, f1 score: 0.30523, precison: 0.29775, recall: 0.32444


[2023-01-17 18:51:12,625] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:51:12,628] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 850, epoch: 6, batch: 5, loss: 2.57413, f1 score: 0.62968, speed: 0.17 step/s
global step 860, epoch: 6, batch: 15, loss: 2.24026, f1 score: 0.65166, speed: 1.49 step/s
global step 870, epoch: 6, batch: 25, loss: 2.41678, f1 score: 0.67193, speed: 1.43 step/s
global step 880, epoch: 6, batch: 35, loss: 2.47935, f1 score: 0.67853, speed: 1.49 step/s
eval loss: 3.96031, f1 score: 0.41603, precison: 0.57917, recall: 0.35870
eval loss: 3.51742, f1 score: 0.29993, precison: 0.29332, recall: 0.32821
global step 890, epoch: 6, batch: 45, loss: 2.18344, f1 score: 0.69638, speed: 0.50 step/s
global step 900, epoch: 6, batch: 55, loss: 2.52250, f1 score: 0.69756, speed: 1.61 step/s
global step 910, epoch: 6, batch: 65, loss: 2.54254, f1 score: 0.69870, speed: 1.49 step/s
global step 920, epoch: 6, batch: 75, loss: 2.58954, f1 score: 0.69404, speed: 1.45 step/s
eval loss: 4.12248, f1 score: 0.40064, precison: 0.58454, recall: 0.35196
eval loss: 3.64168, f1 score: 0.31006, p

[2023-01-17 18:55:11,207] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:55:11,211] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 1050, epoch: 7, batch: 36, loss: 2.22550, f1 score: 0.73342, speed: 0.17 step/s
global step 1060, epoch: 7, batch: 46, loss: 2.20347, f1 score: 0.73228, speed: 1.56 step/s
global step 1070, epoch: 7, batch: 56, loss: 2.20065, f1 score: 0.73084, speed: 1.38 step/s
global step 1080, epoch: 7, batch: 66, loss: 1.97543, f1 score: 0.73994, speed: 1.41 step/s
eval loss: 4.23790, f1 score: 0.41796, precison: 0.57377, recall: 0.36288
eval loss: 3.80046, f1 score: 0.29729, precison: 0.29317, recall: 0.32788
global step 1090, epoch: 7, batch: 76, loss: 2.19041, f1 score: 0.69279, speed: 0.51 step/s
global step 1100, epoch: 7, batch: 86, loss: 2.18563, f1 score: 0.72220, speed: 1.56 step/s
global step 1110, epoch: 7, batch: 96, loss: 2.36460, f1 score: 0.71495, speed: 1.34 step/s
global step 1120, epoch: 7, batch: 106, loss: 2.33747, f1 score: 0.71169, speed: 1.44 step/s
eval loss: 4.31408, f1 score: 0.42144, precison: 0.57216, recall: 0.36443
eval loss: 3.95118, f1 score: 

[2023-01-17 18:57:50,283] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 18:57:50,287] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 1170, epoch: 7, batch: 156, loss: 2.15337, f1 score: 0.73694, speed: 0.17 step/s
global step 1180, epoch: 7, batch: 166, loss: 2.24050, f1 score: 0.72878, speed: 1.60 step/s
global step 1190, epoch: 8, batch: 7, loss: 1.94417, f1 score: 0.72999, speed: 1.48 step/s
global step 1200, epoch: 8, batch: 17, loss: 2.01701, f1 score: 0.74743, speed: 1.47 step/s
eval loss: 4.39105, f1 score: 0.43146, precison: 0.58210, recall: 0.37579
eval loss: 3.90370, f1 score: 0.32863, precison: 0.31554, recall: 0.37981
global step 1210, epoch: 8, batch: 27, loss: 1.81266, f1 score: 0.77360, speed: 0.49 step/s
global step 1220, epoch: 8, batch: 37, loss: 1.68384, f1 score: 0.78153, speed: 1.45 step/s
global step 1230, epoch: 8, batch: 47, loss: 1.55278, f1 score: 0.78863, speed: 1.44 step/s
global step 1240, epoch: 8, batch: 57, loss: 1.90063, f1 score: 0.77823, speed: 1.47 step/s
eval loss: 4.44567, f1 score: 0.43792, precison: 0.56760, recall: 0.38507
eval loss: 3.93742, f1 score: 

[2023-01-17 19:30:14,063] [    INFO] - tokenizer config file saved in Bert_Bilstm_ckpt/tokenizer_config.json
[2023-01-17 19:30:14,068] [    INFO] - Special tokens file saved in Bert_Bilstm_ckpt/special_tokens_map.json


global step 3050, epoch: 19, batch: 8, loss: 0.74896, f1 score: 0.94962, speed: 0.17 step/s
global step 3060, epoch: 19, batch: 18, loss: 0.44952, f1 score: 0.95900, speed: 1.71 step/s
global step 3070, epoch: 19, batch: 28, loss: 0.65855, f1 score: 0.95804, speed: 1.50 step/s
global step 3080, epoch: 19, batch: 38, loss: 0.67777, f1 score: 0.96008, speed: 1.44 step/s
eval loss: 6.36614, f1 score: 0.45142, precison: 0.53887, recall: 0.40893
eval loss: 5.77169, f1 score: 0.30314, precison: 0.29103, recall: 0.35441
global step 3090, epoch: 19, batch: 48, loss: 0.45143, f1 score: 0.97182, speed: 0.50 step/s
global step 3100, epoch: 19, batch: 58, loss: 0.57301, f1 score: 0.96708, speed: 1.52 step/s
global step 3110, epoch: 19, batch: 68, loss: 0.58417, f1 score: 0.96852, speed: 1.34 step/s
global step 3120, epoch: 19, batch: 78, loss: 0.88021, f1 score: 0.96889, speed: 1.52 step/s
eval loss: 6.39479, f1 score: 0.45082, precison: 0.53466, recall: 0.41155
eval loss: 5.78833, f1 s

In [19]:
# 模型加载
# 加载验证集上效果最好的模型参数
model.set_dict(paddle.load('model_net.pdparams'))

# 加载之前训练好的模型参数
# model.set_dict(paddle.load('/home/aistudio/work/model_state.pdparams'))

# 模型验证
print("ERNIE 3.0 在20分类验证集的最佳表现：", end= " ")
results1 = evaluate(model, criterion, metric, valid_data_loader, label_vocab)
print("ERNIE 2.0 在20分类知乎验证集的最佳表现：", end= " ")
results = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab)

ERNIE 3.0 在20分类验证集的最佳表现： eval loss: 6.37937, f1 score: 0.45334, precison: 0.54576, recall: 0.40965
ERNIE 2.0 在20分类知乎验证集的最佳表现： eval loss: 5.69121, f1 score: 0.31063, precison: 0.30001, recall: 0.35761


In [20]:
def result2tsv(result1,filepath):
    validData = pd.read_csv(filepath,sep='\t')
    dictvalidT =validData.to_dict("list")

    validPred = {}
    validPred["Argument ID"] = dictvalidT["Argument ID"]
    validPred["sentence"] = dictvalidT["sentence"]

    for x in label_vocab:
        validPred[x] = []
    
    for x in range(len(results1)):
        types = results1[x].split(",")
        if types == ['']:
            for y in label_vocab:
                validPred[y].append(0)
        else:  
            for z in label_vocab:
                if z in types:
                    validPred[z].append(1)
                else:
                    validPred[z].append(0)
    validData = pd.read_csv(filepath,sep='\t')
    for x in label_vocab:
        for y in range(len(validData[x])):
            validData[x].iloc[y] = validPred[x][y]
    
    validData.drop(columns=["sentence"],inplace=True)
    return validData

In [21]:
%cd /home/aistudio/data

/home/aistudio/data


In [22]:
import pandas as pd
valid = result2tsv(results1,"validation.tsv")
valid.to_csv('validbert.tsv',columns=valid.columns.tolist(),
            sep='\t',
            index=False)

validzhihu = result2tsv(results,"zhihu_validation.tsv")
validzhihu.to_csv('validbertzhihu.tsv',columns=validzhihu.columns.tolist(),
            sep='\t',
            index=False)

In [26]:
# 预测函数，对测试集结果进行预测
def predict(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    model.eval()
    results = []
    for batch in data_loader:
        input_ids, token_type_ids = batch['input_ids'], batch['token_type_ids']
        logits = model(input_ids, token_type_ids)
        probs = logits
        if if_return_results:
            probs = probs.tolist()
            for prob in probs:
                result = []
                for c, pred in enumerate(prob):
                    if pred > 0:
                        result.append(label_vocab[c])
                        # result.append(str(c))
                results.append(','.join(result))
    if if_return_results:
        return results
    else:
        return f1_score

In [24]:
def testOutput(results2,filepath):
    testData = pd.read_csv(filepath,sep='\t')
    dicttestT =testData.to_dict("list")

    testPred = {}
    testPred["Argument ID"] = dicttestT["Argument ID"]
    testPred["sentence"] = dicttestT["sentence"]

    for x in label_vocab:
        testPred[x] = []

    for x in range(len(results2)):
        types = results2[x].split(",")
        if types == ['']:
            for y in label_vocab:
                testPred[y].append(0)
        else:  
            for z in label_vocab:
                if z in types:
                    testPred[z].append(1)
                else:
                    testPred[z].append(0)
    testPredD = pd.DataFrame.from_dict(testPred)
    testPredD.drop(columns=["sentence"],inplace=True)

    return testPredD

In [27]:
results2 = predict(model, criterion, metric, test_data_loader, label_vocab)
test = testOutput(results2,"test.tsv")
test.to_csv('testBert.tsv',columns=test.columns.tolist(),
            sep='\t',
            index=False)

In [31]:
results3 = predict(model, criterion, metric, test2_data_loader, label_vocab)
test2 = testOutput(results3,"test2.tsv")
test2.to_csv('test222Bert.tsv',columns=test2.columns.tolist(),
            sep='\t',
            index=False)