In [1]:
import os
import paddle
import paddlenlp

In [2]:
# 自定义数据集
import re

from paddlenlp.datasets import load_dataset

# 清洗无效字符
def clean_text(text):
    text = text.replace("\r", "").replace("\n", "")
    text = re.sub(r"\\n\n", ".", text)
    return text

# 定义读取数据集函数
def read_custom_data(filepath):
    f = open(filepath)
    next(f)
    while True:
        line = f.readline()
        if not line:
            break
        data = line.strip().split('\t')
        labels = [float(d) for d in data[2:]]
        yield {"Argument ID": data[0], "sentence":clean_text(data[1]),"labels": labels}
    f.close()

def read_custom_data_test(filepath):
    f = open(filepath)
    next(f)
    while True:
        line = f.readline()
        if not line:
            break
        data = line.strip().split('\t')
        yield {"Argument ID": data[0], "sentence":clean_text(data[1]),"labels":[]}
    f.close()

In [3]:
%cd /home/aistudio/data

/home/aistudio/data


In [4]:
# load_dataset()创建数据集
# lazy=False，数据集返回为MapDataset类型
# 对训练集和验证集进行预处理
train_ds = load_dataset(read_custom_data, filepath='train.tsv', lazy=False) 
valid_ds = load_dataset(read_custom_data, filepath='validation.tsv', lazy=False)
validZhihu_ds = load_dataset(read_custom_data, filepath='zhihu_validation.tsv', lazy=False)
test_ds = load_dataset(read_custom_data_test, filepath='test.tsv', lazy=False) 
test2_ds = load_dataset(read_custom_data_test, filepath='test2.tsv', lazy=False) 
test3_ds = load_dataset(read_custom_data_test, filepath='test3.tsv', lazy=False) 

In [5]:
import paddle
from paddlenlp.transformers.xlnet.modeling import XLNetForSequenceClassification
from paddlenlp.transformers.xlnet.tokenizer import XLNetTokenizer

num_classes = 20  # 20分类任务
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_classes=num_classes)

[2023-01-22 19:23:26,953] [    INFO] - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/xlnet-large-cased-spiece.model and saved to /home/aistudio/.paddlenlp/models/xlnet-large-cased
[2023-01-22 19:23:26,956] [    INFO] - Downloading xlnet-large-cased-spiece.model from https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/xlnet-large-cased-spiece.model
100%|██████████| 779k/779k [00:00<00:00, 23.1MB/s]
[2023-01-22 19:23:27,226] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/xlnet-large-cased/tokenizer_config.json
[2023-01-22 19:23:27,237] [    INFO] - Special tokens file saved in /home/aistudio/.paddlenlp/models/xlnet-large-cased/special_tokens_map.json
[2023-01-22 19:23:27,241] [    INFO] - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/xlnet-base-cased.pdparams and saved to /home/aistudio/.paddlenlp/models/xlnet-base-cased
[2023-01-22 19:23:27,244] [    INFO] - Downloading xlnet-base-cased.pdparams fr

In [6]:
import functools
import numpy as np

from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# 数据预处理函数，利用分词器将文本转化为整数序列
def preprocess_function(examples, tokenizer, max_seq_length):
    result = tokenizer(text=examples["sentence"], max_seq_len=max_seq_length)
    result["labels"] = examples["labels"]
    return result

trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128)
train_ds = train_ds.map(trans_func)
valid_ds = valid_ds.map(trans_func)
validZhihu_ds = validZhihu_ds.map(trans_func)
test_ds = test_ds.map(trans_func)
test2_ds = test2_ds.map(trans_func)
test3_ds = test3_ds.map(trans_func)

# collate_fn函数构造，将不同长度序列充到批中数据的最大长度，再将数据堆叠
collate_fn = DataCollatorWithPadding(tokenizer)

# 定义BatchSampler，选择批大小和是否随机乱序，进行DataLoader
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
valid_batch_sampler = BatchSampler(valid_ds, batch_size=16, shuffle=False)
validZhihu_batch_sampler = BatchSampler(validZhihu_ds, batch_size=16, shuffle=False)
test_batch_sampler = BatchSampler(test_ds, batch_size=16, shuffle=False)
test2_batch_sampler = BatchSampler(test2_ds, batch_size=16, shuffle=False)
test3_batch_sampler = BatchSampler(test3_ds, batch_size=16, shuffle=False)

train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
valid_data_loader = DataLoader(dataset=valid_ds, batch_sampler=valid_batch_sampler, collate_fn=collate_fn)
validZhihu_data_loader = DataLoader(dataset=validZhihu_ds, batch_sampler=validZhihu_batch_sampler, collate_fn=collate_fn)
test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=collate_fn)
test2_data_loader = DataLoader(dataset=test2_ds, batch_sampler=test2_batch_sampler, collate_fn=collate_fn)
test3_data_loader = DataLoader(dataset=test3_ds, batch_sampler=test3_batch_sampler, collate_fn=collate_fn)

In [7]:
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from paddle.metric import Metric

# 自定义MultiLabelReport评价指标
class MultiLabelReport(Metric):
    """
    AUC and F1 Score for multi-label text classification task.
    """

    def __init__(self, name='MultiLabelReport', average='macro'):
        super(MultiLabelReport, self).__init__()
        self.average = average
        self._name = name
        self.reset()

    # def f1_score(self, y_prob):
    #     '''
    #     Returns the f1 score by searching the best threshhold
    #     '''
    #     best_score = 0
    #     for threshold in [i * 0.01 for i in range(100)]:
    #         self.y_pred = y_prob > threshold
    #         score = sklearn.metrics.f1_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
    #         if score > best_score:
    #             best_score = score
    #             precison = precision_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
    #             recall = recall_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
    #     return best_score, precison, recall

    def f1_score(self, y_prob):
        '''
        Returns the f1 score by searching the best threshhold
        '''
        thresholds =0
        self.y_pred = y_prob > thresholds
        score = sklearn.metrics.f1_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        precison = precision_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        recall = recall_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        return score, precison, recall

    def reset(self):
        """
        Resets all of the metric state.
        """
        self.y_prob = None
        self.y_true = None

    def update(self, probs, labels):
        if self.y_prob is not None:
            self.y_prob = np.append(self.y_prob, probs.numpy(), axis=0)
        else:
            self.y_prob = probs.numpy()
        if self.y_true is not None:
            self.y_true = np.append(self.y_true, labels.numpy(), axis=0)
        else:
            self.y_true = labels.numpy()

    def accumulate(self):
        # auc = roc_auc_score(
        #     y_score=self.y_prob, y_true=self.y_true, average=self.average)
        f1_score, precison, recall = self.f1_score(y_prob=self.y_prob)
        # return auc, f1_score, precison, recall
        return f1_score, precison, recall
    

    def name(self):
        """
        Returns metric name
        """
        return self._name

In [8]:
import numpy as np
def multilabel_categorical_crossentropy(y_true, y_pred):
    """多标签分类的交叉熵
    说明：y_true和y_pred的shape一致，y_true的元素非0即1，
         1表示对应的类为目标类，0表示对应的类为非目标类。
    警告：请保证y_pred的值域是全体实数，换言之一般情况下y_pred
         不用加激活函数，尤其是不能加sigmoid或者softmax！预测
         阶段则输出y_pred大于0的类。如有疑问，请仔细阅读并理解
         本文。
    假如类别总数为10
    label ：[0,1,0,0,0,0,0,0,0,1]  代表条数据被标注为 2,10 属于 2类也属于10类
    输出也为10类别 输出维度也为10。
    类别从1位置开始0位置代表阈值s就是输出的维度第一个位置是阈值预测
    目标类的分数都大于s，非目标类的分数都小于s
    这里阈值s默认为0故而可忽略只要类从1开始就可
    """
    y_pred = (1 - 2 * y_true) * y_pred
    y_pred_neg = y_pred - y_true * 1e12
    y_pred_pos = y_pred - (1 - y_true) * 1e12


    zeros = paddle.zeros_like(y_pred[..., :1])

    y_pred_neg = paddle.concat((y_pred_neg, zeros), axis=-1)
    y_pred_pos = paddle.concat((y_pred_pos, zeros), axis=-1)


    neg_loss = paddle.logsumexp(y_pred_neg, axis=-1)
    pos_loss = paddle.logsumexp(y_pred_pos, axis=-1)
    return neg_loss + pos_loss

In [9]:
import time
import paddle.nn.functional as F
#定义优化器
from paddlenlp.transformers import LinearDecayWithWarmup

epochs = 20

train_steps_per_epoch=len(train_data_loader)
num_training_steps=train_steps_per_epoch*epochs


# AdamW优化器、交叉熵损失函数、自定义MultiLabelReport评价指标
criterion = multilabel_categorical_crossentropy
metric = MultiLabelReport()
scheduler = LinearDecayWithWarmup(4e-5,
    num_training_steps,
    warmup=0
)
optimizer = paddle.optimizer.AdamW(scheduler, parameters=model.parameters(), weight_decay=0.01)

In [10]:
label_vocab = ["Self-direction: thought","Self-direction: action","Stimulation","Hedonism","Achievement","Power: dominance","Power: resources","Face","Security: personal","Security: societal","Tradition","Conformity: rules","Conformity: interpersonal","Humility","Benevolence: caring","Benevolence: dependability","Universalism: concern","Universalism: nature","Universalism: tolerance","Universalism: objectivity"]

In [10]:
import paddle
import numpy as np
import paddle.nn.functional as F

# 构建验证集evaluate函数
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    model.eval()
    metric.reset()
    losses = []
    results = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']
        logits = model(input_ids, token_type_ids)
        loss = criterion(labels,logits)
        # probs = F.sigmoid(logits)
        probs = logits
        loss = loss.mean()
        losses.append(loss.numpy())
        metric.update(probs, labels)
        if if_return_results:
            probs = probs.tolist()
            for prob in probs:
                result = []
                for c, pred in enumerate(prob):
                    if pred > 0:
                        result.append(label_vocab[c])
                        # result.append(str(c))
                results.append(','.join(result))

    # auc, f1_score, precison, recall = metric.accumulate()
    f1_score, precison, recall = metric.accumulate()
    print("eval loss: %.5f, f1 score: %.5f, precison: %.5f, recall: %.5f" %
          (np.mean(losses), f1_score, precison, recall))
    model.train()
    metric.reset()
    if if_return_results:
        return results
    else:
        return f1_score

In [11]:
%cd /home/aistudio/model

/home/aistudio/model


In [12]:
epochs = 20 # 训练轮次
ckpt_dir = "xlnet_222ckpt" # 训练过程中保存模型参数的文件夹

global_step = 0  # 迭代次数
tic_train = time.time()
best_f1_score = 0
best_f1_score2 = 0

# 模型训练
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # 计算模型输出、损失函数值、分类概率值、准确率、f1分数
        logits = model(input_ids, token_type_ids)
        print(logits)
        print(logits.shape)
        print(labels)
        print(labels.shape)
        
        loss = criterion(labels, logits)
        loss = loss.mean()
        probs = logits
        #probs = F.sigmoid(logits)
        metric.update(probs, labels)
        break
        
        # auc, f1_score, _,  _= metric.accumulate()
        f1_score, _,  _= metric.accumulate()


        # 每迭代100次，打印损失函数值、准确率、f1分数、计算速度
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, f1 score: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, f1_score,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()
        
        # 反向梯度回传，更新参数
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        
        #每迭代40次，评估当前训练的模型、保存当前最佳模型参数和分词器的词表等
        if global_step % 40 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            eval_f1_score = evaluate(model, criterion, metric, valid_data_loader, label_vocab, if_return_results=False)
            eval_f1_score2 = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab, if_return_results=False)
            if eval_f1_score > best_f1_score:
                best_f1_score = eval_f1_score
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

        [0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0.,
         0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
         0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 1.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0.,
         0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
         1., 0.],
        [0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 1.],
        [1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0.]])
[32, 20]
Tensor(shape=[32, 20], dtype=float32, place=Place(gpu:0), stop_gradient=False,


In [16]:
cd /home/aistudio/model

/home/aistudio/model


In [17]:
# 模型加载
# 加载最后的模型参数
model.set_dict(paddle.load('xlnet_ckpt/model_state.pdparams'))

# 加载之前训练好的模型参数
# model.set_dict(paddle.load('/home/aistudio/work/model_state.pdparams'))

# 模型验证
print("ERNIE 3.0 在20分类验证集的最佳表现：", end= " ")
results1 = evaluate(model, criterion, metric, valid_data_loader, label_vocab)
print("ERNIE 2.0 在20分类知乎验证集的最佳表现：", end= " ")
results = evaluate(model, criterion, metric, validZhihu_data_loader, label_vocab)

ERNIE 3.0 在20分类验证集的最佳表现： eval loss: 6.68658, f1 score: 0.45161, precison: 0.49746, recall: 0.43436
ERNIE 2.0 在20分类知乎验证集的最佳表现： eval loss: 6.57380, f1 score: 0.32409, precison: 0.27750, recall: 0.42437


In [18]:
def result2tsv(result1,filepath):
    validData = pd.read_csv(filepath,sep='\t')
    dictvalidT =validData.to_dict("list")

    validPred = {}
    validPred["Argument ID"] = dictvalidT["Argument ID"]
    validPred["sentence"] = dictvalidT["sentence"]

    for x in label_vocab:
        validPred[x] = []
    
    for x in range(len(results1)):
        types = results1[x].split(",")
        if types == ['']:
            for y in label_vocab:
                validPred[y].append(0)
        else:  
            for z in label_vocab:
                if z in types:
                    validPred[z].append(1)
                else:
                    validPred[z].append(0)
    validData = pd.read_csv(filepath,sep='\t')
    for x in label_vocab:
        for y in range(len(validData[x])):
            validData[x].iloc[y] = validPred[x][y]
    
    validData.drop(columns=["sentence"],inplace=True)
    return validData

In [14]:
%cd /home/aistudio/data

/home/aistudio/data


In [22]:
import pandas as pd
valid = result2tsv(results1,"validation.tsv")
valid.to_csv('validxlnet.tsv',columns=valid.columns.tolist(),
            sep='\t',
            index=False)

validzhihu = result2tsv(results,"zhihu_validation.tsv")
validzhihu.to_csv('validxlnetzhihu.tsv',columns=validzhihu.columns.tolist(),
            sep='\t',
            index=False)

In [15]:
# 预测函数，对测试集结果进行预测
def predict(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    model.eval()
    results = []
    for batch in data_loader:
        input_ids, token_type_ids = batch['input_ids'], batch['token_type_ids']
        logits = model(input_ids, token_type_ids)
        probs = logits
        if if_return_results:
            probs = probs.tolist()
            for prob in probs:
                result = []
                for c, pred in enumerate(prob):
                    if pred > 0:
                        result.append(label_vocab[c])
                        # result.append(str(c))
                results.append(','.join(result))
    if if_return_results:
        return results
    else:
        return f1_score


In [16]:
def testOutput(results2, filepath):
    testData = pd.read_csv(filepath,sep='\t')
    dicttestT =testData.to_dict("list")

    testPred = {}
    testPred["Argument ID"] = dicttestT["Argument ID"]
    testPred["sentence"] = dicttestT["sentence"]

    for x in label_vocab:
        testPred[x] = []

    for x in range(len(results2)):
        types = results2[x].split(",")
        if types == ['']:
            for y in label_vocab:
                testPred[y].append(0)
        else:  
            for z in label_vocab:
                if z in types:
                    testPred[z].append(1)
                else:
                    testPred[z].append(0)
    testPredD = pd.DataFrame.from_dict(testPred)
    testPredD.drop(columns=["sentence"],inplace=True)

    return testPredD

In [25]:
results2 = predict(model, criterion, metric, test_data_loader, label_vocab)
test = testOutput(results2,"test.tsv")
test.to_csv('testxlnet.tsv',columns=test.columns.tolist(),
            sep='\t',
            index=False)


In [26]:
results3 = predict(model, criterion, metric, test2_data_loader, label_vocab)
test2 = testOutput(results3,"test2.tsv")
test2.to_csv('testxlnet222.tsv',columns=test2.columns.tolist(),
            sep='\t',
            index=False)

In [19]:
import pandas as pd

In [20]:
results4 = predict(model, criterion, metric, test3_data_loader, label_vocab)
test3 = testOutput(results4,"test3.tsv")
test3.to_csv('testxlnet333.tsv',columns=test3.columns.tolist(),
            sep='\t',
            index=False)