In [None]:

import paddlenlp
from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer

In [None]:
# 得到数据集字典
def open_func(file_path):
    """Read the data rows of a tab-separated file.

    The first line of the file is treated as a header and skipped. Every
    remaining line is stripped of surrounding whitespace and kept only if it
    still contains at least two tab-separated fields.

    Args:
        file_path: Path of the UTF-8 encoded TSV file to read.

    Returns:
        A list of stripped row strings (fields still joined by tabs).
    """
    rows = []
    # The context manager guarantees the handle is closed, even on error.
    with open(file_path, 'r', encoding='utf8') as tsv_file:
        next(tsv_file, None)  # skip the header line (no-op on an empty file)
        for raw_line in tsv_file:
            row = raw_line.strip()
            if len(row.split('\t')) >= 2:
                rows.append(row)
    return rows

# Raw rows of every split, keyed first by dataset name and then by split name.
data_dict = {
    'SBIC': {
        'test': open_func('SBIC/test.tsv'),
        'dev': open_func('SBIC/dev.tsv'),
        'train': open_func('SBIC/train.tsv'),
    },
}

In [None]:
# 定义数据集
from paddle.io import Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
# Presumably the class ids of the binary task; not referenced by any other
# cell visible here -- TODO confirm whether it is still needed.
label_list = [0, 1]


class MyDataset(Dataset):
    """Map-style dataset that tokenizes tab-separated text rows on access.

    Each element of ``data`` is one row string whose fields are separated by
    tabs. The last field is the text; in training/evaluation mode the
    second-to-last field is the integer class label.

    Args:
        data: Sequence of tab-separated row strings.
        tokenizer: Object whose ``encode(text, max_seq_len=...)`` returns a
            mapping containing an ``'input_ids'`` entry.
        max_len: Maximum sequence length passed to the tokenizer.
        for_test: When True, items contain only token ids (no label).
    """

    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self._data)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            In test mode, an int64 array of token ids. Otherwise a tuple
            ``(token_ids, label)`` where ``label`` is a 0-d int64 array.

        Raises:
            ValueError: In training/evaluation mode, if the label field is
                not an integer.
        """
        fields = self._data[idx].split('\t')

        # Only parse the label when it is actually returned: inference rows
        # may carry no (or a non-numeric) second-to-last column, and parsing
        # it unconditionally would fail for no reason.
        label = None
        if not self._for_test:
            label = int(fields[-2])

        encoded = self._tokenizer.encode(fields[-1], max_seq_len=self._max_len)
        token_ids = np.array(encoded['input_ids'], dtype='int64')

        if self._for_test:
            return token_ids
        return token_ids, np.array(label, dtype='int64')

def batchify_fn(for_test=False, pad_val=None):
    """Build the collate function that turns a list of samples into a batch.

    Args:
        for_test: When True, samples are bare token-id arrays and the collate
            function returns one padded 2-D array. Otherwise samples are
            ``(token_ids, label)`` pairs and the collate function returns a
            list ``[padded_token_ids, stacked_labels]``.
        pad_val: Token id used to pad shorter sequences. When omitted, the
            pad id of the notebook-level ``tokenizer`` is used, which keeps
            existing ``batchify_fn(for_test)`` calls working.

    Returns:
        A callable taking the list of samples of one batch.
    """
    if pad_val is None:
        # Backward-compatible fallback to the global tokenizer.
        pad_val = tokenizer.pad_token_id

    if for_test:
        pad = Pad(axis=0, pad_val=pad_val)
        # np.vstack replaces np.row_stack, a deprecated alias in NumPy 2.0.
        return lambda samples, fn=pad: np.vstack(list(fn(samples)))

    pad_and_stack = Tuple(Pad(axis=0, pad_val=pad_val), Stack())
    return lambda samples, fn=pad_and_stack: list(fn(samples))


def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False):
    """Create a DataLoader over ``data`` tokenized with ``tokenizer``.

    Args:
        data: Sequence of tab-separated row strings (see ``MyDataset``).
        tokenizer: Tokenizer used both to encode the text and to supply the
            padding token id.
        batch_size: Number of samples per batch.
        max_len: Maximum sequence length passed to the tokenizer.
        for_test: When True, batches carry no labels and are not shuffled.

    Returns:
        The configured ``DataLoader``.
    """
    dataset = MyDataset(data, tokenizer, max_len, for_test)
    # Pad with the id of the tokenizer that was passed in, rather than
    # whatever object happens to be bound to the global name `tokenizer`.
    collate_fn = batchify_fn(for_test, pad_val=tokenizer.pad_token_id)
    return DataLoader(dataset=dataset,
                      batch_size=batch_size,
                      collate_fn=collate_fn,
                      shuffle=not for_test)

In [None]:
import paddle
from paddle.static import InputSpec

In [None]:
# Model and tokenizer
model = SkepForSequenceClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=2)
tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')

# Settings
data_name = 'SBIC'  # change this value to switch datasets

## Training hyperparameters
epochs = 5
learning_rate = 2e-5
batch_size = 16
max_len = 256

## Data loaders
train_dataloader = get_data_loader(
    data_dict[data_name]['train'],
    tokenizer,
    batch_size=batch_size,
    max_len=max_len,
    for_test=False,
)
# Only the SBIC dataset provides a dev split here; otherwise skip evaluation.
dev_dataloader = (
    get_data_loader(
        data_dict[data_name]['dev'],
        tokenizer,
        batch_size=batch_size,
        max_len=max_len,
        for_test=False,
    )
    if data_name == 'SBIC'
    else None
)

# Wrap the network in the high-level paddle.Model API.
# NOTE(review): `input` shadows the Python builtin of the same name.
# NOTE(review): the label spec declares shape (-1, 2), while the data loader
# appears to yield one class id per sample -- confirm the intended shape.
input = InputSpec((-1, -1), dtype='int64', name='input')
label = InputSpec((-1, 2), dtype='int64', name='label')
model = paddle.Model(model, [input], [label])

In [None]:
# Prepare the model: attach the optimizer, the loss and the metric

optimizer = paddle.optimizer.Adam(
    learning_rate=learning_rate,
    parameters=model.parameters(),
)
model.prepare(
    optimizer,
    loss=paddle.nn.CrossEntropyLoss(),
    metrics=[paddle.metric.Accuracy()],
)

[2021-06-18 15:45:38,266] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/skep_ernie_1.0_large_ch/skep_ernie_1.0_large_ch.pdparams
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py:1297: UserWarning: Skip loading for classifier.weight. classifier.weight is not found in the provided dict.
  warnings.warn(("Skip loading for {}. ".format(key) + str(err)))
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py:1297: UserWarning: Skip loading for classifier.bias. classifier.bias is not found in the provided dict.
  warnings.warn(("Skip loading for {}. ".format(key) + str(err)))
[2021-06-18 15:45:49,343] [    INFO] - Found /home/aistudio/.paddlenlp/models/skep_ernie_1.0_large_ch/skep_ernie_1.0_large_ch.vocab.txt

In [None]:
# Start training
# NOTE(review): with eval_freq=5 and 5 epochs the captured log shows a single
# evaluation (after the first epoch) -- confirm that this is intended.
model.fit(
    train_dataloader,
    dev_dataloader,
    batch_size=batch_size,
    epochs=epochs,
    eval_freq=5,
    log_freq=200,
    save_dir='./checkpoints',
    save_freq=1,
)

In [None]:
The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5

/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
  return (isinstance(seq, collections.Sequence) and

step 200/600 - loss: 0.2172 - acc: 0.8881 - 647ms/step
step 400/600 - loss: 0.1809 - acc: 0.9062 - 650ms/step
step 600/600 - loss: 0.1273 - acc: 0.9123 - 648ms/step
save checkpoint at /home/aistudio/checkpoints/0
Eval begin...
step 75/75 - loss: 0.0813 - acc: 0.9450 - 229ms/step
Eval samples: 1200
Epoch 2/5
step 200/600 - loss: 0.0296 - acc: 0.9706 - 647ms/step
step 400/600 - loss: 0.0918 - acc: 0.9675 - 649ms/step
step 600/600 - loss: 0.0284 - acc: 0.9675 - 649ms/step
save checkpoint at /home/aistudio/checkpoints/1
Epoch 3/5
step 200/600 - loss: 0.0048 - acc: 0.9838 - 647ms/step
step 400/600 - loss: 0.2752 - acc: 0.9827 - 644ms/step
step 600/600 - loss: 0.0124 - acc: 0.9834 - 645ms/step
save checkpoint at /home/aistudio/checkpoints/2
Epoch 4/5
step 200/600 - loss: 0.0023 - acc: 0.9866 - 652ms/step
step 400/600 - loss: 0.0060 - acc: 0.9881 - 653ms/step
step 600/600 - loss: 0.0047 - acc: 0.9866 - 649ms/step
save checkpoint at /home/aistudio/checkpoints/3
Epoch 5/5
step 200/600 - loss: 7.8751e-04 - acc: 0.9884 - 650ms/step
step 400/600 - loss: 0.0011 - acc: 0.9905 - 648ms/step
step 600/600 - loss: 0.0012 - acc: 0.9908 - 649ms/step
save checkpoint at /home/aistudio/checkpoints/4
save checkpoint at /home/aistudio/checkpoints/final