In [26]:
# 导入相关的库
import functools
import os

import paddle
import paddle.nn.functional as F
from paddle.io import BatchSampler, DataLoader
from utils import preprocess_function, read_local_dataset

from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset, MapDataset
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer


In [34]:
# Classify the topic(s) of a single sentence
def predict_classify(sentence):
    """
    Predict the topic labels of a single sentence (multi-label classification).

    Loads the model and tokenizer from ``./checkpoint/`` and the label names
    from ``data/label.txt`` (one label per line; the zero-based line index is
    the class id used to look up a label name), runs the sentence through the
    model, and keeps every label whose sigmoid probability is greater than 0.5.

    Args:
        sentence: The raw text to classify. It is wrapped as
            ``{"sentence": sentence}`` before being handed to
            ``preprocess_function``.

    Returns:
        list[str]: One entry per input example (so a single entry here). Each
        entry is the predicted label names joined by ``","``; it is an empty
        string when no label passes the 0.5 threshold.
    """
    # NOTE(review): the device is hard-coded to GPU, and the model/tokenizer are
    # reloaded from disk on every call.
    paddle.set_device("gpu")
    model = AutoModelForSequenceClassification.from_pretrained("./checkpoint/")
    tokenizer = AutoTokenizer.from_pretrained("./checkpoint/")

    # Position in this list == class index in the model's output vector.
    label_path = os.path.join("data", "label.txt")
    with open(label_path, "r", encoding="utf-8") as f:
        label_list = [line.strip() for line in f]

    data_ds = MapDataset([{"sentence": sentence}])

    trans_func = functools.partial(
        preprocess_function,
        tokenizer=tokenizer,
        max_seq_length=128,
        label_nums=len(label_list),
        is_test=True,
    )

    data_ds = data_ds.map(trans_func)

    # batchify dataset
    collate_fn = DataCollatorWithPadding(tokenizer)
    data_batch_sampler = BatchSampler(data_ds, batch_size=16, shuffle=False)

    data_data_loader = DataLoader(dataset=data_ds, batch_sampler=data_batch_sampler, collate_fn=collate_fn)

    results = []
    model.eval()
    # Inference only: disable gradient tracking so no autograd state is built
    # for the forward pass.
    with paddle.no_grad():
        for batch in data_data_loader:
            logits = model(**batch)
            probs = F.sigmoid(logits).numpy()
            for prob in probs:
                # Multi-label decision: every class above the threshold is kept.
                results.append([i for i, p in enumerate(prob) if p > 0.5])

    # Map class indices back to label names, one comma-joined string per example.
    return [",".join(label_list[r] for r in result) for result in results]

In [33]:
# Debug cell: pull the first batch and compare raw logits with their sigmoid
# probabilities.
# NOTE(review): in the visible cells `data_data_loader` and `model` only exist
# as locals of `predict_classify`; confirm they are defined at notebook scope,
# otherwise this cell fails on a fresh kernel.
batch = next(iter(data_data_loader))
logits = model(**batch)
print(logits)
probs = F.sigmoid(logits).numpy()
print(probs)

Tensor(shape=[1, 7], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [[-3.67354250, -4.37633419,  3.20282364, -4.19195175, -4.40284300,
         -3.83011103, -4.48348856]])
[[0.02475787 0.01241528 0.9609404  0.01489164 0.01209442 0.02124601
  0.01116782]]


In [25]:
# Load data/data.txt through `read_local_dataset` (is_test=True, lazy=False)
# and display the record at index 1.
data_ds = load_dataset(
    read_local_dataset,
    path=os.path.join("data", "data.txt"),
    is_test=True,
    lazy=False,
)
data_ds[1]

{'sentence': '达叔: 看到'}

In [35]:
# Display the current `batch` object as the cell output.
# NOTE(review): relies on `batch` having been assigned by an earlier cell
# (the logits-inspection cell) — confirm the execution order on a fresh kernel.
batch

{'input_ids': Tensor(shape=[1, 7], dtype=int64, place=Place(gpu:0), stop_gradient=True,
        [[1   , 302 , 1985, 12049, 335 , 45  , 2   ]]),
 'token_type_ids': Tensor(shape=[1, 7], dtype=int64, place=Place(gpu:0), stop_gradient=True,
        [[0, 0, 0, 0, 0, 0, 0]])}

In [30]:
# Sanity check: wrap one raw sentence in a MapDataset and read the first
# record back.
dat = MapDataset([{"sentence": "达叔: 看到"}])
dat[0]

{'sentence': '达叔: 看到'}

In [39]:
# Predict the topic labels for a single sentence; the result is a list holding
# one comma-joined string of label names.
predict_classify("中国科幻电影: 严重滞后于世界")

[32m[2023-08-29 12:58:48,793] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load './checkpoint/'.[0m
[32m[2023-08-29 12:58:48,794] [    INFO][0m - Loading configuration file ./checkpoint/config.json[0m
[32m[2023-08-29 12:58:48,796] [    INFO][0m - Loading weights file ./checkpoint/model_state.pdparams[0m
[32m[2023-08-29 12:58:49,250] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
[32m[2023-08-29 12:58:50,184] [    INFO][0m - All model checkpoint weights were used when initializing ErnieForSequenceClassification.
[0m
[32m[2023-08-29 12:58:50,185] [    INFO][0m - All the weights of ErnieForSequenceClassification were initialized from the model checkpoint at ./checkpoint/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ErnieForSequenceClassification for predictions without further training.[0m
[32m[2023-08-29 12:58:50,212] [ 

['人物塑造,价值观念,影视特效,故事情节,演员演技']