# 文本分类实例

step 1 导入相关的包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

step 2 加载数据

In [2]:
# Read the hotel-review CSV as a single "train" split and keep only the rows
# whose "review" field is not None.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda example: example["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

step 4 划分数据集

In [3]:
# Hold out 10% of the rows for validation. train_test_split shuffles before
# splitting, so a fixed seed is passed to get the same partition (and therefore
# comparable metrics) on every Restart & Run All.
datasets = dataset.train_test_split(test_size=0.1, train_size=0.9, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

step 5 创建dataloader

In [6]:
import torch

# Load the tokenizer from the "./rbt3" checkpoint path; it is reused below for
# dataset preprocessing, batch collation and inference.
tokenizer = AutoTokenizer.from_pretrained("./rbt3")

def process_func(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch mapping that provides "review" and "label" entries.

    Returns:
        The tokenizer output for the reviews (truncation enabled with
        max_length=128), with the batch's "label" values stored under "labels".
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

# Apply process_func batch-wise to both splits and drop the original columns so
# that only the fields returned by process_func remain.
tokenized_datasets = datasets.map(
    process_func,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [7]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# DataCollatorWithPadding is the collate function shared by both loaders.
collate_func = DataCollatorWithPadding(tokenizer=tokenizer)

# The "test" split created earlier serves as the validation set.
train_set, valid_set = tokenized_datasets["train"], tokenized_datasets["test"]

# Training batches are shuffled; validation batches keep dataset order.
trainloader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_func,
)
validloader = DataLoader(
    valid_set,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_func,
)

In [10]:
# Sanity check: print the first validation batch in full, then only the shape
# of the first training batch's input_ids.
print(next(iter(validloader)))
print(next(iter(trainloader))["input_ids"].shape)

{'input_ids': tensor([[ 101,  912, 2139,  ..., 2218, 3221,  102],
        [ 101, 2791, 7313,  ..., 1741,  852,  102],
        [ 101, 2600,  860,  ..., 8024, 6432,  102],
        ...,
        [ 101, 2523, 2345,  ...,    0,    0,    0],
        [ 101, 3862,  677,  ..., 2190,  671,  102],
        [ 101, 6821, 3221,  ..., 2769, 1348,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
        1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
        1, 0, 1

step 6 创建模型和优化器

In [11]:
from torch.optim import Adam

# Load the classifier from the same "./rbt3" checkpoint path the tokenizer uses.
model = AutoModelForSequenceClassification.from_pretrained("./rbt3")
# Only move to the GPU when one exists; the training and evaluation loops use
# the same torch.cuda.is_available() guard for their batches, so a CPU-only
# machine now runs end to end instead of failing here.
if torch.cuda.is_available():
    model = model.cuda()
optimizer = Adam(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


step 7 训练模型以及验证

In [None]:
# The library is imported under an alias because the `evaluate()` helper defined
# in the following cell rebinds the bare name `evaluate`. With the alias, this
# cell can be re-run at any time without resolving to that function.
import evaluate as hf_evaluate

# Combined metric object that reports both accuracy and F1.
clf_metrics = hf_evaluate.combine(["accuracy", "f1"])

In [None]:
def evaluate():
    """Score the global model on every batch of the validation loader.

    Puts the model in eval mode, runs each batch from `validloader` under
    torch.inference_mode(), and feeds the argmax predictions together with the
    batch labels into `clf_metrics`.

    Returns:
        The result of `clf_metrics.compute()` over all validation batches.
    """
    use_cuda = torch.cuda.is_available()
    model.eval()
    with torch.inference_mode():
        for batch in validloader:
            if use_cuda:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = logits.argmax(dim=-1)
            clf_metrics.add_batch(
                predictions=predictions.long(),
                references=batch["labels"].long(),
            )
    return clf_metrics.compute()

def train(epoch=3, log_step=100):
    """Fine-tune the global model and evaluate it after each epoch.

    Args:
        epoch: Number of passes over `trainloader`.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value (the counter starts at 0 and is never reset
            between epochs).
    """
    use_cuda = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # evaluate() switches the model to eval mode, so training mode is
        # restored at the start of every epoch.
        model.train()
        for batch in trainloader:
            if use_cuda:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {loss.item()}")
            global_step += 1
        metrics = evaluate()
        print(f"ep: {ep}, clf: {metrics}")

step 8 开始训练

In [13]:
# Run the training loop with its default arguments (epoch=3, log_step=100).
train()

ep: 0, global_step: 0, loss: 1.295169711112976
ep: 0, global_step: 100, loss: 0.21491168439388275
ep: 0, global_step: 200, loss: 0.23082424700260162
ep: 0, acc: 0.8674388527870178
ep: 1, global_step: 300, loss: 0.3235291838645935
ep: 1, global_step: 400, loss: 0.22636674344539642
ep: 1, acc: 0.8893178701400757
ep: 2, global_step: 500, loss: 0.26278364658355713
ep: 2, global_step: 600, loss: 0.11667513847351074
ep: 2, acc: 0.8918918967247009


step 9 模型预测

In [14]:
# Classify a single sentence with the fine-tuned model.
sen = "我觉得这家酒店不错，饭很好吃！"
# Maps the predicted class index to a human-readable label.
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to whichever device the model lives on instead of
    # assuming CUDA, so this cell also works when the model is on the CPU.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(pred)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

tensor([1], device='cuda:0')
输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [15]:
from transformers import pipeline

# Let the pipeline report the human-readable labels instead of class indices.
model.config.id2label = id2_label
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    # GPU 0 when CUDA is present, otherwise -1 (CPU); a hard-coded device=0
    # fails on machines without a GPU.
    device=0 if torch.cuda.is_available() else -1,
)
pipe(sen)

[{'label': '好评！', 'score': 0.994797945022583}]