# 文本分类实例

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

In [3]:
# Read the hotel-review CSV as a single split and keep only the rows whose
# "review" field is present (rows with a None review are dropped).
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [4]:
# Hold out 10% of the rows as the "test" split. The fixed seed makes the
# shuffle, and therefore the split, reproducible across kernel restarts.
# NOTE(review): the variable name `datasets` is also the name of the library
# module that a later cell imports with `import datasets`; the name is kept
# here because later cells read this variable.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

In [5]:
import torch 

# Load the tokenizer for the "hfl/rbt3" checkpoint (the same checkpoint name
# the model cell loads).
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(example, max_length=128):
    """Tokenize the reviews in ``example`` and attach their labels.

    Uses the module-level ``tokenizer``.

    Args:
        example: Mapping with a ``"review"`` entry (passed to the tokenizer)
            and a ``"label"`` entry.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            128, the value that used to be hard-coded, so existing
            ``map(process_function, ...)`` calls behave as before.

    Returns:
        The tokenizer output with an added ``"labels"`` entry that holds
        ``example["label"]`` unchanged.
    """
    tokenized_examples = tokenizer(example["review"], max_length=max_length, truncation=True)
    # The training loop passes whole batches to the model as keyword
    # arguments, so the label column is stored under the key "labels".
    tokenized_examples["labels"] = example["label"]
    return tokenized_examples

# Tokenize both splits in batches; the original columns are removed so that
# only the fields produced by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=dataset.column_names,
)
tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [10]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Split handles used by the training and evaluation functions.
train_set = tokenized_datasets["train"]
valid_set = tokenized_datasets["test"]

# A single padding collator is shared by both loaders.
padding_collator = DataCollatorWithPadding(tokenizer)

# Training batches are shuffled; validation batches keep dataset order.
train_loader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=padding_collator,
)
valid_loader = DataLoader(
    valid_set,
    batch_size=64,
    shuffle=False,
    collate_fn=padding_collator,
)


In [11]:
# Pull the first collated batch from the training loader for inspection.
next(iter(train_loader))

{'input_ids': tensor([[ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101, 6821, 3221,  ...,    0,    0,    0],
        [ 101, 6421, 6983,  ...,    0,    0,    0],
        ...,
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101,  788,  857,  ...,    0,    0,    0],
        [ 101, 2600,  860,  ..., 3341, 8024,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
        0, 1, 1, 0, 1, 1, 1, 1])}

In [12]:
from torch.optim import Adam

# Load the "hfl/rbt3" checkpoint wrapped for sequence classification.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Use the GPU whenever PyTorch reports one; otherwise leave the model as loaded.
model = model.cuda() if torch.cuda.is_available() else model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

In [15]:
def evaluate():
    """Measure accuracy of the global ``model`` on ``valid_loader``.

    Puts the model in eval mode, runs every validation batch without
    gradient tracking and counts predictions (argmax over the logits) that
    equal the batch's ``"labels"``.

    Returns:
        The number of correct predictions divided by ``len(valid_set)``.
    """
    model.eval()
    correct = 0
    on_gpu = torch.cuda.is_available()
    with torch.inference_mode():
        for batch in valid_loader:
            if on_gpu:
                # Move every tensor of the batch to the GPU.
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = logits.argmax(dim=-1)
            matches = predictions.long() == batch["labels"].long()
            correct += matches.float().sum()
    return correct / len(valid_set)
def train(epoch=5, log_step=100):
    """Fine-tune the global ``model`` on ``train_loader``.

    Args:
        epoch: Number of passes over ``train_loader``.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value (step 0 included).

    After each pass the accuracy returned by ``evaluate()`` is printed.
    """
    on_gpu = torch.cuda.is_available()
    step_count = 0
    for epoch_idx in range(epoch):
        # evaluate() switches the model to eval mode, so training mode is
        # re-enabled at the start of every epoch.
        model.train()
        for batch in train_loader:
            if on_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            if step_count % log_step == 0:
                print(f"ep: {epoch_idx}, global_step: {step_count}, loss: {loss.item()}")
            step_count += 1
        print(f"ep: {epoch_idx}, acc: {evaluate()}")


In [16]:
# Run the fine-tuning loop with its defaults (epoch=5, log_step=100).
train()

ep: 0, global_step: 0, loss: 0.6235082745552063
ep: 0, global_step: 100, loss: 0.45738112926483154
ep: 0, global_step: 200, loss: 0.09849672764539719
ep: 0, acc: 0.8918918967247009
ep: 1, global_step: 300, loss: 0.1366640031337738
ep: 1, global_step: 400, loss: 0.17773212492465973
ep: 1, acc: 0.8957529067993164
ep: 2, global_step: 500, loss: 0.08931885659694672
ep: 2, global_step: 600, loss: 0.15524494647979736
ep: 2, acc: 0.8893178701400757
ep: 3, global_step: 700, loss: 0.13700026273727417
ep: 3, global_step: 800, loss: 0.033404067158699036
ep: 3, acc: 0.8983269333839417
ep: 4, global_step: 900, loss: 0.1966446489095688
ep: 4, global_step: 1000, loss: 0.10962805896997452
ep: 4, acc: 0.88416987657547


In [1]:
# Classify a single sentence with the fine-tuned model.
sen = "我觉得这家酒店不好吃"
# Class index -> display string.
# NOTE(review): presumably label 0 is the negative class and 1 the positive
# class in the CSV — confirm against the dataset.
id2_label = {0:"差评!", 1:"好评!"}
model.eval()
with torch.inference_mode():
    # Same truncation settings as the training-time preprocessing.
    inputs = tokenizer(sen, return_tensors='pt', max_length=128, truncation=True)
    # Follow whatever device the model lives on instead of assuming CUDA, so
    # the cell also runs on CPU-only machines (the model is only moved to the
    # GPU when torch.cuda.is_available()).
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label[pred.item()]}")


NameError: name 'model' is not defined

In [57]:
from transformers import pipeline

# Let the pipeline report the display strings instead of raw class indices.
model.config.id2label = id2_label
# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# instead of hard-coding device 0.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

pipe(sen)

Device set to use cuda:0


[{'label': '差评!', 'score': 0.8165022730827332}]

In [60]:
from transformers import DataCollatorWithPadding
import datasets
from datasets import load_dataset

In [61]:
# Reload the same CSV used at the top of the notebook and again drop the rows
# whose "review" field is None.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)
dataset

Generating train split: 7766 examples [00:00, 71127.78 examples/s]
Filter: 100%|██████████| 7766/7766 [00:00<00:00, 113969.59 examples/s]


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [62]:
def process_function(example, max_length=128):
    """Tokenize the reviews in ``example`` and attach their labels.

    NOTE(review): a function with the same name is defined earlier in this
    notebook; this later definition shadows it.

    Uses the module-level ``tokenizer``.

    Args:
        example: Mapping with a ``"review"`` entry (passed to the tokenizer)
            and a ``"label"`` entry.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            128, the value that used to be hard-coded.

    Returns:
        The tokenizer output with an added ``"labels"`` entry that holds
        ``example["label"]`` unchanged.
    """
    tokenized_examples = tokenizer(example["review"], max_length=max_length, truncation=True)
    tokenized_examples["labels"] = example["label"]
    return tokenized_examples

In [63]:
# Tokenize the whole (unsplit) dataset in batches, dropping the original columns.
tokenized_dataset = dataset.map(
    process_function,
    batched=True,
    remove_columns=dataset.column_names,
)
tokenized_dataset

Map: 100%|██████████| 7765/7765 [00:00<00:00, 7985.03 examples/s]


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 7765
})

In [64]:
# Padding collator built around the notebook's tokenizer.
collator = DataCollatorWithPadding(tokenizer)

In [67]:
# Small shuffled loader (4 examples per batch) used to inspect the collator's output.
dl = DataLoader(
    tokenized_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collator,
)

In [72]:
# Pull the first collated batch from the loader for inspection.
next(iter(dl))

{'input_ids': tensor([[ 101, 2791, 7313, 3191,  749, 4157, 8024, 3300,  671, 3613, 6375, 2769,
         2697, 6230, 3300, 4157, 1927, 3307, 8024, 1369, 1912,  679, 1419, 3193,
         7623, 8024, 1762,  753, 3517, 1296, 4157, 3683, 6772, 6586,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101,  122,  510, 6421, 6983, 2421, 2791, 7313, 2207,  117, 2769,  857,
         4638, 3403, 1114, 7313, 2523