# 文本分类实例

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification , Trainer , TrainingArguments
from datasets import load_dataset

## Step2 加载数据集

In [2]:
# Read the review CSV as a single "train" split and drop every row whose
# review text is missing, in one chained expression.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda example: example["review"] is not None)
dataset

Generating train split: 7766 examples [00:00, 47937.65 examples/s]
Filter: 100%|██████████| 7766/7766 [00:00<00:00, 44743.15 examples/s]


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 划分数据集

In [3]:
# Hold out 10% of the reviews for evaluation. The seed is fixed so that the
# train/test membership, and therefore every metric computed further down,
# is the same after a kernel restart and full re-run.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 数据集预处理（分词）

In [4]:
import torch

# Load the tokenizer published under the "hfl/rbt3" checkpoint name; the same
# checkpoint name is used for the model further down.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch mapping with a "review" entry (the texts) and a
            "label" entry (the targets).

    Returns:
        The tokenizer output for the reviews, truncated to at most 128
        tokens, with the batch labels stored under the "labels" key.
    """
    reviews = examples["review"]
    encoded = tokenizer(reviews, truncation=True, max_length=128)
    # The targets are stored under "labels" (plural) rather than the CSV's
    # "label" column name.
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize both splits in batches. The original columns are dropped, so only
# the tokenizer outputs and "labels" remain in the mapped datasets.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 12650.33 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 8932.88 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

## Step5 创建模型

In [5]:

# Load "hfl/rbt3" with a sequence-classification head. The recorded load
# message reports that classifier.weight / classifier.bias are newly
# initialized, so the head has to be trained before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step6 配置评估指标、训练参数与Trainer

In [6]:
import evaluate

# Load the two metric objects consumed by `eval_metric`, accuracy first and
# then F1, in a single unpacking assignment.
acc_eval, f1_eval = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)


In [7]:
def eval_metric(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        pred: A pair that unpacks into (model scores, reference labels). The
            predicted class is the arg-max over the last axis of the scores.

    Returns:
        A single dict holding the entries reported by `acc_eval` together
        with those reported by `f1_eval`.
    """
    scores, references = pred
    predicted_ids = scores.argmax(axis=-1)
    accuracy = acc_eval.compute(predictions=predicted_ids, references=references)
    f1_score = f1_eval.compute(predictions=predicted_ids, references=references)
    # Merge both metric dicts into one result for the caller.
    return {**accuracy, **f1_score}


In [8]:
# Only the output directory is set here; every other training option is left
# at the value TrainingArguments chooses when nothing is passed.
train_args = TrainingArguments(output_dir= "./checkpoint")

In [9]:
from transformers import DataCollatorWithPadding
from transformers import Trainer as TrainerClass

# Build the trainer from the class imported under its own alias. The original
# `Trainer = Trainer(...)` rebound the class name to an instance, so running
# this cell a second time failed because the instance is not callable.
trainer = TrainerClass(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # The tokenization step only truncates, so padding is applied per batch
    # by the collator.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)
# The cells that follow call `Trainer.train()`, `Trainer.evaluate()` and
# `Trainer.predict()`, so the instance stays reachable under that name too.
Trainer = trainer

## Step7 模型训练

In [10]:
# Run the training loop. In the recorded run this finished at global step
# 2622 (epoch 3.0) with the training loss logged every 500 steps.
Trainer.train()



Step,Training Loss
500,0.3775
1000,0.3043
1500,0.2438
2000,0.1991
2500,0.1523




TrainOutput(global_step=2622, training_loss=0.2527694753069263, metrics={'train_runtime': 2707.3409, 'train_samples_per_second': 7.743, 'train_steps_per_second': 0.968, 'total_flos': 348770878551552.0, 'train_loss': 0.2527694753069263, 'epoch': 3.0})

In [11]:
# Score the held-out split; the returned dict carries the loss plus the
# accuracy / F1 values produced by `eval_metric`, each prefixed with "eval_".
Trainer.evaluate(tokenized_datasets["test"])



{'eval_loss': 0.49434980750083923,
 'eval_accuracy': 0.8893178893178894,
 'eval_f1': 0.9188679245283019,
 'eval_runtime': 10.2769,
 'eval_samples_per_second': 75.607,
 'eval_steps_per_second': 9.536,
 'epoch': 3.0}

In [12]:
# Run inference over the held-out split; the result exposes the per-class
# scores as `predictions` and the gold targets as `label_ids`.
# NOTE(review): the whole result is echoed as cell output, which dumps the full
# arrays; consider assigning it to a variable and displaying only a summary.
Trainer.predict(tokenized_datasets["test"])



PredictionOutput(predictions=array([[-3.620689  ,  2.6603    ],
       [-4.1341586 ,  3.051038  ],
       [-3.9512756 ,  2.8261774 ],
       ...,
       [ 0.64868444, -1.8509561 ],
       [-4.197096  ,  3.1045926 ],
       [-2.9851227 ,  2.0098712 ]], dtype=float32), label_ids=array([0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1

## Step8 模型预测

In [14]:
# Classify a single sentence by hand: tokenize, forward pass, arg-max.
sen = "我觉得这家酒店不错，饭很好吃！"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    # Move the encoded inputs to the device that holds the model weights.
    # After training, the model may sit on an accelerator while freshly
    # tokenized tensors are created on CPU, which would raise a device-mismatch
    # error in the forward pass.
    inputs = tokenizer(sen, return_tensors="pt").to(model.device)
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [15]:
from transformers import pipeline

# Store the readable label names on the model config so the pipeline reports
# them (the recorded output shows '好评！' as the label).
model.config.id2label = id2_label
# Request GPU 0 only when CUDA is actually available and use CPU (-1)
# otherwise. The recorded run reported "Device set to use cpu", so the
# hard-coded device=0 did not match the machine the notebook ran on.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

Device set to use cpu


In [16]:
# Classify the same sentence through the pipeline; the result is a list with
# one dict holding the predicted label and its score.
pipe(sen)

[{'label': '好评！', 'score': 0.9993784427642822}]