# 文本分类实例

In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import load_dataset

# 加载数据

In [28]:
# Load the hotel-review CSV; a CSV file is exposed as a single 'train' split.
raw_reviews = load_dataset('csv', data_files='../ChnSentiCorp_htl_all.csv', split='train')

# Data cleaning: drop rows whose review text is missing, they cannot be tokenized.
clean_reviews = raw_reviews.filter(lambda x: x['review'] is not None)

# Hold out 20% of the rows for evaluation. The seed is fixed so that
# "Restart & Run All" always produces the same train/test partition and the
# reported metrics stay comparable between runs.
dataset = clean_reviews.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

# 数据预处理

In [29]:
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")


def preprocess_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch from the dataset, read through its "review" and
            "label" columns.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens) with an extra "labels" entry copied from the "label" column.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded


# Tokenize both splits in batches and drop the raw columns, which are
# replaced by the tokenizer output plus "labels".
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["review", "label"],
)
tokenized_datasets

Map:   0%|          | 0/6212 [00:00<?, ? examples/s]

Map:   0%|          | 0/1553 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})

In [30]:
from transformers import DataCollatorWithPadding    

# 创建模型

In [65]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA instead of crashing here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained encoder with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3").to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 创建评估函数

In [32]:
import evaluate as eva
# Load the accuracy and F1 metrics by name (accuracy first, then F1).
acc_metric, f1_metric = (eva.load(metric_name) for metric_name in ("accuracy", "f1"))

In [33]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``; the predictions are
            reduced to class ids with an argmax over the last axis.

    Returns:
        A single dict holding the accuracy metric's results merged with the
        F1 metric's results.
    """
    scores_per_class, references = eval_predict
    predicted_ids = scores_per_class.argmax(axis=-1)

    results = acc_metric.compute(predictions=predicted_ids, references=references)
    # Merge the F1 result into the accuracy dict so one mapping is returned.
    results.update(f1_metric.compute(predictions=predicted_ids, references=references))
    return results

# Trainer的使用
使用文档：[Trainer 官方文档](https://huggingface.co/docs/transformers/trainer)

In [80]:
# Training configuration, grouped by concern.
train_args = TrainingArguments(
    # --- checkpoints ---
    output_dir='./check_points',        # where results are written
    save_strategy='epoch',
    save_total_limit=2,                 # number of saved models to keep
    # --- batch sizes ---
    per_device_train_batch_size=64,     # training batch size
    per_device_eval_batch_size=128,     # evaluation batch size
    # --- optimisation ---
    learning_rate=1e-5,
    weight_decay=0.01,
    # --- logging ---
    logging_strategy='steps',
    logging_steps=10,                   # logging interval
    run_name='runs',
    # --- evaluation and model selection ---
    eval_strategy='epoch',              # report validation results every epoch
    metric_for_best_model='accuracy',   # metric used to pick the best model
    load_best_model_at_end=True,        # reload the best model after training
)
train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=False,
fp16

In [78]:
# Preprocessing truncates but does not pad, so the collator is what brings the
# examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    # Hand the tokenizer to the Trainer so it is saved next to every
    # checkpoint; otherwise a checkpoint directory cannot be reloaded for
    # inference without separately re-fetching the tokenizer.
    processing_class=tokenizer,
    compute_metrics=eval_metric,
)

# 模型训练

In [79]:
# Run fine-tuning and keep the returned summary so it can be inspected later.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1305,0.314901,0.884095,0.91453
2,0.174,0.331445,0.889891,0.917511
3,0.1187,0.326742,0.884739,0.914066


TrainOutput(global_step=294, training_loss=0.12154225589466744, metrics={'train_runtime': 57.358, 'train_samples_per_second': 324.907, 'train_steps_per_second': 5.126, 'total_flos': 312822818850048.0, 'train_loss': 0.12154225589466744, 'epoch': 3.0})

# 模型评估

In [41]:
# Evaluate on the Trainer's eval_dataset and display the resulting metrics.
eval_results = trainer.evaluate()
eval_results

{'eval_loss': 0.47085630893707275,
 'eval_accuracy': 0.8834513844172569,
 'eval_f1': 0.9137684611719866,
 'eval_runtime': 1.9324,
 'eval_samples_per_second': 803.67,
 'eval_steps_per_second': 100.912,
 'epoch': 3.0}

# 模型预测

In [42]:
seq = '我觉得这家酒店不错'
model.eval()  # switch to evaluation mode before predicting
with torch.inference_mode():
    # Truncate exactly as in training preprocessing so an over-long input
    # cannot exceed what the model was trained on.
    inputs = tokenizer(seq, return_tensors="pt", truncation=True, max_length=128)
    # Move the tensors to wherever the model lives instead of assuming CUDA.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    prediction = torch.argmax(logits, dim=-1)  # predicted class id
    print(f'输入：{seq} \n 模型预测结果：{prediction}')

输入：我觉得这家酒店不错 
 模型预测结果：tensor([1], device='cuda:0')


In [43]:
from transformers import pipeline

# Human-readable names for the two class ids. Both directions of the mapping
# are updated together so the model config stays self-consistent.
id2label = {0: 'negative', 1: 'positive'}
model.config.id2label = id2label
model.config.label2id = {label: idx for idx, label in id2label.items()}

# Run the pipeline on the device the model is already on rather than
# hard-coding GPU 0, which fails on machines without CUDA.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)
pipe(seq)

Device set to use cuda:0


[{'label': 'positive', 'score': 0.9987572431564331}]