# TextClassification

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Read the review CSV and discard any row that contains a missing value.
data = pd.read_csv('data/waimai_10k.csv').dropna()
data

Unnamed: 0,label,review
0,1,很快，好吃，味道足，量大
1,1,没有送水没有送水没有送水
2,1,非常快，态度好。
3,1,方便，快捷，味道可口，快递给力
4,1,菜味道很棒！送餐很及时！
...,...,...
11982,0,以前几乎天天吃，现在调料什么都不放，
11983,0,昨天订凉皮两份，什么调料都没有放，就放了点麻油，特别难吃，丢了一份，再也不想吃了
11984,0,"凉皮太辣,吃不下都"
11985,0,本来迟到了还自己点！！！


## 创建Dataset

In [3]:
from torch.utils.data import Dataset

In [4]:
class MyDataset(Dataset):
    """Map-style dataset of ``(review, label)`` pairs read from a CSV file.

    Rows containing a missing value are dropped at construction time.

    Args:
        csv_path: Path of the CSV file to read. The file must provide
            ``review`` and ``label`` columns. Defaults to the file used
            throughout this notebook.
    """

    def __init__(self, csv_path: str = 'data/waimai_10k.csv') -> None:
        super().__init__()
        self.data = pd.read_csv(csv_path).dropna()

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair stored at position ``index``."""
        # Look the row up once instead of once per column.
        row = self.data.iloc[index]
        return row['review'], row['label']

    def __len__(self):
        """Return the number of rows left after dropping missing values."""
        return len(self.data)

In [5]:
dataset = MyDataset()
# Show the first five (review, label) pairs as a sanity check.
print(*(dataset[idx] for idx in range(5)), sep='\n')

('很快，好吃，味道足，量大', np.int64(1))
('没有送水没有送水没有送水', np.int64(1))
('非常快，态度好。', np.int64(1))
('方便，快捷，味道可口，快递给力', np.int64(1))
('菜味道很棒！送餐很及时！', np.int64(1))


## 划分数据集

In [6]:
import torch
from torch.utils.data import random_split

# Seed the split so the 90/10 train/validation membership is identical on every run.
trainset, validset = random_split(
    dataset, [0.9, 0.1], generator=torch.Generator().manual_seed(42)
)

In [7]:
# Show the first ten samples of the training split.
print(*(trainset[idx] for idx in range(10)), sep='\n')

('必须要说下，我们7点40订的餐,9点20才到,这家店离我家不远,送货员确很晚才去取餐,晚去了就算了，还不看地图,送到别的地方去了,简直要气死了,超差评！', np.int64(0))
('送得太慢了，都将近两小时', np.int64(0))
('糖醋小排居然是蕃茄酱上的色！极柴！这辈子不会再吃他家的这道菜。牛河里的牛肉不太能确认是否牛肉。菠菜里的花生反而是最好吃的！', np.int64(0))
('味道以及菜品的质量比店内就餐差的太多了', np.int64(0))
('服务很好，昨天吃的，今天补单的。', np.int64(1))
('这么大的风！骑士送来还是热的！速度真心快！', np.int64(1))
('鸡肉有点老，味道不错', np.int64(1))
('还可以不用优惠卷太贵了', np.int64(0))
('量挺多的，味道很不怎么样！', np.int64(0))
('电话没人接，至少迟送了4个小时，以后再也不会买了！幸亏不特别饿，要不会饿死！！！', np.int64(0))


## Dataloader（使用Trainer的情况下不需要Dataloader）

In [8]:
import torch
from torch import nn as nn
# Always bind `device`: later cells call `.to(device)` unconditionally, so
# leaving it undefined on a machine without CUDA would raise NameError there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
tokenizer = AutoTokenizer.from_pretrained('rbt3')


def collate_fn(batch):
    """Tokenize a batch of ``(text, label)`` items into model inputs.

    Every text is truncated or padded to 128 tokens, and the labels are
    attached to the tokenizer output under the ``labels`` key as a tensor.
    """
    texts = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    encoded['labels'] = torch.tensor(labels)
    return encoded

In [10]:
from torch.utils.data import DataLoader

# Training batches are shuffled; validation batches keep the dataset order.
trainloader = DataLoader(
    trainset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)
validloader = DataLoader(
    validset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_fn,
)

In [11]:
# Pull one batch from the loader and move it to the target device.
next(iter(trainloader)).to(device)

{'input_ids': tensor([[ 101, 6843, 7623,  ...,    0,    0,    0],
        [ 101, 2697, 6230,  ...,    0,    0,    0],
        [ 101,  677, 7305,  ...,    0,    0,    0],
        ...,
        [ 101, 1912, 1297,  ...,    0,    0,    0],
        [ 101, 2360,  987,  ...,    0,    0,    0],
        [ 101, 1920,  819,  ...,    0,    0,    0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'labels': tensor([1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 

## 使用Datasets组件进行训练数据处理

In [12]:
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
import torch
from torch.utils.data import dataloader

In [13]:
dataset = load_dataset('csv', data_files='data/waimai_10k.csv', split='train')
# Rebind `dataset` to the filtered result so rows with a missing review are
# excluded from the split and tokenization steps that follow.
dataset = dataset.filter(lambda x: x['review'] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 11987
})

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 10788
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1199
    })
})

In [15]:
tokenizer = AutoTokenizer.from_pretrained('rbt3')


def process_function(examples):
    """Tokenize a batch of reviews and carry their labels along.

    Reviews are truncated to at most 128 tokens; no padding is requested
    here. The ``label`` column is copied to the ``labels`` key.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=128)
    encoded['labels'] = examples['label']
    return encoded


# Tokenize both splits in batches and drop the original columns.
tokenized_datsets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets['train'].column_names,
)

Map:   0%|          | 0/10788 [00:00<?, ? examples/s]

Map: 100%|██████████| 10788/10788 [00:00<00:00, 25003.66 examples/s]
Map: 100%|██████████| 1199/1199 [00:00<00:00, 26645.81 examples/s]


In [16]:
trainset = tokenized_datsets['train']
validset = tokenized_datsets['test']

# Each loader gets its own padding collator built from the tokenizer.
trainloader = DataLoader(
    trainset,
    batch_size=32,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)
validloader = DataLoader(
    validset,
    batch_size=32,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

## 创建模型以及优化器

In [17]:
from torch.optim import Adam

# Pretrained encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained('rbt3', num_labels=2)

# Move the model to `device` only when CUDA is available.
if torch.cuda.is_available():
    model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Bare attribute access (not a call): displays the bound method's repr, which embeds the module tree.
model.parameters

<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [19]:
# Adam over the parameters of the model instance that exists at this point, with learning rate 1e-5.
optimizer = Adam(model.parameters(), lr=1e-5)

In [20]:
# Smoke test: run one training batch through the model on the target device.
model(**next(iter(trainloader)).to(device))

SequenceClassifierOutput(loss=tensor(0.6700, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.6462e-01, -1.6899e-01],
        [ 5.3242e-01, -2.7437e-01],
        [ 5.8368e-01, -7.6947e-02],
        [ 5.4292e-01, -2.8252e-01],
        [ 4.7646e-01, -1.5980e-01],
        [ 4.4355e-01, -5.8920e-02],
        [ 5.1165e-01, -1.8159e-01],
        [ 6.4262e-01, -2.8012e-01],
        [ 5.8919e-01, -1.5995e-01],
        [ 5.8460e-01,  2.3031e-02],
        [ 4.0271e-01, -2.3978e-01],
        [ 4.1833e-01, -4.3947e-02],
        [ 3.6583e-01, -1.5610e-01],
        [ 4.2483e-01, -4.0977e-01],
        [ 4.3035e-01, -4.5206e-02],
        [ 5.0982e-01, -1.5582e-01],
        [ 4.2955e-01, -3.6076e-01],
        [ 4.8478e-01, -2.6383e-02],
        [ 3.9208e-01, -5.3184e-04],
        [ 3.9820e-01, -1.7987e-01],
        [ 4.6233e-01, -2.0679e-01],
        [ 4.9692e-01, -1.7802e-01],
        [ 4.5194e-01, -2.0234e-01],
        [ 6.0594e-01, -2.3995e-01],
        [ 4.8930e-01, -1.8833e-01],
  

In [21]:
# Load a fresh copy of the pretrained checkpoint, rebinding `model`.
# NOTE(review): the `optimizer` built in an earlier cell was given the previous
# model's parameters, and this new instance is not moved to `device` in this
# cell. Confirm that is intended before reusing the manual training loop.
model = AutoModelForSequenceClassification.from_pretrained('rbt3', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import evaluate

# Accuracy and F1 metric objects, loaded in that order.
acc_metrics, f1_metrics = (evaluate.load(name) for name in ('accuracy', 'f1'))

Using the latest cached version of the module from /home/bigorange/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Mon Jan 13 22:20:35 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /home/bigorange/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Mon Jan 13 22:32:14 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.


In [23]:
def eval_metrics(eval_predict):
    """Compute accuracy and F1 for one set of predictions.

    Args:
        eval_predict: Pair of ``(predictions, labels)``. The index of the
            largest value along the last axis of ``predictions`` is used as
            the predicted class.

    Returns:
        The accuracy result dict updated with the F1 result dict.
    """
    scores, references = eval_predict
    predicted = scores.argmax(axis=-1)
    results = acc_metrics.compute(predictions=predicted, references=references)
    results.update(f1_metrics.compute(predictions=predicted, references=references))
    return results

## 创建Trainer

In [24]:
# Only the output directory is specified; every other training argument keeps its default.
train_args = TrainingArguments(output_dir='checkpoints')

In [25]:
train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_us

In [26]:
# Wire the model, tokenized splits, padding collator and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datsets['train'],
    eval_dataset=tokenized_datsets['test'],
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=eval_metrics,
)

In [27]:
trainer.train()

Step,Training Loss
500,0.3864
1000,0.348
1500,0.3041
2000,0.2552
2500,0.2697
3000,0.215
3500,0.1966
4000,0.199


TrainOutput(global_step=4047, training_loss=0.27135155811880085, metrics={'train_runtime': 144.3816, 'train_samples_per_second': 224.156, 'train_steps_per_second': 28.03, 'total_flos': 272501379307728.0, 'train_loss': 0.27135155811880085, 'epoch': 3.0})

In [28]:
trainer.evaluate()

{'eval_loss': 0.34149298071861267,
 'eval_accuracy': 0.9115929941618015,
 'eval_f1': 0.8637532133676092,
 'eval_runtime': 2.8404,
 'eval_samples_per_second': 422.126,
 'eval_steps_per_second': 52.81,
 'epoch': 3.0}

## Model Training

In [22]:
def train(epoch=1, log_step=100):
    """Run a manual training loop for the global ``model`` over ``trainloader``.

    After every epoch the value returned by ``evaluate()`` is printed.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: The loss is printed whenever the running step count is a
            multiple of this value (step 0 included).
    """
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            # `.to(device)` is called on the batch object itself rather than
            # rebuilding a dict of per-key tensors.
            batch.to(device)
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if not global_step % log_step:
                print(f'epoch: {ep}, global_step: {global_step}, loss: {output.loss.item()}')
            global_step += 1
        acc = evaluate()
        print(f'epoch: {ep}, acc: {acc}')
            
def evaluate():
    """Return the fraction of ``validset`` samples the global ``model`` classifies correctly."""
    model.eval()
    correct = 0
    with torch.inference_mode():
        for batch in validloader:
            batch.to(device)
            logits = model(**batch).logits
            hits = logits.argmax(dim=1) == batch['labels']
            correct += hits.sum().item()
    return correct / len(validset)

In [23]:
import evaluate

clf_metrics = evaluate.combine(['accuracy', 'f1'])
def evaluate():
    """Score the global ``model`` on ``validloader`` using ``clf_metrics``.

    Each batch's argmax predictions and reference labels are added to
    ``clf_metrics``; the metrics are computed once after the last batch.

    Returns:
        Whatever ``clf_metrics.compute()`` returns for the accumulated batches.
    """
    model.eval()
    with torch.inference_mode():
        for batch in validloader:
            # `.to(device)` is called on the batch object itself rather than
            # rebuilding a dict of per-key tensors.
            batch.to(device)
            output = model(**batch)
            pred = output.logits.argmax(dim=1)
            clf_metrics.add_batch(predictions=pred.long(), references=batch['labels'].long())
    return clf_metrics.compute()

def train(epoch=1, log_step=100):
    """Run a manual training loop for the global ``model`` over ``trainloader``.

    After every epoch the metrics returned by ``evaluate()`` are printed.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: The loss is printed whenever the running step count is a
            multiple of this value (step 0 included).
    """
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            # `.to(device)` is called on the batch object itself rather than
            # rebuilding a dict of per-key tensors.
            batch.to(device)
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if not global_step % log_step:
                print(f'epoch: {ep}, global_step: {global_step}, loss: {output.loss.item()}')
            global_step += 1
        clf = evaluate()
        print(f'epoch: {ep}, acc: {clf}')


In [24]:
%%time
train()

epoch: 0, global_step: 0, loss: 0.620169997215271
epoch: 0, global_step: 100, loss: 0.3548542559146881
epoch: 0, global_step: 200, loss: 0.3758608102798462
epoch: 0, global_step: 300, loss: 0.3242703676223755
epoch: 0, acc: {'accuracy': 0.8890742285237698, 'f1': 0.823841059602649}
CPU times: user 33.2 s, sys: 642 ms, total: 33.8 s
Wall time: 1min 27s


In [62]:
sen = '来吃！'
id2_lable = {0: 'negative', 1: 'positive'}

# Classify one sentence inside inference mode and print the mapped label.
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt').to(device)
    output = model(**inputs)
    pred = output.logits.argmax(dim=1)
    print(f'input: {sen}, prediction: {id2_lable.get(pred.item())}')

input: 来吃！, prediction: negative


In [63]:
from transformers import pipeline

# Attach the id -> label-name mapping to the model config before building the pipeline.
model.config.id2label = id2_lable
# Text-classification pipeline built from the in-memory model and tokenizer on `device`.
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

In [64]:
pipe(sen)

[{'label': 'negative', 'score': 0.8409010171890259}]