# TextClassification

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
import pandas as pd

data = pd.read_csv('data/waimai_10k.csv')
data = data.dropna()
data

Unnamed: 0,label,review
0,1,很快，好吃，味道足，量大
1,1,没有送水没有送水没有送水
2,1,非常快，态度好。
3,1,方便，快捷，味道可口，快递给力
4,1,菜味道很棒！送餐很及时！
...,...,...
11982,0,以前几乎天天吃，现在调料什么都不放，
11983,0,昨天订凉皮两份，什么调料都没有放，就放了点麻油，特别难吃，丢了一份，再也不想吃了
11984,0,"凉皮太辣,吃不下都"
11985,0,本来迟到了还自己点！！！


## 创建Dataset

In [3]:
from torch.utils.data import Dataset

In [4]:
class MyDataset(Dataset):
    
    def __init__(self) -> None:
        super().__init__()
        self.data = pd.read_csv('data/waimai_10k.csv')
        self.data = self.data.dropna()
        
    def __getitem__(self, index):
        return self.data.iloc[index]['review'], self.data.iloc[index]['label']
    
    def __len__(self):
        return len(self.data)

In [5]:
dataset = MyDataset()
for i in range(5):
    print(dataset[i])

('很快，好吃，味道足，量大', 1)
('没有送水没有送水没有送水', 1)
('非常快，态度好。', 1)
('方便，快捷，味道可口，快递给力', 1)
('菜味道很棒！送餐很及时！', 1)


## 划分数据集

In [6]:
from torch.utils.data import random_split

trainset, validset = random_split(dataset, [0.9, 0.1])

In [7]:
for i in range(10):
    print(trainset[i])

('稻香村的点心自然不必多说了，就是好吃！外卖小哥服务也非常好～赞\\(≧▽≦)/！', 1)
('没有按照要求,去掉cheese片', 0)
('等了137分钟，也不知道是不是想让我吃晚饭，公司同事都在工作，就我在吃饭，老板看我的眼神都那样……', 0)
('外卖比在店里吃差很多⋯⋯', 0)
('加盟店吧，太难吃了，城里的魏家凉皮都超好吃，回龙观店是否可以学学再卖，不要毁了招牌', 0)
('点了两瓶啤酒就送了一瓶', 0)
('等了75分钟，我要的星冰乐都变成水了，还有一杯热的焦糖玛奇朵，都变常温了，还撒了好多，真是太次的一次外卖了', 0)
('十一点多订的，等了半个小时打电话那边才确认订单，葱香牛肉焗饭偏咸', 0)
('味道还好，就是送餐太慢了', 0)
('好，味道不错，准时送达。', 1)


## Dataloader

In [8]:
import torch
from torch import nn as nn
if torch.cuda.is_available():
    device = torch.device("cuda")

In [9]:
tokenizer = AutoTokenizer.from_pretrained('rbt3')
def collate_fn(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(item[0])
        labels.append(item[1])
    inputs = tokenizer(texts, max_length=128, padding='max_length', truncation=True, return_tensors='pt')
    inputs['labels'] = torch.tensor(labels)
    return inputs

In [10]:
from torch.utils.data import DataLoader

trainloader = DataLoader(trainset, batch_size=64, shuffle=True, collate_fn=collate_fn)
validloader = DataLoader(validset, batch_size=8, shuffle=False, collate_fn=collate_fn)

In [11]:
next(enumerate(trainloader))[1].to(device)

{'input_ids': tensor([[ 101, 6820,  679,  ...,    0,    0,    0],
        [ 101, 6887, 5831,  ...,    0,    0,    0],
        [ 101,  676, 1282,  ...,    0,    0,    0],
        ...,
        [ 101, 4649, 6028,  ...,    0,    0,    0],
        [ 101, 6631, 3198,  ...,    0,    0,    0],
        [ 101, 2697, 6230,  ...,    0,    0,    0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'labels': tensor([1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 

## 创建模型以及优化器

In [12]:
from torch.optim import Adam

model = AutoModelForSequenceClassification.from_pretrained('rbt3', num_labels=2)

if torch.cuda.is_available():
        model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
model.parameters

<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [14]:
optimizer = Adam(model.parameters(), lr=1e-5)

In [15]:
model(**next(enumerate(trainloader))[1].to(device))

SequenceClassifierOutput(loss=tensor(0.8512, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0237,  0.5955],
        [ 0.1887,  0.4876],
        [ 0.1952,  0.6077],
        [ 0.0017,  0.5220],
        [-0.0675,  0.8001],
        [-0.0060,  0.7063],
        [-0.0817,  0.5675],
        [ 0.0647,  0.7090],
        [ 0.1245,  0.6121],
        [-0.0211,  0.7914],
        [ 0.0588,  0.6941],
        [ 0.1397,  0.5572],
        [ 0.0013,  0.6917],
        [ 0.0799,  0.5033],
        [ 0.0682,  0.5205],
        [-0.0531,  0.5926],
        [ 0.0517,  0.6686],
        [ 0.0510,  0.4480],
        [ 0.0737,  0.4716],
        [ 0.0544,  0.6922],
        [ 0.0240,  0.6625],
        [ 0.0252,  0.6192],
        [ 0.1094,  0.3869],
        [ 0.0724,  0.6521],
        [ 0.1440,  0.5652],
        [ 0.1439,  0.3667],
        [ 0.0897,  0.4054],
        [ 0.1266,  0.5984],
        [-0.0628,  0.7270],
        [-0.0122,  0.6748],
        [ 0.0522,  0.5962],
        [-0.1596,  0.7330],
      

## Model Training

In [16]:
def train(epoch=1, log_step=100):
    
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            ##batch = {k: v.to(device) for k, v in batch.items()} ## 为什么不直接batch.to(device)
            batch.to(device)
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f'epoch: {ep}, global_step: {global_step}, loss: {output.loss.item()}')
            global_step += 1
        acc = evaluate()
        print(f'epoch: {ep}, acc: {acc}')
            
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            ##batch = {k: v.to(device) for k, v in batch.items()}
            batch.to(device)
            output = model(**batch)
            pred = output.logits.argmax(dim=1)
            acc_num += (pred == batch['labels']).sum().item()
    return acc_num / len(validset)

In [17]:
%%time
train()

epoch: 0, global_step: 0, loss: 0.869054913520813
epoch: 0, global_step: 100, loss: 0.3016502857208252
epoch: 0, acc: 0.8839732888146912
CPU times: user 39.8 s, sys: 1.79 s, total: 41.6 s
Wall time: 39 s


In [18]:
sen = '家人们快来吃！'
id2_lable = {0: 'negative', 1: 'positive'}
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt')
    inputs.to(device)
    output = model(**inputs)
    pred = output.logits.argmax(dim=1)
    print(f'input: {sen}, prediction: {id2_lable.get(pred.item())}')

input: 家人们快来吃！, prediction: positive


In [22]:
from transformers import pipeline

model.config.id2label = id2_lable
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

In [23]:
pipe(sen)

[{'label': 'positive', 'score': 0.82942795753479}]