# 文本分类实例

In [33]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# 加载数据

In [3]:
import pandas as pd
# Load the raw hotel-review CSV for a quick visual check. This frame is only
# used for inspection; ClassificationDataset below reads the same file again.
data = pd.read_csv('../ChnSentiCorp_htl_all.csv')
# Preview the first rows.
data.head()

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [None]:
# dropna() returns a new DataFrame; the result is only displayed here and
# `data` itself is left unchanged because nothing is assigned.
data.dropna() # drop rows that contain missing values

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"
...,...,...
7761,0,尼斯酒店的几大特点：噪音大、环境差、配置低、服务效率低。如：1、隔壁歌厅的声音闹至午夜3点许...
7762,0,盐城来了很多次，第一次住盐阜宾馆，我的确很失望整个墙壁黑咕隆咚的，好像被烟熏过一样家具非常的...
7763,0,看照片觉得还挺不错的，又是4星级的，但入住以后除了后悔没有别的，房间挺大但空空的，早餐是有但...
7764,0,我们去盐城的时候那里的最低气温只有4度，晚上冷得要死，居然还不开空调，投诉到酒店客房部，得到...


# 创建 Dataset

In [6]:
from torch.utils.data import Dataset

class ClassificationDataset(Dataset):
    """Map-style dataset of ``(review, label)`` pairs backed by a CSV file.

    The CSV must provide ``review`` and ``label`` columns. Rows containing
    any missing value are discarded when the file is loaded.
    """

    def __init__(self, data_path):
        """Read ``data_path`` with pandas and drop incomplete rows."""
        super().__init__()
        self.data = pd.read_csv(data_path).dropna()

    def __len__(self):
        """Return the number of rows left after dropping incomplete ones."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(review, label)`` for the row at positional ``index``."""
        # Positional lookup: the frame's index labels have gaps after dropna().
        row = self.data.iloc[index]
        return row['review'], row['label']

In [7]:
# Location of the review CSV, relative to the notebook's working directory.
data_path = '../ChnSentiCorp_htl_all.csv'
# Build the dataset; the constructor reads the CSV and drops rows with missing values.
dataset = ClassificationDataset(data_path)

In [8]:
# Sanity check: print the first five (review, label) tuples from the dataset.
for i in range(5):
    print(dataset[i])

('距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.', np.int64(1))
('商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!', np.int64(1))
('早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。', np.int64(1))
('宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的自助游朋友~比较划算，附近特色小吃很多~', np.int64(1))
('CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风', np.int64(1))


# 划分数据集

In [9]:
from torch.utils.data import random_split

# Hold out 20% of the samples for validation. A seeded generator makes the
# split identical on every run, so validation accuracy stays comparable
# across kernel restarts instead of being measured on a different subset
# each time.
train_dataset, val_dataset = random_split(
    dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(42)
)
# Show the subset sizes next to the full dataset size.
print(len(train_dataset), len(val_dataset), len(dataset))

6212 1553 7765


# 创建 DataLoader

In [32]:
# Tokenizer for the "hfl/rbt3" checkpoint (the same checkpoint the model is
# loaded from further down); collate_fn uses it to encode each batch.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

In [34]:
def collate_fn(batch):
    """Turn a list of ``(review, label)`` samples into model-ready tensors.

    The reviews are encoded with the notebook-level ``tokenizer``, padded or
    truncated to exactly 128 tokens, and the labels are attached under the
    ``labels`` key of the returned encoding.
    """
    reviews, targets = zip(*batch)
    encoded = tokenizer(
        reviews,
        padding='max_length',
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )
    encoded['labels'] = torch.tensor(targets)
    return encoded

In [35]:
from torch.utils.data import DataLoader
# Training batches (32 samples) are reshuffled every epoch.
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True,collate_fn=collate_fn)
# Validation batches (64 samples) keep a fixed order.
validloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [36]:
# Peek at the first (index, batch) pair to inspect the collated tensors.
next(enumerate(trainloader))

(0,
 {'input_ids': tensor([[ 101, 6821, 3221,  ..., 3121,  857,  102],
         [ 101, 6858, 6814,  ..., 2769, 2697,  102],
         [ 101, 3680, 3613,  ...,    0,    0,    0],
         ...,
         [ 101, 1378, 7305,  ..., 1779, 1259,  102],
         [ 101, 6983, 2421,  ...,    0,    0,    0],
         [ 101, 3302, 1218,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
         1, 1, 1, 1, 0, 1, 1, 1])})

# 创建模型和优化器

In [None]:
from torch.optim import AdamW
# Load the pretrained "hfl/rbt3" weights into a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
# Use the GPU when one is present; otherwise stay on the CPU instead of
# raising, so this cell also runs on machines without CUDA.
if torch.cuda.is_available():
    model = model.cuda()
# Created after the device move so the optimizer references the parameters
# on their final device.
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 训练与验证

In [40]:
def evalaluate():
    """Compute the classification accuracy of ``model`` on the validation set.

    Reads the notebook globals ``model``, ``validloader`` and ``val_dataset``.
    Each batch is moved to the device the model's parameters live on, so the
    function works for CPU and GPU models alike.

    Returns:
        The number of correct predictions divided by ``len(val_dataset)``.
    """
    model.eval()
    # Follow the model's device rather than assuming CUDA is available.
    device = next(model.parameters()).device
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            # Count the matches in this batch as a float tensor.
            acc_num += (pred == batch["labels"]).float().sum()
    return acc_num / len(val_dataset)

def train(epochs=5, log_step=10):
    """Fine-tune ``model`` on ``trainloader`` and validate after every epoch.

    Reads the notebook globals ``model``, ``optimizer`` and ``trainloader``
    and calls ``evalaluate()`` at the end of each epoch. Batches are moved to
    the device the model's parameters live on.

    Args:
        epochs: Number of full passes over ``trainloader``.
        log_step: The current batch loss is printed every ``log_step``
            optimizer steps; ``0`` or ``None`` disables loss logging.
    """
    # Follow the model's device rather than assuming CUDA is available.
    device = next(model.parameters()).device
    global_step = 0
    for epoch in range(epochs):
        # evalaluate() switches the model to eval mode, so training mode has
        # to be re-enabled at the start of every epoch.
        model.train()
        for batch in trainloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            global_step += 1
            # The truthiness guard avoids a modulo-by-zero when logging is off.
            if log_step and global_step % log_step == 0:
                print(f"epoch: {epoch}, step: {global_step}, loss: {outputs.loss.item()}")
        acc = evalaluate()
        print(f"epoch: {epoch}, acc: {acc}")


# 模型训练

In [41]:
# Run fine-tuning with the defaults: 5 epochs, loss printed every 10 steps.
train()

epoch: 0, step: 10, loss: 0.63782799243927
epoch: 0, step: 20, loss: 0.6574102640151978
epoch: 0, step: 30, loss: 0.5512548089027405
epoch: 0, step: 40, loss: 0.35392865538597107
epoch: 0, step: 50, loss: 0.26514166593551636
epoch: 0, step: 60, loss: 0.558419406414032
epoch: 0, step: 70, loss: 0.544950008392334
epoch: 0, step: 80, loss: 0.2635766267776489
epoch: 0, step: 90, loss: 0.4154847264289856
epoch: 0, step: 100, loss: 0.26677951216697693
epoch: 0, step: 110, loss: 0.15649542212486267
epoch: 0, step: 120, loss: 0.3686162829399109
epoch: 0, step: 130, loss: 0.2954069674015045
epoch: 0, step: 140, loss: 0.27229002118110657
epoch: 0, step: 150, loss: 0.23543420433998108
epoch: 0, step: 160, loss: 0.22024647891521454
epoch: 0, step: 170, loss: 0.2670241594314575
epoch: 0, step: 180, loss: 0.36892426013946533
epoch: 0, step: 190, loss: 0.2929568588733673
epoch: 0, acc: 0.8718609809875488
epoch: 1, step: 200, loss: 0.2824801206588745
epoch: 1, step: 210, loss: 0.3723704218864441
epoch

# 模型预测

In [44]:
# Classify a single hand-written review with the fine-tuned model.
seq = '我觉得这家酒店不错'
model.eval()
# Send the inputs to whichever device the model lives on instead of assuming
# CUDA, so this cell also works when the model is on the CPU.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(seq, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    # Index of the highest-scoring class for each input sequence.
    prdict = torch.argmax(logits, dim=-1)
    print(f'输入：{seq} \n 模型预测结果：{prdict}')

输入：我觉得这家酒店不错 
 模型预测结果：tensor([1], device='cuda:0')


In [46]:
from transformers import  pipeline
# Human-readable class names so the pipeline reports 'negative'/'positive'.
model.config.id2label = {0: 'negative', 1: 'positive'}
# Keep the reverse mapping consistent with id2label.
model.config.label2id = {'negative': 0, 'positive': 1}
# Run the pipeline on the device the model already lives on rather than
# hard-coding GPU 0, which fails on machines without CUDA.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=next(model.parameters()).device,
)
pipe(seq)

Device set to use cuda:0


[{'label': 'positive', 'score': 0.998591959476471}]