In [2]:
import os
# Point both proxy environment variables at the same local endpoint
# (presumably so later downloads go through that proxy -- confirm it is running).
os.environ.update(
    {
        "http_proxy": "http://127.0.0.1:8889",
        "https_proxy": "http://127.0.0.1:8889",
    }
)

## Step-1 Import

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


## Step-2 Load data

In [4]:
import pandas as pd

# Read the review CSV into a DataFrame, then display its first five rows.
data = pd.read_csv(filepath_or_buffer="./datasets/ChnSentiCorp_htl_all.csv")
data.head(n=5)

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [5]:
# Report the row count, drop every row containing a missing value,
# then report the row count again.
print(data.shape[0])
data = data.dropna(axis=0, how="any")
print(data.shape[0])

7766
7765


## Step-3 Create dataset

In [6]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Map-style dataset of (review, label) pairs read from a CSV file.

    The CSV must provide a ``review`` and a ``label`` column. Rows containing
    any missing value are dropped when the dataset is constructed.

    Args:
        csv_path: Location of the CSV file. Defaults to the path this notebook
            has always used, so ``MyDataset()`` keeps working unchanged.
    """

    def __init__(self, csv_path: str = "./datasets/ChnSentiCorp_htl_all.csv") -> None:
        super().__init__()
        # self.data: DataFrame holding only the complete rows of the CSV.
        self.data = pd.read_csv(csv_path).dropna()

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair at positional ``index``."""
        # Fetch the row once instead of doing a separate lookup per column.
        row = self.data.iloc[index]
        return row["review"], row["label"]

    def __len__(self):
        """Return the number of rows remaining after dropping incomplete ones."""
        return len(self.data)



In [7]:
# Build the dataset from its default CSV and print the first five samples.
dataset = MyDataset()
for sample_idx in range(5):
    print(dataset[sample_idx])

('距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.', 1)
('商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!', 1)
('早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。', 1)
('宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的自助游朋友~比较划算，附近特色小吃很多~', 1)
('CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风', 1)


## Step-4 Split dataset

In [8]:
from torch.utils.data import random_split

import torch

# Seed the split so the same samples land in train/validation on every run;
# without it the validation accuracy is not comparable between runs.
split_seed = 42
train_set, valid_set = random_split(
    dataset,
    lengths=[0.9, 0.1],
    generator=torch.Generator().manual_seed(split_seed),
)
len(train_set), len(valid_set)

(6989, 776)

In [9]:
# Print the first five samples of the training subset.
for sample_idx in range(5):
    print(train_set[sample_idx])

('地理位置不错，就是临街有点吵，其他都不错', 1)
('还是房价贵了点，如果房价在200就可以了。', 0)
('房间4分设施齐备房间大而舒适某些破旧的地方没有修补；环境5分酒店离大马路有段距离晚上几乎听不到噪音地处沙面环境方面在广州能给5分了；服务3分服务印象中规中矩。评定4分另外：可以免费停车但是停车位置过于拥挤；标间3个月前296元现在31X元价钱还算在合理范围但是涨价是不是也需要理由呢', 1)
('总体来说很不错.就是地理位置并不是想象中的好.虽然也被列入陆家嘴附近.虽然服务人员说离正大广场很近.可是还是相差很大的.总体还是很不错的,相对于其他住酒店的经历.', 1)
('不能够提供按照预定的房间要求，明明是要求两间大床房，当天到了之后才告知没有，而且只给了一个标准间，并且最开始连冷气都是坏的，北京40度的高温，不知道这里到底是酒店还是桑拿房', 0)


## Step-5 Create DataLoader

In [14]:
import torch

# NOTE(review): the model further down is loaded from "./download_models/hfl/rbt3"
# while the tokenizer uses the id "hfl/rbt3" -- confirm both refer to the same checkpoint.
tokenizer_name = "hfl/rbt3"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def collect_func(batch):
    """Collate a list of samples into one tokenized batch.

    Args:
        batch: Sequence of samples; element 0 of each sample is the text and
            element 1 is its label.

    Returns:
        The tokenizer output (PyTorch tensors, every sequence padded or
        truncated to 128 tokens) with an extra ``"labels"`` tensor.
    """
    texts = [sample[0] for sample in batch]
    labels = [sample[1] for sample in batch]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded["labels"] = torch.tensor(labels)
    return encoded

In [15]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=collect_func,
)
valide_loader = DataLoader(
    dataset=valid_set,
    batch_size=64,
    shuffle=False,
    collate_fn=collect_func,
)

In [16]:
# Display the first batch produced by the training loader.
next(iter(train_loader))

{'input_ids': tensor([[ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101,  671,  702,  ...,    0,    0,    0],
        [ 101, 6983, 2421,  ...,    0,    0,    0],
        ...,
        [ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101, 2769,  812,  ..., 5500, 1456,  102],
        [ 101,  676,  782,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 1])}

## Step-6 Create Model and Optimizer

In [18]:
from torch.optim import AdamW 

# Load the sequence-classification model from the local checkpoint directory
# and place it on the GPU when one is available.
model = AutoModelForSequenceClassification.from_pretrained("./download_models/hfl/rbt3")
model = model.cuda() if torch.cuda.is_available() else model

# AdamW over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=2e-5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./download_models/hfl/rbt3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step-7 Train and Evaluate

In [19]:
def evaluate():
    """Compute classification accuracy over the validation loader.

    Puts the global ``model`` into eval mode, runs every batch of
    ``valide_loader`` under ``torch.inference_mode()`` and counts predictions
    (argmax of the logits) that equal the batch labels.

    Returns:
        Number of correct predictions divided by ``len(valid_set)``.
    """
    model.eval()
    use_gpu = torch.cuda.is_available()
    acc_num = 0
    with torch.inference_mode():
        for batch in valide_loader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=1)
            matches = predictions.long() == batch["labels"].long()
            acc_num += matches.float().sum()
    return acc_num / len(valid_set)
        
def train(epoch=5, log_step=100):
    """Fine-tune the global ``model`` and report validation accuracy per epoch.

    Args:
        epoch: Number of passes over ``train_loader``.
        log_step: The loss is printed whenever the running step count is a
            multiple of this value (the count is not reset between epochs).
    """
    use_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        # evaluate() switches to eval mode, so re-enable train mode each epoch.
        model.train()
        for batch in train_loader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            should_log = global_step % log_step == 0
            if should_log:
                print(f"epoch {ep} global step {global_step} loss {output.loss.item()}")
            global_step += 1

        acc = evaluate()
        print(f"epoch {ep} acc {acc}")

## Step-8 Train

In [20]:
# Run training with the default settings, spelled out for the reader.
train(epoch=5, log_step=100)

epoch 0 global step 0 loss 0.6037129163742065
epoch 0 global step 100 loss 0.250297486782074
epoch 0 global step 200 loss 0.28571635484695435
epoch 0 acc 0.8646907210350037
epoch 1 global step 300 loss 0.38009151816368103
epoch 1 global step 400 loss 0.09890727698802948
epoch 1 acc 0.8917525410652161
epoch 2 global step 500 loss 0.2233220487833023
epoch 2 global step 600 loss 0.20649364590644836
epoch 2 acc 0.907216489315033
epoch 3 global step 700 loss 0.0919189602136612
epoch 3 global step 800 loss 0.033617015928030014
epoch 3 acc 0.8981958627700806
epoch 4 global step 900 loss 0.1589212417602539
epoch 4 global step 1000 loss 0.08339783549308777
epoch 4 acc 0.894329845905304



## Step-9 Test

In [25]:
sen = "这家酒店的环境非常好，价格也便宜，值得推荐"

id2label = {0: "neg", 1: "pos"}

# Do not rely on an earlier cell having left the model in eval mode.
model.eval()
with torch.inference_mode():
    # Same length limit as the training collate function, so over-long input is truncated.
    inputs = tokenizer(sen, max_length=128, truncation=True, return_tensors="pt")
    # Follow the model's actual device: it may be on CPU even when CUDA is
    # available (e.g. after being reloaded from disk).
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    output = model(**inputs)
    pred = torch.argmax(output.logits, dim=-1)
    print(id2label[pred.item()])
    

pos


In [27]:
from transformers import pipeline

# Keep both label mappings in the config consistent, so the config written by
# save_pretrained() carries matching id2label / label2id entries.
model.config.id2label = id2label
model.config.label2id = {label: idx for idx, label in id2label.items()}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

pipe(sen)

[{'label': 'pos', 'score': 0.9995274543762207}]

## Step-10 Save and load

In [28]:
save_dir = "./save_models/rbt3"

# Save the tokenizer next to the weights so the directory can be loaded on its own.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

model = AutoModelForSequenceClassification.from_pretrained(save_dir)
# from_pretrained returns a CPU model; move it back to the GPU so the cells
# that send batches to CUDA keep working with the reloaded model.
if torch.cuda.is_available():
    model = model.cuda()