# Chinese Sentiment Classification

In [1]:
# Fetch the tutorial repository; later cells read requirements.txt, utils and
# data/ by relative path, presumably from this checkout (confirm).
!git clone https://github.com/GitYCC/bert-minimal-tutorial.git

Cloning into 'bert-minimal-tutorial'...
remote: Enumerating objects: 124, done.[K
remote: Counting objects: 100% (124/124), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 124 (delta 65), reused 62 (delta 19), pack-reused 0[K
Receiving objects: 100% (124/124), 38.88 MiB | 14.73 MiB/s, done.
Resolving deltas: 100% (65/65), done.


In [2]:
# Make the cloned repository the working directory so the relative paths used
# in later cells resolve inside it.
%cd bert-minimal-tutorial

/content/bert-minimal-tutorial


In [3]:
!pip install -q -r requirements.txt

[?25l[K     |█▍                              | 10kB 30.5MB/s eta 0:00:01[K     |██▉                             | 20kB 33.7MB/s eta 0:00:01[K     |████▎                           | 30kB 38.2MB/s eta 0:00:01[K     |█████▊                          | 40kB 26.8MB/s eta 0:00:01[K     |███████▏                        | 51kB 15.4MB/s eta 0:00:01[K     |████████▋                       | 61kB 14.7MB/s eta 0:00:01[K     |██████████                      | 71kB 6.2MB/s eta 0:00:01[K     |███████████▍                    | 81kB 6.7MB/s eta 0:00:01[K     |████████████▉                   | 92kB 7.0MB/s eta 0:00:01[K     |██████████████▎                 | 102kB 7.5MB/s eta 0:00:01[K     |███████████████▊                | 112kB 7.5MB/s eta 0:00:01[K     |█████████████████▏              | 122kB 7.5MB/s eta 0:00:01[K     |██████████████████▋             | 133kB 7.5MB/s eta 0:00:01[K     |████████████████████            | 143kB 7.5MB/s eta 0:00:01[K     |█████████████████████

In [5]:
import os

import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm.notebook import tqdm

from utils import RunningAverage

# Identifier of the pretrained checkpoint; passed to from_pretrained() for
# both the tokenizer and the classification model further down.
MODEL_NAME = 'bert-base-chinese'
# Seed handed to PyTorch's random number generators below.
SEED = 1234

# Seed PyTorch's default generator and the generators of all CUDA devices.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

## Dataloader

In [6]:
# Load the labelled reviews and shuffle the rows.
# random_state=SEED makes the shuffle repeatable: torch.manual_seed() does not
# seed pandas, so without it every run would produce a different row order and
# therefore a different train/validation split.
df = pd.read_csv('data/chinese_sentiment_classification.csv')
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)  # shuffle

In [7]:
# Display the shuffled DataFrame as the cell output.
df

Unnamed: 0,label,text
0,1,回頭率很高，全景天窗很舒服，空間很大。
1,0,油耗高！小毛病多！比如左側玻璃升降問題。最近聽說北汽紳寶把x65給威旺了？我很是接受不了。早...
2,1,最滿意的就是外觀，完美。買車就是衝著它的外觀來的。
3,1,性價比很高，助力轉向很實用。
4,0,前輪剎車粉掉的很厲害，噪音大點
...,...,...
69995,1,外形穩重不失霸氣，中控集成大屏操作，設計簡約而不簡單，小方向盤操控精准，全景天幕好迷人。。。...
69996,1,車子外觀過得去
69997,0,主駕儲物空間少少少少少！油門有遲鈍
69998,1,空間大，能裝人，能載貨，超控好，過彎有信心，動力足，提速很快，超級運動模式，四驅性能好，水平...


In [8]:
class MultiClassDataset(Dataset):
    """Text-classification dataset that tokenizes its samples on access.

    Each item is a tuple ``(input_ids, token_type_ids, attention_mask)`` of
    1-D tensors of equal length, followed by a scalar ``label`` tensor when
    ``for_train`` is True.

    Args:
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (e.g. a ``BertTokenizer``).
        df: DataFrame with a ``text`` column and, when ``for_train`` is True,
            a ``label`` column.
        max_len: maximum sequence length, counting the added ``[CLS]`` and
            ``[SEP]`` tokens.
        for_train: whether labels are read from ``df`` and returned with
            every item / batch.
    """

    def __init__(self, tokenizer, df, max_len=512, for_train=True):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.for_train = for_train

        # Read whole columns at once. Iterating with DataFrame.iterrows()
        # builds a Series object per row and is needlessly slow on a frame
        # with tens of thousands of rows.
        self.texts = df['text'].tolist()
        self.labels = df['label'].tolist() if for_train else []

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors (plus label if training)."""
        text = self.texts[idx]
        tokens = self.tokenizer.tokenize(text)
        # Leave room for the two special tokens added below.
        tokens = tokens[:self.max_len - 2]
        processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
        seq_len = len(processed_tokens)

        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(processed_tokens))
        # Single-sentence input: every token belongs to segment 0.
        token_type_ids = torch.zeros(seq_len, dtype=torch.long)
        # Every real token is attended to; padding is added later per batch.
        attention_mask = torch.ones(seq_len, dtype=torch.long)

        outputs = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            label = torch.tensor(self.labels[idx])
            outputs += (label, )

        return outputs

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def create_mini_batch(self, samples):
        """Collate a list of items into batch tensors (usable as ``collate_fn``).

        Args:
            samples: list of tuples as returned by ``__getitem__``.

        Returns:
            ``(input_ids, token_type_ids, attention_mask)`` each of shape
            ``(batch, longest_sequence_in_batch)``, followed by a ``labels``
            tensor of shape ``(batch,)`` when ``for_train`` is True.
        """
        outputs = list(zip(*samples))

        # Zero-pad every sequence to the length of the longest one in the
        # batch. Padded positions get attention_mask == 0, and id 0 is assumed
        # to be the tokenizer's padding id.
        input_ids = pad_sequence(outputs[0], batch_first=True)
        token_type_ids = pad_sequence(outputs[1], batch_first=True)
        attention_mask = pad_sequence(outputs[2], batch_first=True)

        batch_output = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            labels = torch.stack(outputs[3])
            batch_output += (labels, )

        return batch_output

In [9]:
# Tokenizer belonging to the pretrained checkpoint named by MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# for_train is left at its default (True), so every item carries its label.
dataset = MultiClassDataset(tokenizer, df)

# Fraction of the samples used for training; the remainder is the validation set.
CUT_RATIO = 0.8
train_size = int(CUT_RATIO * len(dataset))
# Derived by subtraction so the two sizes always add up to len(dataset).
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [10]:
batch_size = 32

# Both loaders collate with dataset.create_mini_batch, which zero-pads each
# batch to its longest sequence and stacks the labels.
# Only the training loader shuffles its samples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    collate_fn=dataset.create_mini_batch,
    shuffle=True
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    collate_fn=dataset.create_mini_batch,
)

## Model

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

# Pretrained BERT with a two-class sequence-classification head.
# return_dict=True: later cells read the forward result through its
# `.loss` and `.logits` attributes.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels = 2,
    return_dict=True
)
# Last expression of the cell, so the notebook also displays the module repr.
model.to(device)

device: cuda


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Train

In [12]:
def train_batch(model, data, optimizer, device):
    """Perform one optimisation step on a single mini-batch.

    Args:
        model: module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels`` keyword arguments; its result
            must expose a ``loss`` attribute.
        data: iterable of four tensors in the order
            ``(input_ids, token_type_ids, attention_mask, labels)``.
        optimizer: optimizer that updates the model's parameters.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        The batch loss as a Python float.
    """
    model.train()

    # Move the whole batch onto the target device.
    batch = tuple(tensor.to(device) for tensor in data)
    input_ids, token_type_ids, attention_mask, labels = batch

    # Drop gradients left over from the previous step.
    optimizer.zero_grad()

    batch_loss = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        labels=labels,
    ).loss
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

def evaluate(model, valid_loader, device):
    """Score the model on every batch of ``valid_loader`` without gradients.

    Args:
        model: module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its result must expose
            ``loss`` and ``logits``.
        valid_loader: iterable yielding batches of four tensors in the order
            ``(input_ids, token_type_ids, attention_mask, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(loss, accuracy)`` as reported by the two ``RunningAverage``
        accumulators. The loss accumulator receives one value per batch and
        the accuracy accumulator one boolean per sample.
    """
    model.eval()

    loss_meter = RunningAverage()
    accuracy_meter = RunningAverage()

    with torch.no_grad():
        for batch in tqdm(valid_loader, desc='evaluate'):
            input_ids, token_type_ids, attention_mask, labels = (
                tensor.to(device) for tensor in batch
            )

            result = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            loss_meter.add(result.loss.item())

            # A sample counts as correct when its highest-scoring class
            # equals the label.
            predictions = result.logits.argmax(dim=-1)
            hits = (predictions == labels).cpu().tolist()
            accuracy_meter.add_all(hits)

    return loss_meter.get(), accuracy_meter.get()

In [13]:
# Training hyper-parameters and checkpoint settings.
lr = 0.00001
max_iter = 200          # total number of optimisation steps
show_per_iter = 10      # print the averaged training loss every N steps
valid_per_iter = 50     # run validation every N steps
save_per_iter = 100     # save a checkpoint every N steps
save_checkpoint_dir = 'models/'
model_prefix = 'cn_sentiment_class_'

# Checkpoint directories are named after the latest validation loss, so every
# save step must also be a validation step.
assert save_per_iter % valid_per_iter == 0

optimizer = optim.Adam(model.parameters(), lr=lr)

i = 1
is_running = True
train_loss_averager = RunningAverage()
model_paths = []
# The outer loop restarts the loader until max_iter steps have been taken.
while is_running:
    for train_data in train_loader:
        # Training and validation losses get separate names so that the
        # checkpoint name below can only ever refer to the validation loss.
        train_loss = train_batch(model, train_data, optimizer, device)
        train_loss_averager.add(train_loss)

        if i % show_per_iter == 0:
            print('train [{}]: loss={}'.format(i, train_loss_averager.get()))
            train_loss_averager.flush()

        if i % valid_per_iter == 0:
            valid_loss, valid_acc = evaluate(model, valid_loader, device)
            print(f'valid: loss={valid_loss}, acc={valid_acc}')

        if i % save_per_iter == 0:
            path = os.path.join(save_checkpoint_dir, model_prefix + f'loss{valid_loss:.5}/')
            print(f'save model at {path}')
            model.save_pretrained(path)
            model_paths.append(path)

        if i == max_iter:
            is_running = False
            break

        i += 1

train [10]: loss=0.5893517285585403
train [20]: loss=0.3839615434408188
train [30]: loss=0.24068916216492653
train [40]: loss=0.20345594882965087
train [50]: loss=0.14611995443701745


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=438.0, style=ProgressStyle(description_wid…


valid: loss=0.15193769778018673, acc=0.9498571428571428
train [60]: loss=0.1359931267797947
train [70]: loss=0.17350330725312232
train [80]: loss=0.12257440350949764
train [90]: loss=0.13735962174832822
train [100]: loss=0.11625733524560929


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=438.0, style=ProgressStyle(description_wid…


valid: loss=0.13351879925700968, acc=0.9510714285714286
save model at models/cn_sentiment_class_loss0.13352/
train [110]: loss=0.11457035839557647
train [120]: loss=0.15438417233526708
train [130]: loss=0.09068668149411678
train [140]: loss=0.10498005840927363
train [150]: loss=0.12269100025296212


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=438.0, style=ProgressStyle(description_wid…


valid: loss=0.09959469168004687, acc=0.9643571428571428
train [160]: loss=0.10473273396492004
train [170]: loss=0.11633298993110656
train [180]: loss=0.11224766038358211
train [190]: loss=0.10662103854119778
train [200]: loss=0.06498785391449928


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=438.0, style=ProgressStyle(description_wid…


valid: loss=0.09139078304588319, acc=0.9680714285714286
save model at models/cn_sentiment_class_loss0.091391/


## Predict

In [14]:
# Reload the most recently saved checkpoint.
reload_checkpoint = model_paths[-1]

examples = [
    '板金的部分我覺得很脆弱',
    '整體外殼造型我是喜歡的'
]
examples_df = pd.DataFrame({'text': examples})

# for_train=False: items and batches carry no labels.
pred_dataset = MultiClassDataset(tokenizer, examples_df, for_train=False)
pred_loader = DataLoader(
    pred_dataset,
    batch_size=batch_size,
    collate_fn=pred_dataset.create_mini_batch,
)

model = BertForSequenceClassification.from_pretrained(reload_checkpoint)
model.to(device)

pred_labels = []
with torch.no_grad():
    for batch in tqdm(pred_loader, desc='predict'):
        input_ids, token_type_ids, attention_mask = (
            tensor.to(device) for tensor in batch
        )

        logits = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        ).logits

        # The predicted class is the index of the largest logit.
        pred_labels.extend(logits.argmax(dim=-1).cpu().tolist())

print('predict result: ', list(zip(examples, pred_labels)))

HBox(children=(FloatProgress(value=0.0, description='predict', max=1.0, style=ProgressStyle(description_width=…


predict result:  [('板金的部分我覺得很脆弱', 0), ('整體外殼造型我是喜歡的', 1)]
