# Chinese Sentiment Classification

In [None]:
# Fetch the tutorial repository; later cells presumably take requirements.txt, utils.py and data/ from it.
!git clone https://github.com/GitYCC/bert-minimal-tutorial.git

Cloning into 'bert-minimal-tutorial'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 26 (delta 8), reused 18 (delta 3), pack-reused 0[K
Unpacking objects: 100% (26/26), done.


In [None]:
# Work from inside the cloned repository so later relative paths (requirements.txt, data/, models/) resolve against it.
%cd bert-minimal-tutorial

/content/bert-minimal-tutorial


In [None]:
# Install the packages listed in requirements.txt; -q suppresses most of pip's output.
!pip install -q -r requirements.txt

[K     |████████████████████████████████| 829kB 12.3MB/s 
[K     |████████████████████████████████| 1.3MB 53.6MB/s 
[K     |████████████████████████████████| 512kB 50.3MB/s 
[K     |████████████████████████████████| 727kB 45.0MB/s 
[K     |████████████████████████████████| 71kB 11.1MB/s 
[K     |████████████████████████████████| 890kB 28.9MB/s 
[K     |████████████████████████████████| 1.1MB 45.1MB/s 
[K     |████████████████████████████████| 2.9MB 56.4MB/s 
[K     |████████████████████████████████| 1.3MB 54.1MB/s 
[K     |████████████████████████████████| 133kB 57.0MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: google-colab 1.0.0 has requirement requests~=2.23.0, but you'll have requests 2.24.0 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m


In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm.notebook import tqdm

from utils import RunningAverage

# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'bert-base-chinese'
# Single seed shared by every random number generator seeded below.
SEED = 1234

# Seed each library's global generator so a Restart & Run All is repeatable.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Requires `import numpy as np` in the import block of this cell.
np.random.seed(SEED)

## Dataloader

In [None]:
# Load the labelled reviews (the code below reads the 'text' and 'label' columns).
df = pd.read_csv('data/chinese_sentiment_classification.csv')
# Shuffle all rows with an explicit seed so that re-running this cell always
# yields the same order instead of depending on the global NumPy RNG state.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)  # shuffle

In [None]:
# Preview the shuffled frame; the bare expression is rendered as the cell output.
df

Unnamed: 0,label,text
0,1,"性價比高，外觀不錯,空間很不錯夠寬敞。在當時這款車兩側還有防撞梁挺不錯的。"
1,0,有好多小毛病，都是些無關痛癢的
2,1,最滿意致悅的車身尺寸，這個兩廂車2708mm的軸距，185cm的車寬，不會顯的小氣，非常適合...
3,0,動力是短版，不過畢竟是1.5自然吸氣的發動機，夠用
4,1,小三外觀真的沒話說，挺漂亮的，尤其前臉，那個大嘴很好看，開起來還是很順暢的，尤其在高速上，在...
...,...,...
69995,1,氙氣大燈給力！車外形沒得說！後備箱大大大
69996,0,暫時沒有什麼不滿意的，自己挑選的車
69997,0,作為一款中型suv後備箱空間不足，三個小行李箱幾乎塞滿。吝嗇的鹵素大燈，蠟燭光亮名符其實。
69998,0,門側有異響，是橡膠條的，希望時間長一點就能解決，4s的哥們是這樣說的，新車緣故


In [None]:
class MultiClassDataset(Dataset):
    """Dataset that turns rows of text (and optional labels) into BERT-style tensors.

    Each item is a tuple ``(input_ids, token_type_ids, attention_mask)`` and,
    when ``for_train`` is true, a trailing ``label`` tensor.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        df: Frame with a ``'text'`` column and, when ``for_train`` is true,
            a ``'label'`` column.
        max_len: Maximum sequence length, counting the ``[CLS]`` and
            ``[SEP]`` markers.
        for_train: Whether labels are read from ``df`` and returned with
            every item / batch.
    """

    def __init__(self, tokenizer, df, max_len=512, for_train=True):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.for_train = for_train

        # Pull whole columns at once rather than iterating row by row.
        self.texts = df['text'].tolist()
        self.labels = df['label'].tolist() if for_train else []

    def __getitem__(self, idx):
        """Return the tensors (and label, if training) for sample ``idx``."""
        tokens = self.tokenizer.tokenize(self.texts[idx])
        # Reserve two positions for the [CLS] and [SEP] markers.
        tokens = tokens[:self.max_len - 2]
        processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
        seq_len = len(processed_tokens)

        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(processed_tokens))
        # Single-segment input: every position belongs to segment 0.
        token_type_ids = torch.zeros(seq_len, dtype=torch.long)
        # Every real (non-padding) position is attended to.
        attention_mask = torch.ones(seq_len, dtype=torch.long)

        outputs = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            outputs += (torch.tensor(self.labels[idx]), )

        return outputs

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def create_mini_batch(self, samples):
        """Collate a list of items into batch tensors.

        Args:
            samples: List of tuples as returned by ``__getitem__``.

        Returns:
            ``(input_ids, token_type_ids, attention_mask)`` padded to a common
            length, plus a stacked ``labels`` tensor when ``for_train`` is true.
        """
        columns = list(zip(*samples))

        # Zero-pad every sequence to the longest length in the batch; padded
        # positions therefore get attention_mask == 0.
        input_ids = pad_sequence(columns[0], batch_first=True)
        token_type_ids = pad_sequence(columns[1], batch_first=True)
        attention_mask = pad_sequence(columns[2], batch_first=True)

        batch_output = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            labels = torch.stack(columns[3])
            batch_output += (labels, )

        return batch_output

In [None]:
# Tokenizer matching the pretrained checkpoint named by MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Wrap the whole frame once; it is divided into train/validation subsets below.
dataset = MultiClassDataset(tokenizer, df)

# Fraction of the samples assigned to the training subset.
CUT_RATIO = 0.8
n_samples = len(dataset)
train_size = int(CUT_RATIO * n_samples)
valid_size = n_samples - train_size  # the remainder goes to validation
split_sizes = [train_size, valid_size]
train_dataset, valid_dataset = random_split(dataset, split_sizes)

In [None]:
batch_size = 32

# Settings shared by both loaders: same batch size and the dataset's own
# collate function, which pads each batch to a common length.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=dataset.create_mini_batch,
)

# Only the training loader reshuffles its samples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(dataset=valid_dataset, **loader_kwargs)

## Model

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f'device: {device}')

# Pretrained encoder with a 2-way classification head; return_dict=True makes
# the forward pass return an object with named fields (.loss / .logits are
# read by the training and evaluation code).
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    return_dict=True,
)
# Kept as the last expression so the notebook displays the module structure.
model.to(device)

device: cuda


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Train

In [None]:
def train_batch(model, data, optimizer, device):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels`` keyword arguments; its output
            must expose a ``loss`` attribute.
        data: Iterable of four tensors ordered as
            ``(input_ids, token_type_ids, attention_mask, labels)``.
        optimizer: Optimizer whose ``zero_grad`` and ``step`` are invoked.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    input_ids, token_type_ids, attention_mask, labels = (
        tensor.to(device) for tensor in data
    )

    batch_loss = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        labels=labels,
    ).loss

    # Clear stale gradients, back-propagate, then update the parameters.
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

def evaluate(model, valid_loader, device):
    """Score ``model`` on every batch of ``valid_loader`` without gradients.

    The loss is recorded once per batch, while correctness is recorded once
    per sample. Both are accumulated in ``RunningAverage`` objects (imported
    from ``utils``; presumably ``get()`` returns the mean of what was added —
    confirm against that module).

    Args:
        model: Module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose
            ``loss`` and ``logits``.
        valid_loader: Iterable of 4-tensor batches ordered as
            ``(input_ids, token_type_ids, attention_mask, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(loss, accuracy)`` taken from the two accumulators.
    """
    model.eval()

    loss_meter = RunningAverage()
    acc_meter = RunningAverage()

    with torch.no_grad():
        for batch in tqdm(valid_loader, desc='evaluate'):
            input_ids, token_type_ids, attention_mask, labels = (
                tensor.to(device) for tensor in batch
            )

            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss_meter.add(outputs.loss.item())

            # Predicted class = index of the largest logit along the last axis.
            predictions = outputs.logits.argmax(dim=-1)
            hits = (predictions == labels).cpu().tolist()
            acc_meter.add_all(hits)

    return loss_meter.get(), acc_meter.get()

In [None]:
# --- Training hyper-parameters and checkpoint settings ---
lr = 0.00001
max_iter = 200          # total number of training batches to run
show_per_iter = 10      # print the running training loss every N batches
valid_per_iter = 50     # run a full validation pass every N batches
save_per_iter = 100     # write a checkpoint every N batches
save_checkpoint_dir = 'models/'
model_prefix = 'cn_sentiment_class_'

# The checkpoint directory name embeds the latest validation loss, so every
# save step has to coincide with a validation step.
assert save_per_iter % valid_per_iter == 0

optimizer = optim.Adam(model.parameters(), lr=lr)

i = 1
is_running = True
train_loss = RunningAverage()
model_paths = []
while is_running:
    # Each pass over train_loader is one epoch; the outer loop keeps starting
    # new epochs until max_iter batches have been processed.
    for train_data in train_loader:
        batch_loss = train_batch(model, train_data, optimizer, device)
        train_loss.add(batch_loss)

        due_to_show = i % show_per_iter == 0
        due_to_validate = i % valid_per_iter == 0
        due_to_save = i % save_per_iter == 0

        if due_to_show:
            print('train [{}]: loss={}'.format(i, train_loss.get()))
            # Start a fresh accumulation window for the next report.
            train_loss.flush()

        if due_to_validate:
            loss, acc = evaluate(model, valid_loader, device)
            print(f'valid: loss={loss}, acc={acc}')

        if due_to_save:
            # `loss` is the validation loss computed just above in this step.
            path = os.path.join(save_checkpoint_dir, model_prefix + f'loss{loss:.5}/')
            print(f'save model at {path}')
            model.save_pretrained(path)
            model_paths.append(path)

        if i == max_iter:
            is_running = False
            break

        i += 1

train [10]: loss=0.7181851685047149
train [20]: loss=0.5864066839218139
train [30]: loss=0.42693847715854644
train [40]: loss=0.24774658530950547


evaluate:   0%|          | 1/438 [00:00<01:12,  6.00it/s]

train [50]: loss=0.16719756051898002


evaluate: 100%|██████████| 438/438 [01:12<00:00,  6.02it/s]


valid: loss=0.15826657107397574, acc=0.9472142857142857
train [60]: loss=0.20298943296074867
train [70]: loss=0.1366733867675066
train [80]: loss=0.13427007235586644
train [90]: loss=0.1230307761579752


evaluate:   0%|          | 1/438 [00:00<01:23,  5.23it/s]

train [100]: loss=0.11760147921741008


evaluate: 100%|██████████| 438/438 [01:15<00:00,  5.78it/s]


valid: loss=0.1247269513200305, acc=0.9582857142857143
save model at models/cn_sentiment_class_loss0.12473/
train [110]: loss=0.1479931315407157
train [120]: loss=0.16510998792946338
train [130]: loss=0.17600416839122773
train [140]: loss=0.13213363960385321


evaluate:   0%|          | 1/438 [00:00<01:18,  5.54it/s]

train [150]: loss=0.09483904615044594


evaluate: 100%|██████████| 438/438 [01:17<00:00,  5.66it/s]


valid: loss=0.10906191855949694, acc=0.9624285714285714
train [160]: loss=0.11372225657105446
train [170]: loss=0.08925026133656502
train [180]: loss=0.11515358416363597
train [190]: loss=0.11444938695058227


evaluate:   0%|          | 1/438 [00:00<01:20,  5.40it/s]

train [200]: loss=0.13236291687935592


evaluate: 100%|██████████| 438/438 [01:16<00:00,  5.70it/s]


valid: loss=0.0945605286737249, acc=0.9663571428571428
save model at models/cn_sentiment_class_loss0.094561/


## Predict

In [None]:
# Reload the most recently saved checkpoint from the training cell.
reload_checkpoint = model_paths[-1]

examples = [
    '板金的部分我覺得很脆弱',
    '整體外殼造型我是喜歡的'
]
examples_df = pd.DataFrame({'text': examples})

# for_train=False: items and batches carry no label tensor.
pred_dataset = MultiClassDataset(tokenizer, examples_df, for_train=False)
pred_loader = DataLoader(
    pred_dataset,
    batch_size=batch_size,
    collate_fn=pred_dataset.create_mini_batch,
)

model = BertForSequenceClassification.from_pretrained(reload_checkpoint)
model.to(device)

pred_labels = []
with torch.no_grad():
    for data in tqdm(pred_loader, desc='predict'):
        input_ids, token_type_ids, attention_mask = (d.to(device) for d in data)

        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )

        # Predicted class = index of the largest logit for each example.
        batch_preds = outputs.logits.argmax(dim=-1).cpu().tolist()
        pred_labels.extend(batch_preds)

print('predict result: ', list(zip(examples, pred_labels)))

predict: 100%|██████████| 1/1 [00:00<00:00, 59.56it/s]

predict result:  [('板金的部分我覺得很脆弱', 0), ('整體外殼造型我是喜歡的', 1)]



