# Fine-tuning BERT-2 [品類分類，多標籤]




Fintech Project

Author: 楊晴雯 

Date: 2022.5.10 



In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# maindir = '/content/drive/MyDrive/FinTech-final-project'
# datadir = f'{maindir}/data'
# spmdir = f'{maindir}/spm'
# modeldir = f'{maindir}/models'
# cat_df_path = f'{maindir}/東吳課程_發票資料集/品類資料集/cat_train_v2.csv'
datadir = './'

In [2]:
# !pip -q install datasets

In [3]:
import pickle
from dataclasses import dataclass
import torch
import random
import torch 
import os
import torch.nn as nn
import torch.nn.functional as F

In [4]:
from datasets import Dataset, load_metric
import datasets
import joblib, sys
import numpy as np
import pandas as pd

In [5]:
checkpoint_path = './checkpoints/0510-0939/model.ckpt'
datasets_path = './dataset/encoded_dataset_30.0'
encoded_dataset = datasets.load_from_disk(datasets_path)

## Reload BERT checkpoint 

In [6]:
cat2idx = joblib.load(f'{datadir}/category/cat2idx.pkl')
idx2cat = {v:k for k, v in cat2idx.items()}

In [7]:
MODELNAME = 'bert-base-chinese'
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(MODELNAME)
N = len(idx2cat)

In [8]:
model = torch.load(checkpoint_path)

In [9]:
model # A BertForSequenceClassification model based on bert-base-chinese

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
def collate_fn(batch):
    keys = batch[0].keys()
    # print(keys)
    # 'id', 'name', 'label', 'seg_name', 'label_name', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'
    to_tensor = ['input_ids', 'token_type_ids', 'attention_mask']
    new_batch = {k:[d[k] for d in batch] for k in keys if k in to_tensor}
    new_batch = tokenizer.pad(
        new_batch,
        padding= "max_length",
        max_length= 100,
        return_tensors="pt",
    )
    label = [d['label'] for d in batch]
    ids = [d['id'] for d in batch]
    new_batch['label'] = torch.tensor(label, dtype=torch.float32)
    new_batch["id"] = torch.tensor(ids, dtype=torch.int64)
    return new_batch

In [11]:
extra_epochs = 15
# the loaded model has been trained for 20 epochs already, and reaches: 
# VAL F1 macro: 0.529, weighted: 0.809
# now we want to train more and train with different accuracy computation method
# that is to set the zero_division value to 1 in f1_score() for those missed classes so that the result is more properly 
# evaluated 

batch_size = 200
lr = 1e-5
myseed = 1027 # 43 # 1123
gast = 1
wmst = 20
weight_decay = 0.05

In [12]:
def seeding(myseed):
    torch.manual_seed(myseed)
    torch.cuda.manual_seed(myseed)
    torch.cuda.manual_seed_all(myseed)
    np.random.seed(myseed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
seeding(myseed)

## Dataset to Dataloader
Note that the dataset is saved from 30.0 script, otherwise we'll have train test splits mixed up in further training.

In [13]:
from torch.utils.data import DataLoader
train_loader = DataLoader(encoded_dataset['train'], batch_size = batch_size, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(encoded_dataset['test'], batch_size = batch_size, collate_fn=collate_fn, shuffle = False)

## Preparation for training loop 

In [14]:
weight_decay = 0.05
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(),
                              weight_decay = weight_decay, 
                              lr = lr)

In [15]:
# !pip -q install wandb

In [16]:
import wandb
from datetime import datetime
import os
wandb.login()
now = datetime.now()
runname = now.strftime("%m%d-%H%M")

[34m[1mwandb[0m: Currently logged in as: [33mnana2929[0m (use `wandb login --relogin` to force relogin)


In [24]:
wandb.init(project = "Fintech Project",
           entity="nana2929", 
           name = checkpoint_path.split('/')[-2],  #'0510-0939'
           config = {
               "batch_size": batch_size,
               "dataset": "麻布數據dataset1.0",
               "learning_rate": lr,
               "epochs": extra_epochs, 
               "note": 'This is the further training on 0510-0939 checkpoint'
           })




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train loss,▁
train macro f1,▁
train weighted f1,▁
val loss,▁
val macro f1,▁
val weighted f1,▁

0,1
train loss,4.54479
train macro f1,0.5333
train weighted f1,0.81856
val loss,4.55112
val macro f1,0.54818
val weighted f1,0.80938


In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
!nvidia-smi

Tue May 10 11:26:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 495.46       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:3E:00.0  On |                  Off |
| 30%   40C    P2    62W / 230W |   1951MiB / 24248MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Complete train loop
- With logging, metrics, report to wandb
- colab上gpu記憶體不夠，得挪到server上訓練
- batch_size開1000會太大，可嘗試200~800的範圍

In [20]:
from sklearn.metrics import f1_score
def compute_accuracy(labels, preds, zd = 1):
    '''
    REVISION WARNING:
    Different from 30.0 where we set zero_division to 0 for those classes missed in validation set, 
    we now use 1 for computation of averaged f1. 
    
    F1-score should be computed by the complete validation set! 
    Not by batch, SO BE SURE TO PASS IN two (val_size, nclass) arrays for inputs
    '''
    
    preds = np.where(preds <= 0.5, 0, 1)
    return {'macro f1': f1_score(labels, preds, average = 'macro', zero_division = zd), 
            'weighted f1': f1_score(labels, preds, average = 'weighted', zero_division = zd)}

In [21]:
# y_ = np.array([0, 0.9, 2, 1, 0.1, 0.2]) <- simulate output predictions 
# y_ = np.where(y_ <= 0.5, 0, 1) 
# y = np.array([1, 1, 1, 1, 0, 0]) <- simulate real labels
# f1_score(y, y_, average= 'macro')

In [25]:
# %%wandb
from tqdm import tqdm

model.to(device)
max_val_acc = -1


# the already-trained epoch number
base = 20 

for epoch in range(extra_epochs):
    model.train()
    epoch_loss = {'train': 0.0, 
                  'val': 0.0}
    val_pairs = {'preds': [], 
                'labels': []}
    train_pairs = {'preds': [],
                  'labels': []}
    model.train()
    
    for id, batch in enumerate(tqdm(train_loader)):
        inputs = batch['input_ids'].to(device)
        attnmasks = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        logits = model(inputs, attnmasks).logits
        
        preds = F.softmax(logits, dim = 1)
        loss = criterion(preds, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # storing results 
        epoch_loss['train'] += loss.item()
        
        preds = preds.detach().cpu()
        labels = labels.detach().cpu()
        train_pairs['labels'].append(labels)
        train_pairs['preds'].append(preds)
        
    
    # Note that we need to compute by dataset instead of by batch
    train_preds = torch.vstack(train_pairs['preds'])
    train_labels = torch.vstack(train_pairs['labels'])
    
    train_acc = compute_accuracy(preds = train_preds, 
                                    labels = train_labels)
    
    model.eval()
    with torch.no_grad():
        for id, batch in enumerate(val_loader):
            inputs = batch['input_ids'].to(device)
            attnmasks = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits = model(inputs, attnmasks).logits
            # storing results 
            preds = F.softmax(logits, dim = 1)
            loss = criterion(preds, labels)
            
            preds = preds.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            val_pairs['labels'].append(labels)
            val_pairs['preds'].append(preds)
            epoch_loss['val'] += loss.item()
        
        # Note that we need to compute by dataset instead of by batch
        val_preds = np.vstack(val_pairs['preds'])
        val_labels = np.vstack(val_pairs['labels'])
        val_acc = compute_accuracy(labels = val_labels, 
                                       preds = val_preds)
    wandb.log({
        'train macro f1': train_acc['macro f1'],
        'train weighted f1': train_acc['weighted f1'],
        'train loss': epoch_loss['train']/len(train_loader),
        'val macro f1': val_acc['macro f1'],
        'val weighted f1':val_acc['weighted f1'],
        'val loss': epoch_loss['val']/len(val_loader)
        })
    print(f"[{epoch+1+base}/{extra_epochs+base}] |TRAIN loss: {epoch_loss['train']/len(train_loader):.3f} |VAL loss:{epoch_loss['val']/len(val_loader):.3f},\
          \n         VAL F1 macro: {val_acc['macro f1']:.3f}, weighted: {val_acc['weighted f1']:.3f}")
    if val_acc['macro f1'] > max_val_acc:
        torch.save(model, checkpoint_path)
        print('🏆 Saving model!')
        max_val_acc = val_acc['macro f1']
        best_val_pairs  = val_pairs
        print('===================================')

100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:02<00:00,  2.03it/s]


[21/35] |TRAIN loss: 4.541 |VAL loss:4.545,          
         VAL F1 macro: 0.553, weighted: 0.816
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:05<00:00,  2.00it/s]


[22/35] |TRAIN loss: 4.533 |VAL loss:4.537,          
         VAL F1 macro: 0.570, weighted: 0.826
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:04<00:00,  2.00it/s]


[23/35] |TRAIN loss: 4.532 |VAL loss:4.538,          
         VAL F1 macro: 0.571, weighted: 0.826
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:03<00:00,  2.01it/s]


[24/35] |TRAIN loss: 4.529 |VAL loss:4.533,          
         VAL F1 macro: 0.578, weighted: 0.833
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:04<00:00,  2.00it/s]


[25/35] |TRAIN loss: 4.525 |VAL loss:4.528,          
         VAL F1 macro: 0.588, weighted: 0.839
🏆 Saving model!


 89%|███████████████████████████████████████████████████████████████████████▊         | 328/370 [02:43<00:21,  1.98it/s]wandb: Network error (ReadTimeout), entering retry loop.
100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:04<00:00,  2.01it/s]


[26/35] |TRAIN loss: 4.523 |VAL loss:4.527,          
         VAL F1 macro: 0.600, weighted: 0.842
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:03<00:00,  2.01it/s]


[27/35] |TRAIN loss: 4.520 |VAL loss:4.523,          
         VAL F1 macro: 0.610, weighted: 0.847
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:04<00:00,  2.01it/s]


[28/35] |TRAIN loss: 4.516 |VAL loss:4.520,          
         VAL F1 macro: 0.617, weighted: 0.850
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:04<00:00,  2.01it/s]


[29/35] |TRAIN loss: 4.516 |VAL loss:4.519,          
         VAL F1 macro: 0.619, weighted: 0.852
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:03<00:00,  2.01it/s]


[30/35] |TRAIN loss: 4.515 |VAL loss:4.518,          
         VAL F1 macro: 0.620, weighted: 0.853
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:03<00:00,  2.01it/s]


[31/35] |TRAIN loss: 4.513 |VAL loss:4.518,          
         VAL F1 macro: 0.617, weighted: 0.853


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:04<00:00,  2.01it/s]


[32/35] |TRAIN loss: 4.515 |VAL loss:4.522,          
         VAL F1 macro: 0.615, weighted: 0.850


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:04<00:00,  2.01it/s]


[33/35] |TRAIN loss: 4.513 |VAL loss:4.518,          
         VAL F1 macro: 0.620, weighted: 0.854
🏆 Saving model!


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:03<00:00,  2.01it/s]


[34/35] |TRAIN loss: 4.512 |VAL loss:4.517,          
         VAL F1 macro: 0.620, weighted: 0.854


100%|█████████████████████████████████████████████████████████████████████████████████| 370/370 [03:03<00:00,  2.01it/s]


[35/35] |TRAIN loss: 4.512 |VAL loss:4.516,          
         VAL F1 macro: 0.629, weighted: 0.856
🏆 Saving model!


## Classification Report 

In [26]:
from sklearn.metrics import classification_report
cat2idx = joblib.load(f'{datadir}/category/cat2idx.pkl')
labels = sorted(cat2idx.items(), key = lambda x:x[1])
keys = [x[0] for x in labels]
preds = np.where(np.vstack(best_val_pairs['preds']) <= 0.5, 0, 1)
labels = np.vstack(best_val_pairs['labels'])
ZD = 1 
print(classification_report(labels, 
                            preds, target_names= keys, zero_division = ZD))

              precision    recall  f1-score   support

        人工淚液       0.95      0.95      0.95        21
        中式香腸       0.71      1.00      0.83        57
         化妝水       0.98      0.98      0.98       173
        成人牙膏       0.85      1.00      0.92       403
      水路/健行鞋       1.00      1.00      1.00         0
         火鍋料       0.98      1.00      0.99        57
          奶瓶       0.98      0.98      0.98        44
        巧拼地墊       1.00      0.00      0.00         4
        平板電腦       0.95      0.62      0.75        34
       筆記型電腦       0.83      0.92      0.87        48
       智慧型手機       0.85      0.93      0.89       148
    瓦斯爐(廚房用)       1.00      0.00      0.00        14
    瓦斯爐(攜帶式)       0.50      1.00      0.67        13
       甲片/甲貼       0.98      1.00      0.99        41
         甲油膠       1.00      0.00      0.00         3
          冰箱       1.00      1.00      1.00        38
        安全汽座       0.87      0.93      0.90        14
        成人牙刷       0.74    