## 40.0 Post-hoc

    Author: Nana
    Date: 2022.5.15
1. Run inference on valid set, save the softmaxed probability vectors.
2. Identify the categories with low f1 score.
3. Run fasttext keywords string matching; add up the probabilities of the matched keywords
and re-select the classes based on new probability vector.
    

In [1]:
import joblib, sys
import numpy as np
import pandas as pd
import random
import torch 
import os
import torch.nn as nn
import torch.nn.functional as F
# Model checkpoint file; passed to torch.load in a later cell.
checkpoint_path = './checkpoints/0510-0939/model.ckpt'
# Dataset directory; passed to datasets.load_from_disk in a later cell.
dataset_path = './dataset/encoded_dataset_30.0'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import datasets
from datasets import Dataset, load_metric

In [4]:
# Load the saved dataset from disk; its 'test' split is used as the validation set below.
dataset = datasets.load_from_disk(dataset_path)
valset = dataset['test']

In [5]:
# load the fine-tuned model for inference
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [123]:
# Load the pickled model object and place it on the selected device.
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU can still be loaded when the notebook falls back to the CPU.
# NOTE: torch.load unpickles arbitrary Python objects -- only load trusted checkpoints.
model = torch.load(checkpoint_path, map_location=device)
model.to(device)
print()  # keeps the cell from echoing the full model repr returned by model.to




In [4]:
# Shell escape: print the GPU status table (driver / CUDA version, memory usage).
!nvidia-smi

Sat May 14 13:17:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 495.46       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:3E:00.0  On |                  Off |
| 30%   33C    P8    17W / 230W |     43MiB / 24248MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [56]:
# Name of the pretrained tokenizer to load; collate_fn uses this tokenizer to pad each batch.
MODELNAME = 'bert-base-chinese'
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(MODELNAME)
def collate_fn(batch):
    """Turn a list of dataset rows into one padded batch for the model.

    According to the original author's note, each row carries the keys
    'id', 'name', 'label', 'seg_name', 'label_name', 'input_ids',
    'token_type_ids', 'attention_mask' and 'labels'. Only the three encoder
    fields are padded (to a fixed length of 100) and converted to tensors;
    'label' and 'id' are attached afterwards as float32 / int64 tensors.

    Args:
        batch: list of row dicts as delivered by the DataLoader.

    Returns:
        The padded batch returned by ``tokenizer.pad`` with 'label' and 'id' added.
    """
    encoder_fields = ['input_ids', 'token_type_ids', 'attention_mask']

    # Gather the encoder fields column-wise, following the key order of the first row.
    features = {}
    for field in batch[0].keys():
        if field in encoder_fields:
            features[field] = [row[field] for row in batch]

    padded = tokenizer.pad(
        features,
        padding="max_length",
        max_length=100,
        return_tensors="pt",
    )

    row_labels = [row['label'] for row in batch]
    row_ids = [row['id'] for row in batch]
    padded['label'] = torch.tensor(row_labels, dtype=torch.float32)
    padded["id"] = torch.tensor(row_ids, dtype=torch.int64)
    return padded

In [41]:
from torch.utils.data import DataLoader
# shuffle=False keeps the batches in dataset order, so stacked predictions
# line up row-for-row with valset (later cells index both by the same position).
batch_size = 20
val_loader = DataLoader(valset, 
                        batch_size = batch_size, 
                        collate_fn=collate_fn, 
                        shuffle = False)

In [113]:
from sklearn.metrics import f1_score, classification_report
def compute_accuracy(labels, preds, threshold = 0.5, zd = 1):
    """Binarize scores at `threshold` and report macro and weighted F1.

    Args:
        labels: ground-truth indicator array.
        preds: score array; entries <= threshold become 0, all others 1.
        threshold: decision cut-off applied to `preds`.
        zd: value passed to f1_score as `zero_division`.

    Returns:
        dict with the keys 'macro f1' and 'weighted f1'.
    """
    binary_preds = np.where(preds <= threshold, 0, 1)
    scores = {}
    for key, averaging in (('macro f1', 'macro'), ('weighted f1', 'weighted')):
        scores[key] = f1_score(labels, binary_preds, average = averaging, zero_division = zd)
    return scores

In [57]:
model.to(device)
model.eval()  # inference mode: disables dropout and other training-only behaviour

# Per-batch outputs; stacked into dataset-level arrays after the loop.
val_pairs = {'preds': [],
             'labels': []}
with torch.no_grad():  # gradients are not needed for inference
    # Iterate over the batches directly: the batch index was never used, and its
    # former name (`id`) shadowed the Python builtin for the rest of the notebook.
    for batch in val_loader:
        inputs = batch['input_ids'].to(device)
        attnmasks = batch['attention_mask'].to(device)
        logits = model(inputs, attnmasks).logits
        # NOTE(review): softmax normalises the scores across classes (each row sums
        # to 1) while later cells threshold every class independently; if the model
        # was trained as a multi-label classifier, sigmoid may be intended -- confirm
        # against the training code.
        preds = F.softmax(logits, dim = 1)

        val_pairs['preds'].append(preds.detach().cpu().numpy())
        # Labels are only collected, never fed to the model, so they are not moved
        # to the device first.
        val_pairs['labels'].append(batch['label'].cpu().numpy())

# Note that we need to compute by dataset instead of by batch
val_preds = np.vstack(val_pairs['preds'])
val_labels = np.vstack(val_pairs['labels'])
val_acc = compute_accuracy(labels = val_labels,
                           preds = val_preds)
raw_val_preds = val_preds  # un-thresholded probabilities
validation_predictions = np.where(val_preds <= 0.5, 0, 1)  # 0/1 predictions at threshold 0.5

print("🍎 Finished inferencing.")
print(f"[info]|VAL F1 macro: {val_acc['macro f1']:.3f}, weighted: {val_acc['weighted f1']:.3f}")
    

🍎 Finished inferencing.
[info]|VAL F1 macro: 0.629, weighted: 0.856


In [31]:
# Sanity check: there should be exactly one prediction row per validation example.
len(validation_predictions)==len(valset)

True

In [59]:
# Locations of the cached inference results (0/1 predictions, labels, raw probabilities).
predpath = './predictions/0.63_val_predictions.pkl'
labelspath = './predictions/encoded_30.0_labels.pkl'
probspath = './predictions//0.63_val_predictions_probs.pkl'
# The dumps below are commented out, so this cell only reads existing files.
# Uncomment them after running the inference cell to (re)create the cache.
# joblib.dump(validation_predictions, predpath)
# joblib.dump(val_labels, labelspath)
# joblib.dump(raw_val_preds, probspath)
valpreds = joblib.load(predpath)
vallabels = joblib.load(labelspath)


### Run afterwards

In [16]:
# hyperparams
N = 217 # number of categories

In [12]:
## f1_score(y_true, y_pred, average=None)
from sklearn.metrics import precision_recall_fscore_support as score
# average=None returns one value per class; only the F1 and support arrays are kept.
_, _,f1byclass, support = score(y_true = vallabels, y_pred = valpreds, zero_division = 1,  # classes with no test samples get F1 = 1
                     average = None)

In [13]:
# !conda install scikit-learn --yes

In [14]:
# Category-name -> index mapping loaded from disk, plus its inverse (index -> category name).
cat2idx = joblib.load(f'./category/cat2idx.pkl')
idx2cat = {v:k for k, v in cat2idx.items()}

In [17]:
# Poorly predicted categories: (category name, per-class F1, support) for every
# class index below N whose F1 is under 0.1.
poors = [
    (idx2cat[class_id], f1byclass[class_id], support[class_id])
    for class_id in range(N)
    if f1byclass[class_id] < 0.1
]

In [18]:
poors # 71 classes; each entry is (category, category f1, support)
# support: number of this category in validation set 

[('巧拼地墊', 0.0, 4),
 ('瓦斯爐(廚房用)', 0.0, 14),
 ('甲油膠', 0.0, 3),
 ('兒童牙刷', 0.0, 86),
 ('成長幼兒奶粉', 0.0, 21),
 ('媽媽奶粉', 0.0, 9),
 ('羊乳', 0.0, 6),
 ('西式香腸', 0.0, 23),
 ('即飲無糖茶', 0.0, 81),
 ('快煮麵/乾拌麵', 0.0, 109),
 ('洗衣粉', 0.0, 73),
 ('洗髮精(嬰幼童/孕婦)', 0.0, 65),
 ('豆奶/豆乳', 0.0, 41),
 ('乳酸菌', 0.0, 8),
 ('兒童牙膏', 0.0, 69),
 ('卸甲液', 0.0, 1),
 ('卸妝乳', 0.0, 50),
 ('卸妝油', 0.0, 47),
 ('卸妝濕巾', 0.0, 5),
 ('卸妝霜', 0.0, 10),
 ('卸妝露', 0.0, 2),
 ('保溫杯', 0.0, 1),
 ('前導精華', 0.0, 3),
 ('眼部精華', 0.0, 15),
 ('按摩椅', 0.0, 2),
 ('洗衣球', 0.0, 97),
 ('洗碗機', 0.0, 4),
 ('眉膠筆', 0.0, 17),
 ('香氛機', 0.0, 5),
 ('BB霜', 0.0, 24),
 ('CC霜', 0.0, 14),
 ('原味牛乳', 0.0, 2),
 ('唇彩盤', 0.0, 4),
 ('唇膏', 0.0, 242),
 ('唇蜜', 0.0, 34),
 ('唇線筆', 0.0, 5),
 ('桌上電腦', 0.0, 11),
 ('氣炸鍋', 0.0, 21),
 ('消毒鍋', 0.0, 7),
 ('烘衣機/乾衣機', 0.0, 12),
 ('烤肉架', 0.0, 5),
 ('健腹器', 0.0, 8),
 ('熱水器', 0.0, 14),
 ('精華油', 0.0, 18),
 ('眼唇卸妝', 0.0, 34),
 ('眼影打底', 0.0, 5),
 ('眼影筆', 0.0, 12),
 ('眼影蜜', 0.0, 6),
 ('眼影霜', 0.0, 3),
 ('眼線液筆', 0.0, 25),
 ('野餐墊', 0.0, 8),
 ('棉褲/安睡褲', 0.

### Add 23.0 Attempt 1: String Exact Match

In [19]:
## load fasttext model 
# The word vectors (fm.wv) are used below to look up keywords similar to each category name.
from gensim.models import FastText
fmodelpath = f'./fasttext/fasttext.model'
fm = FastText.load(fmodelpath) 

In [20]:
#!conda install gensim==3.8.3
import gensim
# downgrade the hecking gensim for model loading compatibility 

In [21]:
# Record the installed gensim version; the author notes it was downgraded so the saved model loads.
print(gensim.__version__)

3.8.3


In [34]:
## get category keywords dictionary 
# cat2keywords maps each category name to a list of (keyword, weight) pairs:
#   * the TOPN most similar fastText words, weighted by their similarity score;
#   * every sub-name of the category itself with weight 1, so an exact match of
#     the name counts fully.
TOPN = 10
from collections import defaultdict
cat2keywords= defaultdict(list)
for cat in cat2idx.keys():
    # Some category names bundle several names, e.g. "快煮麵/乾拌麵".
    # Empty pieces are dropped: an empty keyword would match everywhere.
    subcats = [subcat for subcat in cat.split('/') if subcat]
    if cat in fm.wv:
        cat2keywords[cat] = fm.wv.most_similar(cat, topn = TOPN)
    else:
        # Fallback when the full name has no vector: use the neighbours of each
        # sub-name instead. (Previously this branch iterated over the still-empty
        # keyword list and called .extend on the dict itself, so it never added
        # anything and would have raised AttributeError had it run.)
        for token in subcats:
            if token in fm.wv:
                cat2keywords[cat].extend(fm.wv.most_similar(token, topn = TOPN))
    for subcat in subcats:
        cat2keywords[cat].append((subcat, 1))

In [42]:
# for (poorclass, f1, sup) in poors:
#     print(poorclass)
#     print(cat2keywords[poorclass])

In [36]:
# Persist the poor-class list and the category -> keywords mapping for later reuse.
poorpath = './category/poor_classf1.pkl'
joblib.dump(poors, poorpath)
cat2keywordspath = './category/cat2kw.pkl'
joblib.dump(cat2keywords, cat2keywordspath)

['./category/cat2kw.pkl']

In [None]:
# NOTE(review): this cell has no execution count, so it appears never to have been run.
# `samples` and `cat2kw` are not defined in the cells shown before it, and `re` is only
# imported in a later cell -- confirm whether this draft is still needed.
# For each sample, build a length-N vector that holds, per category, the summed
# weight of that category's keywords found in the sample name.
for i, sample in enumerate(samples):
    hit_vector = np.zeros(N)
    name = sample['name']
    for cat, pair in cat2kw.items():
        # presumably pair = (list of phrases, mapping phrase -> weight) -- TODO confirm
        ph_list, ph_probs = pair
        pat = re.compile(r'(?:{})'.format('|'.join(map(re.escape, ph_list))))
        found = re.findall(pat, name)
        cat_hit_probs = sum(ph_probs[x] for x in found)
        catid = cat2idx[cat]
        hit_vector[catid] = cat_hit_probs
    samples[i]['hitsprobs'] = hit_vector 
    

In [49]:
poors_ = [entry[0] for entry in poors]  # category names of the poorly predicted classes

In [50]:
# Keep only the keyword lists that belong to poorly predicted categories.
poor_cat2kw = dict(item for item in cat2keywords.items() if item[0] in poors_)

In [97]:
def getclass(vec, THR):
    """Return the set of category names whose score in `vec` is at least `THR`.

    Args:
        vec: array of per-category scores; positions are looked up in `idx2cat`.
        THR: inclusive score cut-off.

    Returns:
        set of category names.
    """
    selected_ids = np.where(vec >= THR)[0].tolist()
    return {idx2cat[class_id] for class_id in selected_ids}

In [132]:
# Reload the cached probability matrix; the string-matching cell below edits it in place.
valprobs = joblib.load(probspath)

#### string matching...

In [133]:
import re
THR = 0.99
# use valprobs
# For every validation sample, the score of each poorly predicted category is
# replaced by the summed weight of that category's keywords found in the sample
# name. valprobs is edited IN PLACE (pred_vector is a view of its row); later
# cells score the revised valprobs. Reload valprobs before re-running this cell
# if `changed` should be counted against the original predictions again.
changed = 0
val_names = valset['name']

# Build each category's pattern and keyword->weight table once instead of once
# per sample. Categories without keywords are skipped: an empty alternation
# matches the empty string everywhere and has no weight to look up.
cat_matchers = []
for cat, pairs in poor_cat2kw.items():
    kwdict = {kw: kwprob for kw, kwprob in pairs if kw}
    if not kwdict:
        continue
    pat = re.compile(r'(?:{})'.format('|'.join(map(re.escape, kwdict))))
    cat_matchers.append((cat2idx[cat], pat, kwdict))

for i, name in enumerate(val_names):
    pred_vector = valprobs[i, :]
    # getclass requires the threshold explicitly; calling it without THR raised TypeError.
    beforeclass = getclass(pred_vector, THR)
    for catid, pat, kwdict in cat_matchers:
        hits = pat.findall(name)
        # every occurrence counts, so a keyword appearing twice is added twice
        pred_vector[catid] = sum(kwdict[hit] for hit in hits)
    # Computed once after all categories are updated (also defined when there are no matchers).
    afterclass = getclass(pred_vector, THR)
    if beforeclass != afterclass:
        changed += 1

In [None]:
idx2cat[np.argmax(valprobs[0])] # predicted category of the first validation sample

In [71]:
idx2cat[np.argmax(vallabels[0])] # ground-truth category of the first validation sample

'貓零食'

In [134]:
# number of predictions changed by the keyword string matching:
changed

5100

## 🏆 probability threshold = 0.5
當初訓練的門檻也是調0.5，macro f1可至0.71左右

In [108]:
### Score the revised probabilities with the np.where threshold set to 0.5:
compute_accuracy(vallabels, valprobs, threshold = 0.5)

{'macro f1': 0.7053355365535308, 'weighted f1': 0.8918203239915521}

比較：BERT訓練成果
🍎 VAL F1 macro: 0.629, weighted: 0.856

In [118]:
# Category names ordered by class index, for use as target_names in the report.
labels = sorted(cat2idx.items(), key = lambda item: item[1])
keys = [cat_name for cat_name, _cat_idx in labels]

In [120]:
# Per-class report of the revised probabilities binarized at 0.5.
revised_preds = np.where(valprobs <= 0.5, 0, 1)
print(classification_report(y_true=vallabels, 
                            y_pred = revised_preds, 
                            zero_division = 1, 
                            target_names=keys))

              precision    recall  f1-score   support

        人工淚液       0.95      0.95      0.95        21
        中式香腸       0.71      1.00      0.83        57
         化妝水       0.98      0.98      0.98       173
        成人牙膏       0.85      1.00      0.92       403
      水路/健行鞋       1.00      1.00      1.00         0
         火鍋料       0.98      1.00      0.99        57
          奶瓶       0.98      0.98      0.98        44
        巧拼地墊       0.02      0.75      0.04         4
        平板電腦       0.95      0.62      0.75        34
       筆記型電腦       0.83      0.92      0.87        48
       智慧型手機       0.85      0.93      0.89       148
    瓦斯爐(廚房用)       0.00      0.00      0.00        14
    瓦斯爐(攜帶式)       0.50      1.00      0.67        13
       甲片/甲貼       0.98      1.00      0.99        41
         甲油膠       0.75      1.00      0.86         3
          冰箱       1.00      1.00      1.00        38
        安全汽座       0.87      0.93      0.90        14
        成人牙刷       0.74    

## 🏆 probability threshold = 0.97
我有試過0.7, 0.8, 0.999, ...目前看起來最佳的threshold如上，macro f1可至0.74左右。

簡單來說這個演算法流程就是**訓練多標籤BERT（macro f1 = 0.629），以其預測結果輔以fasttext所訓練出的top 10關鍵字matching（macro f1 = 0.742）**。
都是以資料集本身餵入處理與切割train test，未經過class數量的篩選與人為操作，如test set分布和valid set相似，可預估最後表現應該也是落在**0.63 ~0.74**之間。

🍀 我這邊應該就做到這一步，接下來我會開始把這些步驟串在一起寫inference script。

In [142]:
# Re-score the revised probabilities with a stricter decision threshold.
THR = 0.97
compute_accuracy(vallabels, valprobs, threshold = THR)

{'macro f1': 0.7416363784578881, 'weighted f1': 0.8963302546593493}

In [143]:
# Per-class report of the revised probabilities binarized at THR.
revised_preds = np.where(valprobs <= THR, 0, 1)
print(classification_report(y_true=vallabels, 
                            y_pred = revised_preds, 
                            zero_division = 1, 
                            target_names=keys))

              precision    recall  f1-score   support

        人工淚液       0.95      0.95      0.95        21
        中式香腸       0.71      1.00      0.83        57
         化妝水       0.98      0.97      0.98       173
        成人牙膏       0.85      1.00      0.92       403
      水路/健行鞋       1.00      1.00      1.00         0
         火鍋料       1.00      1.00      1.00        57
          奶瓶       0.98      0.98      0.98        44
        巧拼地墊       0.06      0.75      0.11         4
        平板電腦       1.00      0.59      0.74        34
       筆記型電腦       0.85      0.92      0.88        48
       智慧型手機       0.85      0.93      0.89       148
    瓦斯爐(廚房用)       0.00      0.00      0.00        14
    瓦斯爐(攜帶式)       0.50      1.00      0.67        13
       甲片/甲貼       0.98      1.00      0.99        41
         甲油膠       1.00      1.00      1.00         3
          冰箱       1.00      0.95      0.97        38
        安全汽座       0.93      0.93      0.93        14
        成人牙刷       0.74    