## 🌈 Category（品類） Test Runner

 - Fintech Project

 - b06102020 楊晴雯 

 - 2022.05.27

### 1. BERT prediction

In [1]:
import joblib, sys
import numpy as np
import pandas as pd
import random
import torch 
import os
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, load_metric
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, classification_report

# Test-set CSV that this notebook scores.
testpath = './data/品類資料集/cat_test_v2(question).csv'
# Serialized model restored with torch.load further down.
checkpoint_path = './checkpoints/0510-0939/model.ckpt'
# Pickled category -> class-index mapping.
cat2idx_path = './category/cat2idx.pkl'
# Pickled category -> (keyword, score) pairs used by the string-matching step.
cat2kw_path = './category/cat2kw.pkl'

  from .autonotebook import tqdm as notebook_tqdm


In [49]:
# Reload edited local modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Category name -> class index, plus the reverse lookup used to decode predictions.
cat2idx = joblib.load(cat2idx_path)
idx2cat = dict((index, category) for category, index in cat2idx.items())

#### 1-1. Testing dataset 

In [75]:
from utilities import preproc, collate_fn, compute_accuracy

In [2]:
# Read the test CSV and keep each row's index as an explicit `id` column.
df = pd.read_csv(testpath)
df["id"] = df.index

In [42]:
# Wrap the DataFrame in a datasets.Dataset so it can be mapped and batched.
testset = Dataset.from_pandas(df)

In [53]:
# Apply the callable returned by utilities.preproc() to the dataset in batches.
testset  = testset.map(preproc(), batched = True)

In [9]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a GPU machine still loads on a CPU-only one.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model = torch.load(checkpoint_path, map_location = device)
model.to(device)
# Trailing print() keeps the cell from echoing the full model repr.
print()




In [43]:
# NOTE(review): MODELNAME is not referenced in any visible cell — confirm it is still needed.
MODELNAME = 'bert-base-chinese'
batch_size = 20
# shuffle stays off so prediction rows follow the dataset order;
# the collate function comes from the utilities.collate_fn() factory.
test_loader = DataLoader(testset, 
                        batch_size = batch_size, 
                        collate_fn=collate_fn(), shuffle = False)

#### 1-2. Inferencing test dataset

In [54]:
model.eval()
def inference(test_loader, threshold = 0.5, return_probs = False):
    """Run the global `model` over `test_loader` and return dataset-level predictions.

    Args:
        test_loader: iterable of batches; every batch must provide 'input_ids'
            and 'attention_mask' tensors and may provide 'label'.
        threshold: probabilities strictly above this value become 1. The default
            of 0.5 is the cut-off this function has always used.
        return_probs: when True, return the softmax probabilities instead of the
            thresholded 0/1 matrix.

    Returns:
        np.ndarray with one row per sample: 0/1 decisions by default, or the
        probabilities when `return_probs` is True.

    Raises:
        ValueError: if `test_loader` yields no batches.
    """
    # Re-assert eval mode in case the model was switched back to training
    # between cell executions.
    model.eval()

    prob_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['input_ids'].to(device)
            attnmasks = batch['attention_mask'].to(device)

            logits = model(inputs, attnmasks).logits
            probs = F.softmax(logits, dim = 1)
            prob_chunks.append(probs.detach().cpu().numpy())

            if 'label' in batch:
                # Labels are only needed on the host, so skip the device round trip.
                label_chunks.append(batch['label'].detach().cpu().numpy())

    if not prob_chunks:
        raise ValueError("test_loader yielded no batches; nothing to infer.")

    # Note that we need to compute by dataset instead of by batch
    test_preds = np.vstack(prob_chunks)

    # Metrics are only meaningful when every batch carried labels.
    has_labels = len(label_chunks) == len(prob_chunks)
    if has_labels:
        test_labels = np.vstack(label_chunks)
        test_acc = compute_accuracy(labels = test_labels,
                                    preds = test_preds)

    print("🍎 Finished inferencing testloader.")
    if has_labels:
        # The 18-space gap reproduces the original log line exactly.
        print(f"[info]|VAL F1 macro: {test_acc['macro f1']:.3f}, "
              + ' ' * 18
              + f"weighted: {test_acc['weighted f1']:.3f}")
    else:
        print("[info] No labels available for this dataset.")

    if return_probs:
        return test_preds
    return np.where(test_preds <= threshold, 0, 1)

In [55]:
# Score the whole test loader; the result is the thresholded 0/1 matrix returned by inference().
predictions = inference(test_loader)

🍎 Finished inferencing testloader.
[info] No labels available for this dataset.


In [56]:
# One row per test sample, one column per class index.
predictions.shape

(108851, 217)

In [63]:
# saving 
from datetime import datetime
# Month/day-hour/minute stamp used to tag the saved prediction file.
timenow = datetime.now().strftime('%m%d-%H%M')

In [71]:
testset_path = './dataset/encoded_testset_50.0'
testpred_path = f'./predictions/{timenow}_test_predictions.pkl'
# joblib.dump does not create missing parent folders, so make sure the target exists.
os.makedirs(os.path.dirname(testpred_path), exist_ok = True)
# Persist both the encoded dataset and the predictions so later sections can
# restart from disk without re-running inference.
testset.save_to_disk(testset_path)
joblib.dump(predictions, testpred_path)

['./predictions/0527-0801_test_predictions.pkl']

### 2. Additional: String-matching

Load the poorly performing classes identified on the validation set (F1 below 0.1).

Load the mapping from each class to its fastText keywords.

In [12]:
# Reload the artefacts written by the saving cell so this section can start from disk.
timeload = '0527-0801'
# Only the path is assigned here; `testset` itself is bound once the dataset is loaded.
testset_path = './dataset/encoded_testset_50.0'
testpred_path = f'./predictions/{timeload}_test_predictions.pkl'
import datasets
# predictions
predictions  = joblib.load(testpred_path)
# test dataset
testset = datasets.load_from_disk(testset_path)

In [4]:
# Category -> list of (keyword, score) pairs consumed by string_match below.
cat2kw = joblib.load(cat2kw_path)

In [13]:
# Pickle holding the classes the model handled badly on validation.
pc_path = f'./category/poor_classf1.pkl'
# poor classes in validation set: those classes that have under 0.1 f1 score in validation set 
poorclasses = joblib.load(pc_path)

In [6]:
# Peek at one entry to show the tuple layout.
poorclasses[0] 
# classname, val_f1, val_support 

('巧拼地墊', 0.0, 4)

In [7]:
# Keep only the class name (first field) of every poor-class record.
poorc = [record[0] for record in poorclasses]

In [14]:
# Product names of the test set, in dataset order.
test_names = testset['name']

In [15]:
# Restrict the keyword table to the classes flagged as poor on validation.
poor_cat2kw = dict(
    (category, pairs) for category, pairs in cat2kw.items() if category in poorc
)

In [19]:
import re
THR = 0.99
# use valprobs
def string_match(names, predictions, verbose = 1, keyword_map = None, cat_index = None):
    """Override the scores of the poor classes with keyword-match scores.

    For every name and every category in `keyword_map`, the category's column is
    replaced by the mean score of the keywords found in the name, or by 0 when
    no keyword is found. Columns of all other categories are left untouched.

    Args:
        names: sequence of product names, aligned with the rows of `predictions`.
        predictions: 2-D array-like of per-class scores or 0/1 decisions.
        verbose: when truthy, print the original top class and every match.
        keyword_map: category -> sequence of (keyword, score) pairs. Defaults to
            the notebook-level `poor_cat2kw`.
        cat_index: category -> column index. Defaults to the notebook-level
            `cat2idx`.

    Returns:
        A new float ndarray with the revised scores; `predictions` itself is
        not modified.
    """
    keyword_map = poor_cat2kw if keyword_map is None else keyword_map
    cat_index = cat2idx if cat_index is None else cat_index

    # Work on a float copy: writing e.g. 0.995 into an integer 0/1 matrix would
    # silently truncate it to 0, and the caller's array must stay intact.
    revised = np.array(predictions, dtype = float)

    # The keyword pattern of a category does not depend on the name, so build
    # each one once instead of once per (name, category) pair.
    matchers = []
    for cat, pairs in keyword_map.items():
        kwdict = {pair[0]: pair[1] for pair in pairs}
        # Empty keywords would match at every position, so leave them out.
        kws = [pair[0] for pair in pairs if pair[0]]
        if kws:
            # use regex to find if the keyword appears in the data's `name`
            pat = re.compile(r'(?:{})'.format('|'.join(map(re.escape, kws))))
        else:
            pat = None
        # find the class' corresponding index
        matchers.append((cat, cat_index[cat], pat, kwdict))

    # Only needed for the verbose report.
    index_to_cat = {v: k for k, v in cat_index.items()} if verbose else None

    for i, name in enumerate(names):
        # pred vector is the score vector of the i-th sample (a view into `revised`)
        pred_vector = revised[i, :]
        if verbose:
            print(f'{i+1}. {name}')
            print('\t orig:', index_to_cat[np.argmax(pred_vector)], np.max(pred_vector))

        # A missing name (None/NaN) cannot contain any keyword.
        text = name if isinstance(name, str) else ''

        for cat, catid, pat, kwdict in matchers:
            # `hits` holds every non-overlapping keyword occurrence found in the name
            hits = pat.findall(text) if pat is not None else []
            # Average the scores of the matched keywords; 0 when nothing matched.
            cat_hit_probs = sum(kwdict[hit] for hit in hits)/len(hits) if hits else 0
            if verbose and cat_hit_probs > 0:
                print('\t matched:', cat, cat_hit_probs)

            # revise the prediction vector by plugging in `cat_hit_probs` for this class
            pred_vector[catid] = cat_hit_probs

    return revised

In [20]:
# Apply the keyword override to every test name; verbose = 0 silences the per-row report.
revised_probs = string_match(test_names, 
                                   predictions, 
                                   verbose = 0)

### 3. generate final predictions 

In [21]:
# Cut-off applied to the revised scores in the next cell.
best_THR = 0.990

In [22]:
# Scores strictly above best_THR become 1, everything else 0.
revised_preds = np.where(revised_probs <= best_THR, 0, 1)

In [40]:
import time 
def generate_mapping(setnames, set_revised_preds, finals, tid, index_to_cat = None):
    """Decode one chunk of 0/1 predictions into ';'-joined category names.

    Args:
        setnames: sequence of product names for this chunk.
        set_revised_preds: 2-D 0/1 array with one row per name.
        finals: shared list; the result is stored in `finals[tid]`.
        tid: slot index of this chunk inside `finals`.
        index_to_cat: class index -> category name. Defaults to the
            notebook-level `idx2cat`.

    Returns:
        None. `finals[tid]` receives a dict with 'preds' (one string per row)
        and 'multilabels' (a list of (row, name, class indices) for rows with
        more than one predicted class).
    """
    index_to_cat = idx2cat if index_to_cat is None else index_to_cat
    multilabels = []
    preds = []
    n = len(setnames)
    for i in range(n):
        name = setnames[i]
        pred = set_revised_preds[i, :]
        predclasses = np.where(pred == 1)[0]
        # multi-label rows are recorded for later inspection
        if len(predclasses) > 1:
            multilabels.append((i, name, predclasses))
        # the threshold was too strict and no class fired: guess at least one class
        if len(predclasses) < 1:
            predclasses = [np.argmax(pred)]

        product = ';'.join([index_to_cat[pclass] for pclass in predclasses])
        preds.append(product)
    # Every input row must have produced exactly one output string.
    assert len(preds) == n, f'expected {n} predictions, got {len(preds)}'
    finals[tid] = {'preds': preds, 'multilabels': multilabels}

In [132]:
# NOTE(review): testsize is assigned in the threading cell further down, so this
# scratch cell fails on a fresh top-to-bottom run.
testsize//10 # 10885 # len(multilabels) 6

10885

Generating the mapping in a single loop takes too long, so the work is split across threads.

In [62]:
# https://stackoverflow.com/questions/6893968/how-to-get-the-return-value-from-a-thread-in-python
import threading                                          
from threading import Thread
from math import ceil
testsize = revised_preds.shape[0]

split = 30
# One result slot and one thread handle per chunk; workers write into finals[tid].
finals = [None]*split
threads = [None]*split
one_portion = ceil(testsize/split)
# Read the name column once instead of once per chunk.
all_names = testset['name']

time_st = time.time()
for tid in range(split):
    start = tid*one_portion 
    end = min(start + one_portion, testsize)
    print(f'thread {tid} works on:', start, end) # starting and ending index of inference 
    names_th = all_names[start:end]
    preds_th = revised_preds[start:end]
    worker = Thread(target = generate_mapping, 
                    args = (names_th, preds_th, finals, tid))
    try:
        worker.start()
    except RuntimeError:
        # start() raises RuntimeError when the thread cannot be launched; leave the
        # slot empty so it is neither joined nor silently counted as done.
        print(f'\tThread {tid} has error occurred.')
        continue
    threads[tid] = worker

# Only join threads that actually started.
for worker in threads:
    if worker is not None:
        worker.join()
time_end = time.time()
print(f'Time spent on inferencing {testsize} samples with {split} threads: \
      \n\t{(time_end - time_st):.4f} seconds' )

# An exception inside a worker never reaches this cell; an empty slot is the only
# trace, so fail loudly here instead of later when the results are merged.
failed_chunks = [tid for tid, result in enumerate(finals) if result is None]
if failed_chunks:
    raise RuntimeError(f'No result produced for chunk(s): {failed_chunks}')

thread 0 works on: 0 3629
thread 1 works on: 3629 7258
thread 2 works on: 7258 10887
thread 3 works on: 10887 14516
thread 4 works on: 14516 18145
thread 5 works on: 18145 21774
thread 6 works on: 21774 25403
thread 7 works on: 25403 29032
thread 8 works on: 29032 32661
thread 9 works on: 32661 36290
thread 10 works on: 36290 39919
thread 11 works on: 39919 43548
thread 12 works on: 43548 47177
thread 13 works on: 47177 50806
thread 14 works on: 50806 54435
thread 15 works on: 54435 58064
thread 16 works on: 58064 61693
thread 17 works on: 61693 65322
thread 18 works on: 65322 68951
thread 19 works on: 68951 72580
thread 20 works on: 72580 76209
thread 21 works on: 76209 79838
thread 22 works on: 79838 83467
thread 23 works on: 83467 87096
thread 24 works on: 87096 90725
thread 25 works on: 90725 94354
thread 26 works on: 94354 97983
thread 27 works on: 97983 101612
thread 28 works on: 101612 105241
thread 29 works on: 105241 108851
Time spent on inferencing 108851 samples with 30 thre

In [63]:
# Stitch the per-chunk results back together in chunk order.
final_predictions = [product for chunk in finals for product in chunk['preds']]
# Sanity check: one prediction per test row.
len(final_predictions) == testsize

True

In [64]:
# Number of decoded predictions.
len(final_predictions)

108851

In [57]:
# Number of rows in revised_preds.
testsize

108851

In [5]:
from datetime import datetime
# Fresh stamp so the CSV name records when the final file was written.
timenow = datetime.now().strftime('%m%d-%H%M')
final_path = f'./predictions/{timenow}_test_pred.csv'
# to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(os.path.dirname(final_path), exist_ok = True)
# Attach the decoded categories to the test frame and export without the index.
df['product'] = final_predictions
df.to_csv(final_path, index = False)

In [72]:
#!/usr/bin/env python
# Quick resource check of the machine running the notebook; only the last
# expression is displayed by the cell.
import psutil
# gives a single float value
psutil.cpu_percent()
# gives an object with many fields
psutil.virtual_memory()
# you can convert that object to a dictionary 
dict(psutil.virtual_memory()._asdict())

{'total': 33615777792,
 'available': 30548242432,
 'percent': 9.1,
 'used': 2578120704,
 'free': 7729889280,
 'active': 10354069504,
 'inactive': 13469753344,
 'buffers': 252899328,
 'cached': 23054868480,
 'shared': 15908864,
 'slab': 1507663872}