## Category: nn之前的做法

Author: Nana

Date: Created at 2022.4.16


In [1]:
# hyperparams
TOPN = 20 
N = 217 # category數量

In [2]:
from google.colab import drive
drive.mount('/content/drive')
maindir = '/content/drive/MyDrive/FinTech-final-project'
datadir = f'{maindir}/data'
spmdir = f'{maindir}/spm'
modeldir = f'{maindir}/models'
cat_df_path = f'{maindir}/東吳課程_發票資料集/品類資料集/cat_train_v2.csv'

Mounted at /content/drive


In [3]:
from collections import defaultdict
import numpy as np
import pandas as pd
import joblib
import random

## Fixing some bugs in pickled files

In [4]:
catdf = pd.read_csv(cat_df_path)
catD = defaultdict(int)
for k in catdf['product']:
  for x in k.split(';'):
    catD[x] += 1 

N = len(catD)
cat2idx = {} # a mapping that maps category to its corresponding (arbitray) index
for i, k in zip(range(N), catD.keys()):
  cat2idx[k] = i
prod_labels = []
# labs is the v1 category_labels.pkl, now deleted 
# for p, n, l in zip(catdf['product'], catdf['name'], labs):
#   segn = l['seg_name']
#   labels = np.zeros(N)
#   for x in p.split(';'):
#     if len(x.strip()) > 0:
#       catid = cat2idx[x]
#       labels[catid] = 1
#   prod_labels.append({'name':n, 'labels': labels, 'seg_name':segn, 'product': p.split(';')})

In [5]:
# check category number 
all(len(x['labels']) == 217 for x in prod_labels)

True

In [6]:
## fixing the category number bug
# import joblib
# joblib.dump(prod_labels, f'{datadir}/category/category_labels_v2.pkl')
# joblib.dump(cat2idx, f'{datadir}/category/cat2idx.pkl')

## Loading Fasttext model
`model.wv.most_similar()`

This method computes cosine similarity between a simple mean of the projection weight vectors of the given words and the vectors for each word in the model. The method corresponds to the word-analogy and distance scripts in the original word2vec implementation.

In [7]:
fmodelpath = f'{modeldir}/fasttext.model'
from gensim.models import FastText
model = FastText.load(fmodelpath) 

In [8]:
model.wv.most_similar('人工淚液')

[('人工', 0.8812408447265625),
 ('舒點眼液', 0.8710706830024719),
 ('淚液', 0.866841197013855),
 ('參天', 0.8561886548995972),
 ('賜眼康點眼液', 0.8525574207305908),
 ('海滴', 0.8522036075592041),
 ('舒坦', 0.8486995697021484),
 ('朗點眼液', 0.8437364101409912),
 ('優點眼液', 0.8399668335914612),
 ('樂眼液', 0.8392975926399231)]

In [9]:
model.wv.most_similar('筆記型電腦')

[('SM-G', 0.9681061506271362),
 ('FZSGBRI', 0.9648809432983398),
 ('慕南', 0.9646647572517395),
 ('NZSABRI', 0.964486837387085),
 ('FZIGBRI', 0.9633705019950867),
 ('FZKGBRI', 0.9629587531089783),
 ('NZDEBRI', 0.9622004628181458),
 ('A515-55G-59', 0.9602923393249512),
 ('TCL', 0.957550585269928),
 ('LENOVOIdeaPad', 0.9565285444259644)]

### Attempt 1: String exact match 
may serve as a baseline 

以FastText內建的`model.wv.most_similar(c)` ∀ category c，存下最相似的前20個(`TOPN`) 詞彙與對應的機率（as recorded in model.wv）作為關鍵詞表（存在`cat2kw`）。隨後對每個product name，string match所有category對應的詞表（手動加入category本身詞彙，權重設定為100，以機率來看= 10000%，幾乎等於一定選它），然後將有match到的詞彙對應的機率加總（cat_hit_probs）。
對每個name，argmax cat_hit_probs最大者作為所屬category。
1. 只選單一category（argmax）。
2. 以下只random sample了10%的資料約9000筆來實驗。

Macro F1 Result: 0.39

In [10]:
!pip install -q -U ckip-transformers
from ckip_transformers.nlp import CkipWordSegmenter

[K     |████████████████████████████████| 4.0 MB 34.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 41.9 MB/s 
[K     |████████████████████████████████| 895 kB 71.0 MB/s 
[K     |████████████████████████████████| 77 kB 6.1 MB/s 
[K     |████████████████████████████████| 596 kB 69.4 MB/s 
[?25h

In [11]:
ws_driver = CkipWordSegmenter(level=3) # device 0 specifies GPU 

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/388M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
cat2idx = joblib.load(f'{datadir}/category/cat2idx.pkl')
cat_dict = joblib.load(f'{datadir}/category/category_labels_v2.pkl')

inverted map

In [45]:
idx2cat = {v:k for k, v in cat2idx.items()}

In [46]:
catkeys = list(cat2idx.keys())
cat2ws = {} # mapping: category to its segmented token 
ws = ws_driver(catkeys)
for k, segk in zip(catkeys, ws):
  cat2ws[k] = segk
joblib.dump(cat2ws, f'{datadir}/category/cat2ws.pkl')

Tokenization: 100%|██████████| 217/217 [00:00<00:00, 29982.01it/s]
Inference: 100%|██████████| 1/1 [00:09<00:00,  9.39s/it]


['/content/drive/MyDrive/FinTech-final-project/data/category/cat2ws.pkl']

In [47]:
# check if the category has an embedding in fasttext model
# if yes, take its most_similar phrase list
# otherwise, try get its tokens' most_similar phrase list (extend)

cat2keywords= defaultdict(list)
for cat in cat2idx.keys():
  if cat in model.wv:
    cat2keywords[cat] = model.wv.most_similar(cat, topn = TOPN)
  else:
    tokens = cat2keywords[cat]
    for token in tokens:
        cat2keywords.extend(model.wv.most_similar(token, topn = TOPN))

In [48]:
list(cat2keywords.items())[:5]

[('人工淚液',
  [('人工', 0.8812408447265625),
   ('舒點眼液', 0.8710706830024719),
   ('淚液', 0.866841197013855),
   ('參天', 0.8561886548995972),
   ('賜眼康點眼液', 0.8525574207305908),
   ('海滴', 0.8522036075592041),
   ('舒坦', 0.8486995697021484),
   ('朗點眼液', 0.8437364101409912),
   ('優點眼液', 0.8399668335914612),
   ('樂眼液', 0.8392975926399231),
   ('卸眼液', 0.8358579277992249),
   ('朗眼液', 0.8333317041397095),
   ('眼液', 0.8318336009979248),
   ('抗皺', 0.8296966552734375),
   ('試管', 0.8239986896514893),
   ('到期日', 0.821074903011322),
   ('鹽巴', 0.8204549551010132),
   ('2β', 0.8181430101394653),
   ('T+', 0.817972719669342),
   ('R視', 0.8170994520187378)]),
 ('中式香腸',
  [('SSO', 0.9588946104049683),
   ('三島', 0.9545806646347046),
   ('IKIGAI', 0.953130841255188),
   ('開始', 0.9530795812606812),
   ('惠百氏', 0.9530603289604187),
   ('薇拉', 0.9504801630973816),
   ('SS｜S｜', 0.9502218961715698),
   ('湯包麵', 0.9494392275810242),
   ('巨無霸', 0.9492870569229126),
   ('濾滴', 0.9490464925765991),
   ('SSO2', 0.9489832520484

In [49]:
# clean the cat2keywords 
# (remove probabilities and duplicates, add itself)
# clean too-short keywords <- 效果不好
cat2kw = {}
for k, v in cat2keywords.items():
  clean_v = list(set(x[0] for x in v))
  clean_v_probs = {x[0]:x[1] for x in v} # 如果x[0]有重複出現，機率那邊有點麻煩但我們先不處理
  if k not in clean_v:
    clean_v.append(k)
    clean_v_probs[k] = 100 # 100% similar # guarantee to take it
  cat2kw[k] = (clean_v, clean_v_probs)
# cat2kw here is a dictionary of category to its phrase list 

In [50]:
# use regex to find the exact match 
# sampling 0.1 ratio (for speed)
instances_num = len(catdf)
samp_num = int(instances_num/10)
randidx = random.sample(range(instances_num), samp_num)
samples = [cat_dict[i] for i in randidx]

In [51]:
len(samples)

9230

In [52]:
samples[0]

{'categories': ['潤髮乳'],
 'labels': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [53]:
import numpy as np
import re

In [54]:
# phrases: the phrases belonging to the category's most similar 
# re.findall(r"(?=("+'|'.join(phrases)+r"))", x)

# hit_probs

for i, sample in enumerate(samples):
    hit_vector = np.zeros(N)
    name = sample['name']
    for cat, pair in cat2kw.items():
        # v: phraselist belonging to k (category)
        # (?=(喵喵|咩咩))
        ph_list, ph_probs = pair
        pat = re.compile(r'(?:{})'.format('|'.join(map(re.escape, ph_list))))
        found = re.findall(pat, name)
        # the current category hits' probabilites
        cat_hit_probs = sum(ph_probs[x] for x in found)
        catid = cat2idx[cat]
        hit_vector[catid] = cat_hit_probs
    samples[i]['hitsprobs'] = hit_vector 
    

In [55]:
'''randomly pick some instances to examine the result'''
import random
rand = random.randint(0, len(samples))
print('index:', rand)
print(np.where(samples[rand]['hitsprobs'] > 0))
print(np.where(samples[rand]['labels'] ==1))

index: 1537
(array([165, 210]),)
(array([165]),)


In [56]:
x = rand
predcat = np.argmax(samples[x]['hitsprobs'])
print(idx2cat[predcat])
samples[x]['hitsprobs'] # 挑argmax，先暫時當one-label　

滑鼠


array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.  

**First try argmax**

-> simplify the problem into one-label task 
(multiclass, one answer) 

In [57]:
from sklearn.metrics import f1_score
y_true, y_pred = [], []
y_true_labels, y_pred_labels = [], []
for sample in samples:
  predcat= np.argmax(sample['hitsprobs'])
  pred = np.zeros(N); pred[predcat] = 1
  y_pred.append(pred) 
  y_pred_labels.append(idx2cat[predcat])
  y_true_labels.append(sample['product'])
  y_true.append(sample['labels'])

In [58]:
import random
r = random.randint(0, len(samples))
print('index:', r)
print(y_pred_labels[r])
print(y_true_labels[r])

index: 8727
即飲烏龍茶
['即飲無糖茶']


In [60]:
# 10% data; macro f1: 39%
print('10% ratio dataset macro f1:')
print(f1_score(y_true, y_pred,
                  average = 'macro'))
print('10% ratio dataset weighted f1:\n', f1_score(y_true, y_pred,
                  average = 'weighted'))

10% ratio dataset macro f1:


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.38226224614030324
10% ratio dataset weighted f1:
 0.48822762712050605


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


## Clustering idea: 如何找centroids？
1. Method 1: (by word) ∀ category c，以category name直接找對應的embedding作為c的representative embedding (centroid)。
2. Method 2.1: (by member)
∀ category c，將屬於c下的每個member找切分train:test，以train split內的members找到平均embedding，隨後取他們的average作為該c的centroid。
3. Method 2.2: (by member)
∀ category c，將屬於c下的每個member找切分train:test，以train split內的members找到權重embedding，隨後取他們的average作為該c的centroid。所謂權重embedding，是因為資料內真正和category相關的token通常落在name的中後段，如「視舒坦人工淚液點眼液10ML」（人工淚液）、「697155@霧峰區農會酒莊酒粕香腸600G」（中式香腸），因此將後半段token的權重加乘至W倍（也可想其他加權法）。



Clustering idea: 找到centroids之後？應該有不只一種方法可以嘗試。
### Attempt 2: Clustering 2.2
centroid-finding Method 2.2然後對每個test split內的instance，同樣計算權重embedding，然後用distance function計算其和每個category centroids的距離，切一個距離threshold，將<=的都算1，>的算0。
取10%資料：

Macro F1 Result -> 0.28

In [28]:
name2vec = {}
EMBDIM = 200 # see notebook 12.0 
W = 4

In [29]:
def get_avg_embeddings(tokenlist):
  # if no token has embedding available, yields np.zeros(emb_dim)
  return np.mean([model.wv[tok] if tok in model.wv else np.zeros(EMBDIM) for tok in tokenlist], axis = 0)

def get_weighted_embeddings(tokenlist):
  # ref: https://stackoverflow.com/questions/29330792/weighted-averaging-a-list
  rate = np.ones(len(tokenlist))
  mid = int(len(rate)/2)
  # category name通常出現在產品名稱後半部
  rate = np.append(rate[:mid], (rate[mid:]+(W-1)))
  X = [model.wv[tok] if tok in model.wv else np.zeros(EMBDIM) for tok in tokenlist]
  return np.average(X, 
                    axis = 0,
                    weights=rate)

In [30]:
# step 1. compute weighted embeddings and save into cat_dict
'''
for i in range(len(cat_dict)):
  instance = cat_dict[i]
  toklist = instance['seg_name']
  wemb = get_weighted_embeddings(toklist)
  cat_dict[i]['weighted_emb'] = wemb 
  if not np.any(wemb): # if it is a zero vector
    print(f"{instance['name']} has no token embedding")
'''

'\nfor i in range(len(cat_dict)):\n  instance = cat_dict[i]\n  toklist = instance[\'seg_name\']\n  wemb = get_weighted_embeddings(toklist)\n  cat_dict[i][\'weighted_emb\'] = wemb \n  if not np.any(wemb): # if it is a zero vector\n    print(f"{instance[\'name\']} has no token embedding")\n'

In [31]:
# record category lists for each instance into cat_dict
'''
for i in range(len(cat_dict)):
   instance = cat_dict[i]
   cats = catdf.iloc[i]['product']
   cats = cats.split(';')
   cat_dict[i]['categories'] = cats
'''

"\nfor i in range(len(cat_dict)):\n   instance = cat_dict[i]\n   cats = catdf.iloc[i]['product']\n   cats = cats.split(';')\n   cat_dict[i]['categories'] = cats\n"

In [32]:
# save back into cat_dict
'''
joblib.dump(cat_dict, f'{datadir}/category/category_labels_v2.pkl')
'''

"\njoblib.dump(cat_dict, f'{datadir}/category/category_labels_v2.pkl')\n"

In [33]:
TESTSIZE = 0.1
SEED = 42

In [34]:
# step 2. train-test split
from sklearn.model_selection import train_test_split
X = [(x['weighted_emb'], x['name']) for x in cat_dict]
y = [x['labels'] for x in cat_dict]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = TESTSIZE, random_state=SEED)
# 感覺要確保一下stratify

In [35]:
# step 3. compute centroids
member_count = np.zeros((217,))
centroids = np.zeros((217, 200))
for (emb,_), labels in zip(X_train, y_train):
    indices = np.nonzero(labels)[0].tolist()
    for catid in indices:
      member_count[catid] += 1
      centroids[catid, :] += emb
# averaging 
for catid in range(N):
  try:
    centroids[catid, :] = centroids[catid, :]/member_count[catid]
  except: 
    print(f'Category idx2cat[catid] has no training member.')

In [36]:
# naked eye看stratification（至少每類要train set裡出現一次）
member_count.astype(int)

array([ 116,  262,  792, 1870,    3,  262,  216,   18,  138,  255,  739,
         53,   62,  203,   16,  197,   45, 1173,  400,  374,  209,  129,
         25,  257,  116,   19,   82,   74,   76,  488,  371, 3302,  224,
        522,  398,  112,  518,  615,  769,  512,  296,  401,  510,  230,
        496,   48, 2808, 2554,  363, 1909,  318,  275, 1683,  147,  313,
        403,  440,   39,  289,  101,  166,    3,  141,  224,  251,  109,
         16,   37,   10,  120, 1390,   69, 1266,  411,  208,  209,    3,
        516,   27,   66,  728,  787,   20,  115,  331,  130,  420,  278,
         23,  117,   55, 1089,   44,   92,  150,  710,  630,  108,   63,
         12,   67,    2,  352,   13, 1020,  147, 1064,  887,   22,   82,
         53,  454,   82,   31,   59,  102,   16,  181,  124,  350,  514,
         15,   57,    4, 1799,   87,  528,  632, 2132,  777, 1804,   70,
        307, 1220,  161,   28,  284,  139,  156, 1433,  141,   21,  170,
         41,   23,  333,   15,  230,  245,  121,  1

In [42]:
# step 4. compute distances (for each test instance, compute for each class)
# m: testing #
# N: category #
# dist_time_comp # time complexity for calling distance function once
# O(m*N*dist_tim_comp)
from numpy import linalg as LA # 2-norm for Euclidean
# LA.norm
TESTNUM = len(X_test)
Dist = np.zeros((TESTNUM, N))
for i in range(TESTNUM):
  x_test, x_name = X_test[i]
  for catid in range(N):
    centroid = centroids[catid, :]
    # print(x_test.shape, centroid.shape)
    dist = abs(LA.norm(x_test - centroid))
    if dist < 1e-2:
      print(idx2cat[id], x_name)
    Dist[i, catid] = dist

In [43]:
# to-do: 
# step 5. STATIC assign member by distance function # the closer the better
gap = 0.1
predictions = []
for i in range(TESTNUM):
    _, x_name = X_test[i]
    real_classes = [idx2cat[e] for e in np.nonzero(y_test[i])[0]]
    labels = np.zeros(N)
    c1_dist, c1 = np.min(Dist[i, :]), np.argmin(Dist[i, :])
    # temporarily mask the argmin class
    Dist[i, c1] = float('inf')
    c2_dist, c2 = np.min(Dist[i, :]), np.argmin(Dist[i, :])
    labels[c1] = 1
    # revise it back 
    Dist[i, c1] = c1_dist
    if i < 10: 
       print(f' * {i+1}. testing instance:', x_name, '\n\ttrue class:',real_classes)
       print(f'\tpredicted class:',idx2cat[c1])
    if abs(c1_dist - c2_dist) < gap:
            labels[c2] = 1
            if i < 10:
                print('\tpredicted class 2:',idx2cat[c2])
    predictions.append(labels)

 * 1. testing instance: 台茶12號紅茶 
	true class: ['即飲紅茶']
	predicted class: 即飲紅茶
 * 2. testing instance: 【adidas 愛迪達】ADIDAS  運動鞋 ULTRABOOST 20 City Pack Hype 中 跑步鞋 白(FX7816) 
	true class: ['運動鞋']
	predicted class: 跑步機
 * 3. testing instance: GATSBY潔面濕紙巾(控油型)超值包 42枚 
	true class: ['濕紙巾']
	predicted class: 人工淚液
 * 4. testing instance: 七喜汽水330ml 
	true class: ['碳酸飲料']
	predicted class: 氣泡水
 * 5. testing instance: 【白蘭】4X酵素極淨超濃縮洗衣精 1+6件組(瓶裝2.4KGx1+補充包1.5KGx6)奈米除菌-網 
	true class: ['洗衣精']
	predicted class: 洗衣球
 * 6. testing instance: 薑軍生薑洗髮精750ml 
	true class: ['洗髮精']
	predicted class: 洗髮精
 * 7. testing instance: 759871@J M 魔爪芒果能量碳酸飲料35 
	true class: ['維他命飲料']
	predicted class: 維他命飲料
 * 8. testing instance: 媚比琳反孔特霧無瑕嫩粉餅 
	true class: ['粉餅']
	predicted class: 粉餅
 * 9. testing instance: 【厝味乾拌麵】油醋小村釀(雙花粗麵) 
	true class: ['快煮麵/乾拌麵']
	predicted class: 熱水器
 * 10. testing instance: 『靖靖水晶礦石』   ST 2000 晶鑽 水晶 保養液 正品 玉礦石 皮革 木雕 木製家具 清潔 增亮 不殘留 
	true class: ['洗衣精']
	predicted class: 甲油膠


In [44]:
# step 6. metrics: calculate macro f1
f1_score(y_test, predictions, average = 'macro')

0.2809276066918831

In [41]:
f1_score(y_test, predictions, average = 'weighted')

0.4804689393722742

In [40]:
len(predictions)

9231