Category 

Author: Nana

Date: 2022.4.16 

In [3]:
from google.colab import drive
drive.mount('/content/drive')
maindir = '/content/drive/MyDrive/FinTech-final-project'
datadir = f'{maindir}/data'
spmdir = f'{maindir}/spm'
modeldir = f'{maindir}/models'
cat_df_path = f'{maindir}/東吳課程_發票資料集/品類資料集/cat_train_v2.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from collections import defaultdict
import numpy as np
import pandas as pd
import joblib
import random

## Fixing some bugs in pickled files

In [14]:
catdf = pd.read_csv(cat_df_path)
catD = defaultdict(int)
for k in catdf['product']:
  for x in k.split(';'):
    catD[x] += 1 

N = len(catD)
cat2idx = {} # a mapping that maps category to its corresponding (arbitray) index
for i, k in zip(range(N), catD.keys()):
  cat2idx[k] = i
prod_labels = []
# labs is the v1 category_labels.pkl, now deleted 
# for p, n, l in zip(catdf['product'], catdf['name'], labs):
#   segn = l['seg_name']
#   labels = np.zeros(N)
#   for x in p.split(';'):
#     if len(x.strip()) > 0:
#       catid = cat2idx[x]
#       labels[catid] = 1
#   prod_labels.append({'name':n, 'labels': labels, 'seg_name':segn, 'product': p.split(';')})

In [20]:
# check category number 
all(len(x['labels']) == 217 for x in prod_labels)

True

In [22]:
## fixing the category number bug
# import joblib
# joblib.dump(prod_labels, f'{datadir}/category/category_labels_v2.pkl')
# joblib.dump(cat2idx, f'{datadir}/category/cat2idx.pkl')

['/content/drive/MyDrive/FinTech-final-project/data/category/cat2idx.pkl']

## Loading Fasttext model
`model.wv.most_similar()`

This method computes cosine similarity between a simple mean of the projection weight vectors of the given words and the vectors for each word in the model. The method corresponds to the word-analogy and distance scripts in the original word2vec implementation.

In [4]:
fmodelpath = f'{modeldir}/fasttext.model'
from gensim.models import FastText
model = FastText.load(fmodelpath) 

In [5]:
model.wv.most_similar('人工淚液')

[('人工', 0.8812408447265625),
 ('舒點眼液', 0.8710706830024719),
 ('淚液', 0.866841197013855),
 ('參天', 0.8561886548995972),
 ('賜眼康點眼液', 0.8525574207305908),
 ('海滴', 0.8522036075592041),
 ('舒坦', 0.8486995697021484),
 ('朗點眼液', 0.8437364101409912),
 ('優點眼液', 0.8399668335914612),
 ('樂眼液', 0.8392975926399231)]

In [10]:
model.wv.most_similar('筆記型電腦')

[('SM-G', 0.9681061506271362),
 ('FZSGBRI', 0.9648809432983398),
 ('慕南', 0.9646647572517395),
 ('NZSABRI', 0.964486837387085),
 ('FZIGBRI', 0.9633705019950867),
 ('FZKGBRI', 0.9629587531089783),
 ('NZDEBRI', 0.9622004628181458),
 ('A515-55G-59', 0.9602923393249512),
 ('TCL', 0.957550585269928),
 ('LENOVOIdeaPad', 0.9565285444259644)]

### String exact match 
may serve as a baseline 

In [6]:
!pip install -q -U ckip-transformers
from ckip_transformers.nlp import CkipWordSegmenter

In [9]:
ws_driver = CkipWordSegmenter(level=3) # device 0 specifies GPU 

In [7]:
cat2idx = joblib.load(f'{datadir}/category/cat2idx.pkl')
cat_dict = joblib.load(f'{datadir}/category/category_labels_v2.pkl')

inverted map

In [175]:
idx2cat = {v:k for k, v in cat2idx.items()}

In [13]:
catkeys = list(cat2idx.keys())
cat2ws = {} # mapping: category to its segmented token 
ws = ws_driver(catkeys)
for k, segk in zip(catkeys, ws):
  cat2ws[k] = segk
joblib.dump(cat2ws, f'{datadir}/category/cat2ws.pkl')

Tokenization: 100%|██████████| 217/217 [00:00<00:00, 21498.07it/s]
Inference: 100%|██████████| 1/1 [00:09<00:00,  9.50s/it]


['/content/drive/MyDrive/FinTech-final-project/data/category/cat2ws.pkl']

In [36]:
# check if the category has an embedding in fasttext model
# if yes, take its most_similar phrase list
# otherwise, try get its tokens' most_similar phrase list (extend)
TOPN = 20

cat2keywords= defaultdict(list)
for cat in cat2idx.keys():
  if cat in model.wv:
    cat2keywords[cat] = model.wv.most_similar(cat, topn = TOPN)
  else:
    tokens = cat2keywords[cat]
    for token in tokens:
        cat2keywords.extend(model.wv.most_similar(token, topn = TOPN))

In [37]:
list(cat2keywords.items())[:5]

[('人工淚液',
  [('人工', 0.8812408447265625),
   ('舒點眼液', 0.8710706830024719),
   ('淚液', 0.866841197013855),
   ('參天', 0.8561886548995972),
   ('賜眼康點眼液', 0.8525574207305908),
   ('海滴', 0.8522036075592041),
   ('舒坦', 0.8486995697021484),
   ('朗點眼液', 0.8437364101409912),
   ('優點眼液', 0.8399668335914612),
   ('樂眼液', 0.8392975926399231),
   ('卸眼液', 0.8358579277992249),
   ('朗眼液', 0.8333317041397095),
   ('眼液', 0.8318336009979248),
   ('抗皺', 0.8296966552734375),
   ('試管', 0.8239986896514893),
   ('到期日', 0.821074903011322),
   ('鹽巴', 0.8204549551010132),
   ('2β', 0.8181430101394653),
   ('T+', 0.817972719669342),
   ('R視', 0.8170994520187378)]),
 ('中式香腸',
  [('SSO', 0.9588946104049683),
   ('三島', 0.9545806646347046),
   ('IKIGAI', 0.953130841255188),
   ('開始', 0.9530795812606812),
   ('惠百氏', 0.9530603289604187),
   ('薇拉', 0.9504801630973816),
   ('SS｜S｜', 0.9502218961715698),
   ('湯包麵', 0.9494392275810242),
   ('巨無霸', 0.9492870569229126),
   ('濾滴', 0.9490464925765991),
   ('SSO2', 0.9489832520484

In [170]:
# clean the cat2keywords 
# (remove probabilities and duplicates, add itself)
# clean too-short keywords <- 效果不好
cat2kw = {}
for k, v in cat2keywords.items():
  clean_v = list(set(x[0] for x in v))
  clean_v_probs = {x[0]:x[1] for x in v} # 如果x[0]有重複出現，機率那邊有點麻煩但我們先不處理
  if k not in clean_v:
    clean_v.append(k)
    clean_v_probs[k] = 100 # 100% similar # guarantee to take it
  cat2kw[k] = (clean_v, clean_v_probs)
# cat2kw here is a dictionary of category to its phrase list 

In [171]:
# use regex to find the exact match 
# sampling 0.1 ratio (for speed)
instances_num = len(catdf)
samp_num = int(instances_num/10)
randidx = random.sample(range(instances_num), samp_num)
samples = [cat_dict[i] for i in randidx]

In [32]:
len(samples)

9230

In [40]:
samples[0]

{'labels': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'name': '思帕可氣泡水

In [44]:
N = 217

In [None]:
import numpy as np
import re

In [172]:
# phrases: the phrases belonging to the category's most similar 
# re.findall(r"(?=("+'|'.join(phrases)+r"))", x)

# hit_probs

for i, sample in enumerate(samples):
    hit_vector = np.zeros(N)
    name = sample['name']
    for cat, pair in cat2kw.items():
        # v: phraselist belonging to k (category)
        # (?=(喵喵|咩咩))
        ph_list, ph_probs = pair
        pat = re.compile(r'(?:{})'.format('|'.join(map(re.escape, ph_list))))
        found = re.findall(pat, name)
        # the current category hits' probabilites
        cat_hit_probs = sum(ph_probs[x] for x in found)
        catid = cat2idx[cat]
        hit_vector[catid] = cat_hit_probs
    samples[i]['hitsprobs'] = hit_vector 

In [217]:
'''randomly pick some instances to examine the result'''
import random
rand = random.randint(0, len(samples))
print('index:', rand)
print(np.where(samples[rand]['hitsprobs'] > 0))
print(np.where(samples[rand]['labels'] ==1))

index: 3858
(array([47]),)
(array([47]),)


In [218]:
x = rand
predcat = np.argmax(samples[x]['hitsprobs'])
print(idx2cat[predcat])
samples[x]['hitsprobs'] # 挑argmax，先暫時當one-label　

洗面乳


array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.81915438, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

**First try argmax**

-> simplify the problem into one-label task 
(multiclass, one answer) 

In [176]:
from sklearn.metrics import f1_score
y_true, y_pred = [], []
y_true_labels, y_pred_labels = [], []
for sample in samples:
  predcat= np.argmax(sample['hitsprobs'])
  pred = np.zeros(N); pred[predcat] = 1
  y_pred.append(pred) 
  y_pred_labels.append(idx2cat[predcat])
  y_true_labels.append(sample['product'])
  y_true.append(sample['labels'])

In [200]:
import random
r = random.randint(0, len(samples))
print('index:', r)
print(y_pred_labels[r])
print(y_true_labels[r])

index: 1551
眉筆
['眉筆']


In [221]:
# 10% data; macro f1: 39%
print('10% ratio dataset macro f1:')
f1_score(y_true, y_pred,
         average = 'macro')

10% ratio dataset macro f1:


0.3910887678421653

### Embedding clustering

In [None]:
def get_avg_embeddings(tokenlist):
  emb_dim = 200 
  # if no token has embedding available, yields np.zeros(emb_dim)
  return np.mean([model.wv[tok] if tok in model.wv else np.zeros(emb_dim) for tok in tokenlist], axis = 0)
