Multi-labeling task: preprocessing data into embeddings 

1. use fasttext  ["Enriching Word Vectors with Subword Information"](https://arxiv.org/abs/1607.04606)
2. build an nn (with bce loss)

## Drive Mounting, directory paths

In [1]:
# !pip -q install bertopic

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
maindir = '/content/drive/MyDrive/FinTech-final-project'
datadir = f'{maindir}/data'
spmdir = f'{maindir}/spm'
cat_df_path = f'{maindir}/東吳課程_發票資料集/品類資料集/cat_train_v2.csv'

## Tokenization: SentencePiece Tokenizer 
trained using all names in category dataset 

not very effective 

In [4]:
!pip install gensim



In [6]:
# 斷詞 
!pip -q install sentencepiece

[?25l[K     |▎                               | 10 kB 21.9 MB/s eta 0:00:01[K     |▌                               | 20 kB 13.3 MB/s eta 0:00:01[K     |▉                               | 30 kB 10.1 MB/s eta 0:00:01[K     |█                               | 40 kB 9.1 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.7 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.6 MB/s eta 0:00:01[K     |██                              | 71 kB 5.7 MB/s eta 0:00:01[K     |██▏                             | 81 kB 4.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 4.8 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.2 MB/s eta 0:00:01[K     |███                             | 112 kB 5.2 MB/s eta 0:00:01[K     |███▎                            | 122 kB 5.2 MB/s eta 0:00:01[K     |███▌                            | 133 kB 5.2 MB/s eta 0:00:01[K     |███▉                            | 143 kB 5.2 MB/s eta 0:00:01[K  

In [7]:
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load(f'{spmdir}/spm_allcats.model')

True

In [8]:
text = 'IM MEME我愛澄露潤唇膏002'
text = 'KissMe花漾美姬瞬翹自然捲纖長防水睫毛膏-02玫瑰棕6g'
text = '龜丸大豆本釀醬油500m'
sp.EncodeAsPieces(text) # not very effective? check why if there's time 

['▁', '龜', '丸', '大', '豆', '本', '釀', '醬', '油', '500', 'm']

## Tokenization: ckip-transformers 


In [9]:
!pip install -q -U ckip-transformers

[K     |████████████████████████████████| 4.0 MB 7.9 MB/s 
[K     |████████████████████████████████| 596 kB 34.1 MB/s 
[K     |████████████████████████████████| 895 kB 42.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 34.8 MB/s 
[K     |████████████████████████████████| 77 kB 5.8 MB/s 
[?25h

In [10]:
from ckip_transformers.nlp import CkipWordSegmenter
ws_driver = CkipWordSegmenter(level=3, device = 0) # device 0 specifies GPU 

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/388M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
import pandas as pd 
catdf = pd.read_csv(cat_df_path)
name_texts = catdf['name'].to_list()
ws = ws_driver(name_texts)

Tokenization: 100%|██████████| 92306/92306 [00:06<00:00, 14579.12it/s]
Inference: 100%|██████████| 361/361 [18:03<00:00,  3.00s/it]


## FastText 
other options: sentenceBERT

In [5]:
# toy-example 
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
print(common_texts[0])
print(len(common_texts))
# model = FastText(size=4, window=3, min_count=1)  # instantiate # vector_size or size??
# model.build_vocab(sentences=common_texts)
# model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train

['human', 'interface', 'computer']
9


In [32]:
ws = [[x.replace(' ', '') for x in sublist] for sublist in ws]

## cleaning spaces 

In [41]:
# ws: word-segmented (tokenized) texts 
# embedding_dim: 200 
# context_window: 5 
model = FastText(size=200, window=5, min_count=1)  # instantiate # vector_size or size??
model.build_vocab(sentences=ws)
model.train(sentences=ws, total_examples=len(ws), epochs=10)  # train

In [24]:
vocab = model.wv.vocab

In [45]:
len(vocab)

72204

In [51]:
list(vocab.keys())[-30:]

['1472130198',
 '284906',
 '789909',
 '西西里亞',
 '山梨',
 '雞尾酒杯',
 '浪漫粉',
 '巴塔希慕思卡多',
 '７％',
 '712545',
 '亞斯提',
 '紀行',
 '庫存貨',
 '馬丁尼杯',
 '香檳杯',
 '三角杯',
 '甜酒杯',
 '高腳杯',
 '果酒杯',
 'Ｋ',
 '扶桑',
 '265545',
 '350ML',
 'ml-夏多內',
 '冬瓜蜜',
 '高球',
 '油嘴',
 '注酒器',
 '橙酒',
 '005 ']

In [29]:
len(ws) == len(catdf) # 約90000筆

True

In [44]:
model.most_similar('舒點眼液')

  """Entry point for launching an IPython kernel.


[('朗點眼液', 0.9927403330802917),
 ('優點眼液', 0.9889668822288513),
 ('樂眼液', 0.9885284304618835),
 ('朗眼液', 0.9881044626235962),
 ('視朗點眼液', 0.9842406511306763),
 ('賜眼康點眼液', 0.9805585741996765),
 ('眼液', 0.9708982706069946),
 ('點眼液', 0.9707175493240356),
 ('視敏眼液', 0.9613741040229797),
 ('卸眼液', 0.9606630802154541)]

In [52]:
import os
fasttextdir = f'{maindir}/models'
os.makedirs(fasttextdir, exist_ok = True)
fasttext_path = f'{fasttextdir}/fasttext.model'
model.save(fasttext_path)