## Import Libraries for preprocessing

In [7]:
import re
import requests
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import jieba
from sklearn.model_selection import train_test_split

## Preprocessing

```
 預處理步驟：
 1. 讀取 csv 檔案中的 item names -> 取得所有產品的 item names。
 2. 因在本機運算(使用cpu)，使用少部分資料進行模型訓練測試。
 3. 準備訓練、測試資料集 -> 因我們的資料為 unsupervised，故以jieba+預處理的結果作為斷詞之 ground_truth。
```

In [2]:
def getItems(file_name, item_col):
    df = pd.read_csv(file_name)
    return df[item_col].tolist()
    
def filterItem(item_name, stopwords, all_items):
    item_idx = [idx for idx, name in enumerate(all_items) if item_name in name.lower()]
    item_list = [all_item_names[idx].lower() for idx in item_idx]
    
    filter_item_name = []
    for item in item_list:
        item_name = []
        item = re.sub(r'[^\w]', ' ', item) # remove symbols
        item = re.sub(r"\s+", " ", item)  # remove 一個以上的空白字符
        split_item = item.strip().split(' ')
        
        for word in split_item:
            if word in stopwords:
                continue
            else:
                item_name.append(word)
        filter_item_name.append(" ".join(item_name))
    
    return filter_item_name

In [3]:
#1. get items
all_item_names = getItems('mall_all_item.csv', 'item_name')

# 2. filer items
item_name = 'iphone'
stopwords = []
filter_item_list = filterItem(item_name, stopwords, all_item_names)
print("Length of the filter items: ", len(filter_item_list))
print()
print(filter_item_list[:5])

Length of the filter items:  10114

['apple iphone 11 128gb 6 1吋 白 黑 紅 黃 紫 綠 神腦生活', 'apple iphone 11 pro max 256gb 6 5吋 灰 銀 金 綠 神腦生活', 'apple iphone xs 256gb 下殺8折', 'apple iphone 11 64gb 6 1吋 紅 白 黑 黃 綠 紫 神腦生活', 'apple iphone 11 pro max 64gb 6 5吋 灰 銀 金 綠 神腦生活']


In [4]:
# 3.the function that tag the word
def words_to_tags(sent):
    tags = []
    for word in sent:
        if len(word) == 1:
            tags.append('S')
        else:
            for i in range(len(word)):
                if i == 0:
                    tags.append('L')
                elif i == len(word) - 1:
                    tags.append('R')
                else:
                    tags.append('M')                    
    return tags

In [8]:
# 3. the function for preparing the train, test dataset
def prepareDataset(item_list):
    X = []
    y = []
    raw = []
    
    word_to_ix = {}
    word_to_ix['UNK'] = 0
    
    for item in item_list:
        split_item_sent = list(jieba.cut(item))
        raw.append(split_item_sent)
        
        sent_words = list("".join(split_item_sent))
        for word in sent_words:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
        
        X.append(sent_words)
        y.append(words_to_tags(split_item_sent))
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, \
                                            test_size=0.2, random_state=42)
    
    raw_train, raw_test = train_test_split(raw, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, raw_train, raw_test, word_to_ix

X_train, X_test, y_train, y_test, raw_train, raw_test, word_to_ix = prepareDataset(filter_item_list)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\SAM~1.HSI\AppData\Local\Temp\jieba.cache
Loading model cost 1.261 seconds.
Prefix dict has been built successfully.


------------------------------------------------------

## Create a CRF model for word segmentation 

```
 使用 CRF model 去建構 word segmentation 的 feature function
 有許多不同的 package 支援 CRF model 運算，而在此使用的是 sklearn_crfsuite。
 
 CRF feature function 捕捉之資訊：
 1. 當前字詞前3個字，與後4個字。
 2. 此單字是否為第一個字 or 最後一個字。
 3. bigram, trigram 資訊。
```

In [9]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics

def extract_char_features(sent, position):
    char_features = {}
    for i in range(-3, 4):
        if len(sent) > position + i >= 0:
            char_features['char_at_%d' % i] = sent[position + i]
    
    if position == 0:
        char_features['First_word']: True
        char_features['Last_word'] : False
        pass
    
    elif position == 1:
        bigram = sent[position-1] + sent[position]
        
        char_features['bigram'] = bigram
        char_features['First_word']: False
        char_features['Last_word'] : False
    
    elif position == len(sent)-1:
        bigram = sent[position-1] + sent[position]
        trigram = sent[position-2] + sent[position-1] + sent[position]
        char_features['bigram'] = bigram
        char_features['trigram'] = trigram
        char_features['First_word']: False
        char_features['Last_word'] : True
        
    else:
        bigram = sent[position-1] + sent[position]
        trigram = sent[position-2] + sent[position-1] + sent[position]
        
        char_features['bigram'] = bigram
        char_features['trigram'] = trigram
        char_features['First_word']: False
        char_features['Last_word'] : False
    
    return char_features

In [10]:
def extract_sent_features(sent):
    sent_features = []
    
    for i in range(len(sent)):
        sent_features.append(extract_char_features(sent, i))
        
    return sent_features

## Prepare the feature of training set to fit the CRF model

In [11]:
crf_tagger = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                  min_freq=20,
                                  max_iterations=300,
                                  c1=10,
                                  c2=0.1,
                                  verbose=True)

feature_X = []
for sent in tqdm_notebook(X_train):
    feature_X.append(extract_sent_features(sent))

HBox(children=(IntProgress(value=0, max=8091), HTML(value='')))




## Fit CRF model

In [None]:
crf_tagger.fit(feature_X, y_train)

loading training data to CRFsuite: 100%|█████████████████████████████████████████| 8091/8091 [00:04<00:00, 1737.18it/s]



Feature generation
type: CRF1d
feature.minfreq: 20.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 11003
Seconds required: 1.547

L-BFGS optimization
c1: 10.000000
c2: 0.100000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.50  loss=438682.15 active=9570  feature_norm=1.00
Iter 2   time=0.25  loss=219682.90 active=9470  feature_norm=7.34
Iter 3   time=0.26  loss=138347.61 active=9012  feature_norm=8.75
Iter 4   time=0.26  loss=109300.22 active=9233  feature_norm=9.26
Iter 5   time=0.26  loss=92615.43 active=9304  feature_norm=9.95
Iter 6   time=0.23  loss=83522.89 active=8977  feature_norm=10.73
Iter 7   time=0.20  loss=74198.92 active=8589  feature_norm=11.95
Iter 8   time=0.21  loss=60168.15 active=7797  feature_norm=14.17
Iter 9   time=0.22  loss=58336.94 active=7350  feature_norm=15.91
Iter

Iter 119 time=0.24  loss=27018.58 active=1526  feature_norm=42.89
Iter 120 time=0.22  loss=27016.37 active=1519  feature_norm=42.94
Iter 121 time=0.22  loss=27012.39 active=1515  feature_norm=42.98
Iter 122 time=0.24  loss=27010.51 active=1512  feature_norm=43.03
Iter 123 time=0.26  loss=27006.93 active=1511  feature_norm=43.07
Iter 124 time=0.23  loss=27004.94 active=1509  feature_norm=43.11
Iter 125 time=0.21  loss=27002.06 active=1510  feature_norm=43.14
Iter 126 time=0.22  loss=26999.65 active=1510  feature_norm=43.18
Iter 127 time=0.22  loss=26997.28 active=1508  feature_norm=43.21
Iter 128 time=0.23  loss=26994.64 active=1502  feature_norm=43.26
Iter 129 time=0.25  loss=26992.21 active=1492  feature_norm=43.29
Iter 130 time=0.24  loss=26989.81 active=1490  feature_norm=43.34
Iter 131 time=0.26  loss=26987.18 active=1490  feature_norm=43.37
Iter 132 time=0.23  loss=26985.29 active=1483  feature_norm=43.42
Iter 133 time=0.20  loss=26982.83 active=1481  feature_norm=43.45
Iter 134 t

Iter 243 time=0.45  loss=26903.92 active=1311  feature_norm=44.99
Iter 244 time=0.49  loss=26903.62 active=1312  feature_norm=45.00
Iter 245 time=0.46  loss=26903.38 active=1312  feature_norm=45.00
Iter 246 time=0.46  loss=26903.08 active=1311  feature_norm=45.00
Iter 247 time=0.48  loss=26902.85 active=1311  feature_norm=45.01
Iter 248 time=0.47  loss=26902.53 active=1312  feature_norm=45.01
Iter 249 time=0.46  loss=26902.28 active=1309  feature_norm=45.02
Iter 250 time=0.44  loss=26901.99 active=1309  feature_norm=45.02
Iter 251 time=0.48  loss=26901.74 active=1308  feature_norm=45.02
Iter 252 time=0.44  loss=26901.44 active=1306  feature_norm=45.03
Iter 253 time=0.51  loss=26901.21 active=1305  feature_norm=45.03
Iter 254 time=0.51  loss=26900.93 active=1306  feature_norm=45.03
Iter 255 time=0.48  loss=26900.70 active=1302  feature_norm=45.03
Iter 256 time=0.52  loss=26900.41 active=1302  feature_norm=45.03
Iter 257 time=0.40  loss=26900.12 active=1300  feature_norm=45.04
Iter 258 t

In [338]:
def segment(sent):
    tags = crf_tagger.predict_single(extract_sent_features(list(sent)))
    tokens = []
    tok = ""
    for ch, tag in zip(list(sent), tags):
        if tag in ['S', 'L'] and tok != "":
            tokens.append(tok)
            tok = ""
        tok += ch
    if tok:
        tokens.append(tok)
    return tokens

In [339]:
def compare(actual_toks, pred_toks):
    i = 0
    j = 0
    p = 0
    q = 0
    tp = 0
    fp = 0
    while i < len(actual_toks) and j < len(pred_toks):
        if p == q:
            if actual_toks[i] == pred_toks[j]:
                tp += 1
            else:
                fp += 1
            p += len(actual_toks[i])
            q += len(pred_toks[j])
            i += 1
            j += 1
        elif p < q:
            p += len(actual_toks[i])
            i += 1
        else:
            fp += 1
            q += len(pred_toks[j])
            j += 1
    return tp, fp, len(actual_toks)
    
def score(actual_sents, pred_sents):
    tp = 0
    fp = 0
    total = 0
    for actual_toks, pred_toks in zip(actual_sents, pred_sents):
        tp_, fp_, total_ = compare(actual_toks, pred_toks)
        tp += tp_
        fp += fp_
        total += total_
    recall = float(tp) / total
    precision = float(tp) / (tp + fp)
    f1 = 2.0 * recall * precision / (recall + precision)
    return recall, precision, f1

In [341]:
pred = []
actual = []
for sent in tqdm_notebook(raw_test):
    pred.append(segment("".join(sent)))
    actual.append(sent)

recall, precision, f1_score = score(actual, pred)

HBox(children=(IntProgress(value=0, max=2023), HTML(value='')))




In [342]:
print("Recall: ", recall)
print("Precision: ", precision)
print("F1 score: ", f1_score)

Recall:  0.9524069358554625
Precision:  0.9621878764368402
F1 score:  0.9572724225835608
