In [1]:
import jieba
import numpy as np
import pandas as pd
from ckiptagger import WS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

## Hyperparameters

In [2]:
# Name of the dataset folder under data/ that holds train.tsv and test.tsv.
DATASET = "IVR"

# Word-segmentation strategies understood by get_model_data:
#   "jieba" / "ckip" use a single segmenter; "mix_0" re-splits ckip tokens
#   with jieba, "mix_1" re-splits jieba tokens with ckip.
WS_SET = [
    "jieba",
    "ckip",
    "mix_0",
    "mix_1",
]

# Strategy used for this run (change the index to switch).
WORD_SEGMENTER = WS_SET[0]

## Utility Functions

In [3]:
def get_tfidf(corpus, voc, idf):
    """Build L2-normalised, log-scaled TF-IDF rows for ``corpus`` over a fixed vocabulary.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorise. Tokens are runs of one or more word
        characters (``\\b\\w+\\b``), so single-character tokens are kept.
    voc : vocabulary passed to ``CountVectorizer(vocabulary=...)``
        Fixes the feature columns and their order.
    idf : array-like with one weight per vocabulary entry
        Multiplied column-wise into the term weights.

    Returns
    -------
    Dense array with one row per document. A term that occurs ``tf`` times is
    weighted ``(ln(tf) + 1) * idf``; absent terms are 0. Rows are then
    L2-normalised.
    """
    vectorizer = CountVectorizer(vocabulary=voc, token_pattern=r"(?u)\b\w+\b")
    counts = vectorizer.fit_transform(corpus).toarray()

    # Only take the log where a term actually occurs; the placeholder 1 keeps
    # np.log away from zero counts, whose weight is forced to 0 afterwards.
    present = counts > 0
    safe_counts = np.where(present, counts, 1)
    weights = np.where(present, (np.log(safe_counts) + 1) * idf, 0.0)

    return normalize(weights, norm='l2')

In [4]:
def get_similarity(test_tfidf, train_tfidf, num_labels=None):
    """Cosine similarity between test rows and training samples or class centroids.

    Parameters
    ----------
    test_tfidf : 2-D array-like or sparse matrix
        One feature row per test sample.
    train_tfidf : 2-D array or scipy sparse matrix
        One feature row per training sample. When ``num_labels`` is given the
        rows must be grouped by class, in the same order as ``num_labels``.
    num_labels : sequence of int, optional
        Number of consecutive training rows belonging to each class. When
        omitted, similarities are computed against every training sample.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_test, n_train)`` without ``num_labels``, otherwise
        ``(n_test, len(num_labels))`` where column ``j`` is the similarity to
        the mean vector of the ``j``-th block of training rows.
    """
    if num_labels is None:
        sim_array = cosine_similarity(test_tfidf, train_tfidf)
    else:
        centroids = []
        start_idx = 0
        for size in num_labels:
            class_mean = train_tfidf[start_idx:start_idx + size].mean(axis=0)
            # Sparse matrices return a (1, n_features) np.matrix from .mean();
            # recent scikit-learn releases reject np.matrix input, so flatten
            # every centroid to a plain 1-D ndarray before stacking.
            centroids.append(np.asarray(class_mean).ravel())
            start_idx += size
        sim_array = cosine_similarity(test_tfidf, np.vstack(centroids))

    return np.asarray(sim_array)

In [5]:
def get_prediction(sim_array, test_data, train_data, sim_mode, top_k=10):
    """Return the accuracy of similarity-based predictions on the test set.

    Parameters
    ----------
    sim_array : 2-D array
        Row ``i`` holds the similarities of test sample ``i``. With
        ``sim_mode == 0`` the columns are training samples; otherwise the
        columns are classes (column ``j`` = class index ``j``).
    test_data, train_data : dict
        Must contain a ``"label"`` list. When both also contain
        ``"src_label"``, predictions and answers are compared through the
        original label values instead of the integer indices. The indices are
        derived separately per file, so they only line up when both files
        contain exactly the same set of labels.
    sim_mode : int
        ``0`` -> majority vote over the ``top_k`` most similar training
        samples; any other value -> class with the highest similarity.
    top_k : int, default 10
        Number of neighbours that vote when ``sim_mode == 0``.

    Returns
    -------
    float
        Fraction of test samples predicted correctly (``0.0`` for an empty
        test set).
    """
    answers = list(test_data["label"])
    if not answers:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    train_targets = train_data["label"]
    compare_by_name = "src_label" in test_data and "src_label" in train_data
    if compare_by_name:
        answers = list(test_data["src_label"])
        # Training class index -> original label value.
        index_to_name = dict(zip(train_data["label"], train_data["src_label"]))
        train_targets = train_data["src_label"]

    correct = 0
    for i, ans in enumerate(answers):
        if sim_mode == 0:
            # Indices of the top_k most similar training samples, best first.
            ranked = np.flip(np.argsort(sim_array[i]))[:top_k]
            votes = Counter(train_targets[idx] for idx in ranked)
            # most_common keeps first-seen order among equal counts, so a tie
            # goes to the label whose best neighbour is the most similar.
            pred = votes.most_common(1)[0][0]
        else:
            pred = np.argmax(sim_array[i])
            if compare_by_name:
                pred = index_to_name.get(pred, pred)

        if pred == ans:
            correct += 1

    return correct / len(answers)

## Data Preprocessing

In [6]:
# The CKIP model is only loaded for the segmenters that call it:
# "ckip" itself and every "mix*" variant.
if WORD_SEGMENTER == "ckip" or "mix" in WORD_SEGMENTER:
    ws = WS("../ckiptagger/data")

def get_model_data(file_path):
    """Load a tab-separated dataset and segment every text with ``WORD_SEGMENTER``.

    The file must provide ``texts`` and ``labels`` columns. Rows are sorted by
    label so that the samples of one class are contiguous, and every distinct
    label is replaced by its position in the sorted list of distinct labels.

    Parameters
    ----------
    file_path : str or path-like
        Location of the TSV file.

    Returns
    -------
    data : dict of lists, one entry per row in sorted-label order
        ``corpus``    - segmented text, tokens joined by a single space
        ``label``     - integer class index
        ``src_texts`` - original text
        ``src_label`` - original label value
    num_labels : list of int
        Number of rows per class, ordered by class index.

    Raises
    ------
    ValueError
        If ``WORD_SEGMENTER`` is not ``"jieba"``, ``"ckip"``, ``"mix_0"`` or
        ``"mix_1"``.
    """
    df = pd.read_csv(file_path, sep='\t')
    df = df.sort_values(by=["labels"], ignore_index=True)

    raw_labels = df["labels"].tolist()
    src_labels = sorted(set(raw_labels))
    # One dict lookup per row instead of a linear list.index() scan.
    label_index = {name: idx for idx, name in enumerate(src_labels)}
    labels = [label_index[name] for name in raw_labels]

    # Per-class sample counts, explicitly ordered by class index.
    class_sizes = Counter(labels)
    num_labels = [class_sizes[idx] for idx in range(len(src_labels))]

    data = {"corpus": [], "label": [], "src_texts": [], "src_label": []}

    for t, label in zip(df["texts"], labels):
        if WORD_SEGMENTER == "ckip":
            sentence_seg = ws([t])[0]
        elif WORD_SEGMENTER == "jieba":
            sentence_seg = jieba.lcut(t)
        elif WORD_SEGMENTER == "mix_0":
            # ckip first, then jieba on every ckip token.
            sentence_seg = []
            for seg_t in ws([t])[0]:
                sentence_seg += jieba.lcut(seg_t)
        elif WORD_SEGMENTER == "mix_1":
            # jieba first, then ckip on every jieba token.
            sentence_seg = []
            for seg_t in jieba.lcut(t):
                sentence_seg += ws([seg_t])[0]
        else:
            # Without this branch sentence_seg would be undefined on the first
            # row, or silently reuse the previous row's tokens.
            raise ValueError(f"Unsupported WORD_SEGMENTER: {WORD_SEGMENTER!r}")

        # Drop single-space tokens so the joined text has one space per boundary.
        sentence_seg = [seg_t for seg_t in sentence_seg if seg_t != ' ']

        data["corpus"].append(' '.join(sentence_seg))
        data["label"].append(label)
        data["src_texts"].append(t)
        data["src_label"].append(src_labels[label])

    return data, num_labels # Dict[List], List[Int]



In [7]:
# Load and segment the training split. num_labels (samples per class) is passed
# to get_similarity later on for the class-centroid comparison.
data_train, num_labels = get_model_data(f"data/{DATASET}/train.tsv")

0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.680 seconds.
Prefix dict has been built succesfully.
286it [00:14, 19.82it/s]


In [8]:
# Load and segment the test split; its per-class counts are not used, so the
# second return value is discarded.
data_test, _ = get_model_data(f"data/{DATASET}/test.tsv")

82it [00:05, 15.58it/s]


## Model

In [9]:
# Fit TF-IDF on the training corpus. The token pattern keeps one-character
# tokens, which scikit-learn's default pattern (two or more characters) drops.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
train_tfidf = tfidf_vectorizer.fit_transform(data_train["corpus"])
idf = tfidf_vectorizer.idf_

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older releases. tolist() keeps voc_list a plain list
# of str, as before.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    voc_list = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    voc_list = tfidf_vectorizer.get_feature_names()
print(train_tfidf.shape)

(286, 557)


In [10]:
# Vectorise the test corpus with the vocabulary and IDF weights learned from
# the training split.
# NOTE(review): get_tfidf weights term frequency as ln(tf) + 1, while the
# TfidfVectorizer used for the training rows is created without sublinear_tf,
# so train and test rows are weighted differently whenever a term repeats
# within a document — confirm this is intended.
test_tfidf = get_tfidf(data_test["corpus"], voc_list, idf)
print(test_tfidf.shape)

(82, 557)


In [11]:
# SIM_MODE = 0
# Compare every test row with every training sample; get_prediction then takes
# a majority vote over the most similar training samples.
sim_array = get_similarity(test_tfidf, train_tfidf)
print(sim_array.shape)
get_prediction(sim_array, data_test, data_train, 0)

(82, 286)


0.8658536585365854

In [12]:
# SIM_MODE = 1
# Compare every test row with the mean training vector of each class (built
# from num_labels); get_prediction then picks the most similar class.
# Note: this overwrites sim_array from the previous cell.
sim_array = get_similarity(test_tfidf, train_tfidf, num_labels)
print(sim_array.shape)
get_prediction(sim_array, data_test, data_train, 1)

(82, 41)


0.9512195121951219

---

In [19]:
# Scratch check: what CountVectorizer counts when the text is NOT segmented.
# The pattern \b\w+\b treats the unbroken string "我的卡片多少錢" as a single
# token, so neither "卡" nor "卡片" is matched and both rows come out as zeros.
# Texts therefore have to be word-segmented (tokens separated by spaces) first.
voc = ["卡", "卡片"]
corpus = ["我的卡片多少錢", "account activate"]
count_vectorizer = CountVectorizer(vocabulary=voc, token_pattern=r"(?u)\b\w+\b")
tf = count_vectorizer.fit_transform(corpus)
tf.toarray()

array([[0, 0],
       [0, 0]])