In [61]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import jieba
from ckiptagger import WS
import ast
import pickle
import re

## Hyper Parameter

In [59]:
DATASET = "IVR"
WS_SET = ["jieba", "ckip", "mix_0", "mix_1"]
WORD_SEGMENTER = WS_SET[1]

## Utility Function

In [80]:
def get_tf(corpus, data):
    with open("extend_dataset/IVR/tags.pkl", 'rb') as fp:
        tags = pickle.load(fp)

    tf_list = []
    for texts in corpus:
        tf = [0] * len(tags)
        for text in texts:
            i = 0
            for key, value in tags.items():
                value = sorted(value, key=len, reverse=True)
                value = [v for v in value if len(v)==len(text)]
                if value == []:
                    i += 1
                    continue
                big_regex = re.compile('|'.join(map(re.escape, value)))
                temp = big_regex.findall(text)
                tf[i] += len(temp)
                i += 1
        tf_list += [tf]
    data["tf"] = tf_list
    return data

In [3]:
def get_similarity(test_tfidf, train_tfidf, num_labels=None):
    if num_labels is None:
        sim_array = cosine_similarity(test_tfidf, train_tfidf)
    else:
        train_tfidf_by_class = []
        start_idx = 0
        for i in num_labels:
            train_tfidf_by_class += [train_tfidf[start_idx : start_idx+i].mean(axis=0)]
            start_idx += i
        train_tfidf_by_class = np.stack(train_tfidf_by_class)
        sim_array = cosine_similarity(test_tfidf, train_tfidf_by_class)
    
    return np.array(sim_array)

In [4]:
def get_prediction(sim_array, test_data, train_data, sim_mode):
    correct = 0
    for i, ans in enumerate(test_data["label"]):
        if sim_mode == 0:
            sorted_arg = np.argsort(sim_array[i])
            sorted_arg = np.flip(sorted_arg)[:10] # 10 for top 10 similar
            sorted_pred = [train_data["label"][idx] for idx in sorted_arg]
            pred = max(set(sorted_pred), key=sorted_pred.count)
        else:
            pred = np.argmax(sim_array[i])
    
        if pred == ans:
            correct += 1
    acc = correct / len(test_data["label"])
    return acc

## Data Preprocess

In [62]:
if WORD_SEGMENTER == "ckip" or WORD_SEGMENTER.find("mix") != -1:
    ws = WS("../ckiptagger/data")

def get_model_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
#     df = df.sort_values(by=["labels"], ignore_index=True)
    src_labels = sorted(set(df.labels.tolist()))
    df["labels"] = [src_labels.index(l) for l in df.labels.tolist()]
    texts = df["texts"]
    labels = df["labels"]
    num_labels = list(Counter(labels).values())
    
    data = {"corpus":[], "label":[], "src_texts":[], "src_label":[]}
    
    for i, t in (enumerate(texts)):
        label = labels[i]

        if WORD_SEGMENTER == "ckip":
            sentence_seg = ws([t])[0]
        elif WORD_SEGMENTER == "jieba":
            sentence_seg = jieba.lcut(t)
        elif WORD_SEGMENTER == "mix_0":
            temp = ws([t])[0]
            sentence_seg = []
            for seg_t in temp:
                sentence_seg += jieba.lcut(seg_t)
        elif WORD_SEGMENTER == "mix_1":
            temp = jieba.lcut(t)
            sentence_seg = []
            for seg_t in temp:
                sentence_seg += ws([seg_t])[0]
    
        sentence_seg = [seg_t for seg_t in sentence_seg if seg_t!=' ']
#         seg_texts = ' '.join(sentence_seg)
        
        data["corpus"] += [sentence_seg]
        data["label"] += [label]
        data["src_texts"] += [t]
        data["src_label"] += [src_labels[label]]
    return data, num_labels # Dict[List], List[Int]



In [63]:
data_train, num_labels = get_model_data(f"data/{DATASET}/train.tsv")

In [64]:
data_test, _ = get_model_data(f"data/{DATASET}/test.tsv")

In [81]:
data_train = get_tf(data_train["corpus"], data_train)

In [82]:
data_test = get_tf(data_test["corpus"], data_test)

In [84]:
len(data_train["tf"])

2144

## Model

In [89]:
tfidf_transformer = TfidfTransformer(sublinear_tf=True)

# df = pd.read_csv(f"data/{DATASET}/train.tsv", sep='\t')
# df.tf = df.tf.apply(ast.literal_eval)

train_tf = np.stack(data_train["tf"])
train_tfidf = tfidf_transformer.fit_transform(train_tf)
idf = tfidf_transformer.idf_
# idf = np.ones(train_tfidf.shape[1])

print(train_tfidf.shape)

(2144, 107)


In [90]:
# df = pd.read_csv(f"data/{DATASET}/test.tsv", sep='\t')
# df.tf = df.tf.apply(ast.literal_eval)

test_tf = np.stack(data_test["tf"])
test_tfidf = (np.ma.log(test_tf) + 1) * idf
test_tfidf = test_tfidf.filled(0)

test_tfidf = normalize(test_tfidf, norm='l2')
print(test_tfidf.shape)

(548, 107)


In [91]:
# SIM_MODE = 0
sim_array = get_similarity(test_tfidf, train_tfidf)
print(sim_array.shape)
get_prediction(sim_array, data_test, data_train, 0)

(548, 2144)


0.6496350364963503

In [92]:
# SIM_MODE = 1
sim_array = get_similarity(test_tfidf, train_tfidf, num_labels)
print(sim_array.shape)
get_prediction(sim_array, data_test, data_train, 1)

(548, 63)


0.0018248175182481751

---