In [1]:
from sklearn import tree
import pandas as pd 
import ast
import pickle
import numpy as np
import re
from ckiptagger import WS
from collections import Counter
import jieba

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Name of the dataset directory under data/ that the notebook works on.
DATASET = "IVR"

# Supported segmentation strategies: plain jieba, plain CKIP, CKIP followed by
# jieba ("mix_0"), and jieba followed by CKIP ("mix_1").
WS_SET = [
    "jieba",
    "ckip",
    "mix_0",
    "mix_1",
]

# Strategy actually used by get_model_data (index 2 -> "mix_0").
WORD_SEGMENTER = WS_SET[2]

---

In [None]:
# The CKIP model is only loaded when the selected strategy uses it, i.e. for
# "ckip" itself and for every "mix_*" variant.
if "mix" in WORD_SEGMENTER or WORD_SEGMENTER == "ckip":
    ws = WS("../ckiptagger/data")

def _segment_text(text):
    """Split one sentence into tokens using the strategy named by WORD_SEGMENTER.

    Relies on the module-level ``WORD_SEGMENTER`` setting, on ``jieba`` and, for
    every strategy except "jieba", on the module-level CKIP segmenter ``ws``.

    Raises:
        ValueError: if WORD_SEGMENTER is not one of the supported strategies.
    """
    if WORD_SEGMENTER == "ckip":
        return ws([text])[0]
    if WORD_SEGMENTER == "jieba":
        return jieba.lcut(text)
    if WORD_SEGMENTER == "mix_0":
        # CKIP first, then jieba re-splits every CKIP token.
        return [piece for token in ws([text])[0] for piece in jieba.lcut(token)]
    if WORD_SEGMENTER == "mix_1":
        # jieba first, then CKIP re-splits every jieba token.
        return [piece for token in jieba.lcut(text) for piece in ws([token])[0]]
    raise ValueError(
        f"Unsupported WORD_SEGMENTER {WORD_SEGMENTER!r}; "
        "expected one of 'jieba', 'ckip', 'mix_0', 'mix_1'"
    )


def get_model_data(file_path):
    """Read a tab-separated dataset and segment every text into tokens.

    The file must provide a ``texts`` and a ``labels`` column. Labels are
    encoded as integer ids, where the id is the position of the label in the
    sorted list of distinct labels found in this file.

    Args:
        file_path: path of the TSV file to read.

    Returns:
        A ``(data, num_labels)`` tuple.
        ``data`` is a dict of parallel lists:
            "corpus"    - token list per text (single-space tokens removed),
            "labels"    - integer label id per text,
            "src_texts" - the original text,
            "src_label" - the original label value.
        ``num_labels`` is the number of rows per label, listed in the order in
        which the labels first appear in the file (not sorted by label id).

    Raises:
        ValueError: if WORD_SEGMENTER names an unknown strategy.
    """
    df = pd.read_csv(file_path, sep='\t')
    raw_labels = df.labels.tolist()
    src_labels = sorted(set(raw_labels))

    # One dict lookup per row instead of a linear list.index() scan.
    label_to_id = {name: idx for idx, name in enumerate(src_labels)}
    label_ids = [label_to_id[name] for name in raw_labels]
    num_labels = list(Counter(label_ids).values())

    data = {"corpus": [], "labels": [], "src_texts": [], "src_label": []}

    # Pair texts and labels positionally so the result does not depend on the
    # frame's index labels.
    for text, label in zip(df["texts"], label_ids):
        tokens = [token for token in _segment_text(text) if token != ' ']

        data["corpus"].append(tokens)
        data["labels"].append(label)
        data["src_texts"].append(text)
        data["src_label"].append(src_labels[label])
    return data, num_labels  # Dict[List], List[Int]

In [26]:
def get_tf(corpus, data, tags_path="extend_dataset/IVR/tags.pkl", tags=None):
    """Attach per-tag term-frequency vectors to ``data``.

    For every tokenised text in ``corpus`` a vector with one counter per tag is
    built (tags are taken in the iteration order of the ``tags`` mapping). A
    counter is incremented once for each token that is exactly equal to one of
    that tag's keywords; partial or substring matches do not count.

    Args:
        corpus: iterable of token lists, one per text.
        data: dict that receives the result under the key "tf" (mutated in
            place).
        tags_path: pickle file holding the ``{tag: keywords}`` mapping; only
            read when ``tags`` is None.
        tags: optional already-loaded ``{tag: keywords}`` mapping.

    Returns:
        The same ``data`` dict, with ``data["tf"]`` set to a list of count
        vectors parallel to ``corpus``.
    """
    if tags is None:
        # SECURITY: pickle.load can execute arbitrary code. Only point
        # tags_path at files that come from a trusted source.
        with open(tags_path, 'rb') as fp:
            tags = pickle.load(fp)

    # Build the keyword lookup once instead of re-sorting, re-filtering and
    # recompiling a regex for every (token, tag) pair. Matching a token against
    # keywords of exactly the token's length is the same as set membership.
    keyword_sets = [set(keywords) for keywords in tags.values()]

    tf_list = []
    for tokens in corpus:
        tf = [0] * len(keyword_sets)
        for token in tokens:
            for i, keywords in enumerate(keyword_sets):
                if token in keywords:
                    tf[i] += 1
        tf_list.append(tf)
    data["tf"] = tf_list
    return data

In [27]:
# Segment the train and test splits of the selected dataset. Each call returns
# a dict of parallel lists plus the per-label row counts; the counts of the
# test split are discarded.
data_train, num_labels = get_model_data(f"data/{DATASET}/train.tsv")
data_test, _ = get_model_data(f"data/{DATASET}/test.tsv")

In [28]:
# Add the per-tag term-frequency vectors under the "tf" key. get_tf mutates and
# returns the dict it is given, so the names keep pointing at the same objects.
data_train = get_tf(data_train["corpus"], data_train)
data_test = get_tf(data_test["corpus"], data_test)

---

In [3]:
# "seg" variant: train on the vectors computed by get_tf in this notebook.
# vec = np.stack(data_train["tf"]) #seg

# "all" variant: train on the vectors stored in the TSV files. The "tf" column
# holds each vector as the text of a Python list, hence ast.literal_eval.
# NOTE: this rebinds data_train / data_test to DataFrames, replacing any dicts
# produced earlier by get_model_data / get_tf.
data_train = pd.read_csv("data/IVR/train.tsv", sep='\t')
vec = np.stack(data_train["tf"].apply(ast.literal_eval).tolist())
data_train["src_label"] = data_train["labels"]
data_test = pd.read_csv("data/IVR/test.tsv", sep='\t')


# A fixed random_state makes the fitted tree (and therefore the accuracy
# reported below) reproducible: scikit-learn permutes the candidate features at
# every split, so equally good splits are otherwise resolved differently from
# run to run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(vec, data_train["labels"])

In [4]:
# The keys of the pickled tag mapping are used as the names of the tree's
# features; the sorted distinct training labels are used as its class names.
with open("extend_dataset/IVR/tags.pkl", 'rb') as fp:
    feature_names = list(pickle.load(fp).keys())

label_set = sorted(set(data_train["src_label"]))

In [5]:
import graphviz

# Export the fitted tree as DOT source (out_file=None returns it as a string),
# labelling nodes with the tag names and the class names.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_names,
    class_names=label_set,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the DOT source to a file named after 'tree_seg'; the path of the
# rendered file is the cell's output.
graph = graphviz.Source(dot_data)
graph.render(filename='tree_seg')
# graph
# tree.plot_tree(clf)

'tree_seg.pdf'

In [6]:
def get_prediction(clf, test_data, vectors=None, show_err=False,
                   label_names=None, tag_names=None, corpus=None):
    """Evaluate ``clf`` on ``test_data``, print the accuracy and return it.

    Args:
        clf: fitted classifier exposing ``predict(vectors)``.
        test_data: DataFrame with a ``labels`` column (the expected labels), a
            ``tf`` column (feature vectors stored as Python-list text, only
            read when ``vectors`` is None) and a ``texts`` column (only read
            when ``show_err`` is true and ``corpus`` is None).
        vectors: optional precomputed feature matrix, one row per test row.
        show_err: when true, print every misclassified row with its expected
            label, predicted label, text and the names of its non-zero features.
        label_names: label name per integer class id, used when the classifier
            predicts integer ids. Defaults to the module-level ``label_set``.
        tag_names: name of every feature column. Defaults to the module-level
            ``feature_names``.
        corpus: optional per-row token lists to show instead of the raw texts
            in the error report.

    Returns:
        The accuracy as a float in [0, 1].
    """
    if vectors is None:
        vectors = np.stack(test_data.tf.apply(ast.literal_eval).tolist())
    pred = np.asarray(clf.predict(vectors))

    # predict() yields numpy scalars, so comparing type(pred[0]) with int never
    # matches; inspect the array dtype to detect integer class ids instead.
    if np.issubdtype(pred.dtype, np.integer):
        names = label_set if label_names is None else label_names
        pred = np.array([names[class_id] for class_id in pred])

    ans = np.array(test_data.labels.tolist())

    correct = np.count_nonzero(pred == ans)
    acc = correct / len(test_data["labels"])
    print(acc)

    if show_err:
        feature_labels = feature_names if tag_names is None else tag_names
        # Plain lists are indexed by position, so the report stays aligned with
        # pred/ans even when test_data does not carry a default 0..n-1 index.
        shown = test_data["texts"].tolist() if corpus is None else corpus

        print("ans\tpred\ttexts\tfeatures")
        for idx in np.nonzero(pred != ans)[0]:
            active = [feature_labels[i] for i in np.nonzero(vectors[idx])[0]]
            print(f"{ans[idx]}\t{pred[idx]}\t{shown[idx]}\n{active}\n")

    return acc

In [7]:
# Evaluate on the test split using the feature vectors stored in its "tf"
# column, and list every misclassified utterance.
test_data = pd.read_csv("data/IVR/test.tsv", sep='\t')
# "seg" variant: evaluate on the vectors computed by get_tf in this notebook.
# vec = np.stack(data_test["tf"])
# get_prediction(clf, test_data, vec, show_err=True)
get_prediction(clf, test_data, show_err=True)

0.6733576642335767
ans	pred	texts	features
111_1	1121	信用卡可刷額度
['credit_card', 'debit_card', 'quota', 'spend']

111_1	1121	信用卡目前額度
['credit_card', 'current', 'debit_card', 'quota', 'spend']

111_1	1121	信用卡目前可用額度
['credit_card', 'current', 'debit_card', 'quota', 'remaining', 'spend']

111_1	1121	信用卡目前可使用的額度
['credit_card', 'current', 'debit_card', 'quota', 'spend']

111_1	1121	卡片目前可用額度
['credit_card', 'current', 'debit_card', 'quota', 'remaining', 'spend']

111_1	1121	卡片目前可刷額度
['credit_card', 'current', 'debit_card', 'quota', 'spend']

111_1	111_2	目前卡片可刷金額
['credit_card', 'current', 'debit_card', 'money', 'spend']

111_1	1121	目前信用卡額度
['credit_card', 'current', 'debit_card', 'quota', 'spend']

111_1	10	卡片刷卡上限
['credit_card', 'debit_card', 'spend']

111_3	121	信用卡帳務產出時間
['credit_card', 'debit_card', 'spend']

111_3	113_3	我的帳單日期
['bill', 'date', 'installment']

111_3	132_7	請問帳單關帳時間
['bill']

111_3	132_2	請問信用卡帳單周期
['bill', 'credit_card', 'debit_card', 'installment', 'spend']

111_3	113_3	請問卡片

0.6733576642335767