In [1]:
from sklearn import tree
import pandas as pd 
import ast
import pickle
import numpy as np
import re
from ckiptagger import WS
from collections import Counter
import jieba


In [2]:
DATASET = "IVR"
WS_SET = ["jieba", "ckip", "mix_0", "mix_1"]
WORD_SEGMENTER = WS_SET[1]

---

In [3]:
if WORD_SEGMENTER == "ckip" or WORD_SEGMENTER.find("mix") != -1:
    ws = WS("../ckiptagger/data")

def get_model_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
#     df = df.sort_values(by=["labels"], ignore_index=True)
    src_labels = sorted(set(df.labels.tolist()))
    df["labels"] = [src_labels.index(l) for l in df.labels.tolist()]
    texts = df["texts"]
    labels = df["labels"]
    num_labels = list(Counter(labels).values())
    
    data = {"corpus":[], "labels":[], "src_texts":[], "src_label":[]}
    
    for i, t in (enumerate(texts)):
        label = labels[i]

        if WORD_SEGMENTER == "ckip":
            sentence_seg = ws([t])[0]
        elif WORD_SEGMENTER == "jieba":
            sentence_seg = jieba.lcut(t)
        elif WORD_SEGMENTER == "mix_0":
            temp = ws([t])[0]
            sentence_seg = []
            for seg_t in temp:
                sentence_seg += jieba.lcut(seg_t)
        elif WORD_SEGMENTER == "mix_1":
            temp = jieba.lcut(t)
            sentence_seg = []
            for seg_t in temp:
                sentence_seg += ws([seg_t])[0]
    
        sentence_seg = [seg_t for seg_t in sentence_seg if seg_t!=' ']
#         seg_texts = ' '.join(sentence_seg)
        
        data["corpus"] += [sentence_seg]
        data["labels"] += [label]
        data["src_texts"] += [t]
        data["src_label"] += [src_labels[label]]
    return data, num_labels # Dict[List], List[Int]



In [4]:
def get_tf(corpus, data):
    with open("extend_dataset/IVR/tags.pkl", 'rb') as fp:
        tags = pickle.load(fp)

    tf_list = []
    for texts in corpus:
        tf = [0] * len(tags)
        for text in texts:
            i = 0
            for key, value in tags.items():
                value = sorted(value, key=len, reverse=True)
                value = [v for v in value if len(v)==len(text)]
                if value == []:
                    i += 1
                    continue
                big_regex = re.compile('|'.join(map(re.escape, value)))
                temp = big_regex.findall(text)
                tf[i] += len(temp)
                i += 1
        tf_list += [tf]
    data["tf"] = tf_list
    return data

In [5]:
data_train, num_labels = get_model_data(f"data/{DATASET}/train.tsv")
data_test, _ = get_model_data(f"data/{DATASET}/test.tsv")

In [6]:
data_train = get_tf(data_train["corpus"], data_train)
data_test = get_tf(data_test["corpus"], data_test)

---

In [3]:
with open("extend_dataset/IVR/tags.pkl", 'rb') as fp:
    feature_names = pickle.load(fp)
feature_names = list(feature_names.keys())

In [4]:
#seg
# vec = np.stack(data_train["tf"])

# all
# data_train = pd.read_csv("data/IVR/train.tsv", sep='\t')
# vec = np.stack(data_train["tf"].apply(ast.literal_eval).tolist())
# data_train["src_label"] = data_train["labels"]
# data_test = pd.read_csv("data/IVR/test.tsv", sep='\t')

# rule
data_train = pd.read_csv("data/IVR/class_label.tsv", sep='\t')
vec = np.stack(data_train["vectors"].apply(ast.literal_eval).tolist())
data_train["src_label"] = data_train["labels"]

clf = tree.DecisionTreeClassifier()
clf = clf.fit(vec, data_train["labels"])

# rule uni
# data_train = pd.read_csv("data/IVR/class_label.tsv", sep='\t')
# data_train["vectors"] = data_train["vectors"].apply(ast.literal_eval).tolist()
# data_train["src_label"] = data_train["labels"]

label_set = sorted(set(data_train["src_label"]))

# rule uni
# vec = []
# for l in label_set:
#     df = data_train[data_train["labels"] == l]
# #     print(l, df["vectors"].shape[0])
#     if df["vectors"].shape[0] == 0:
#         continue
#     elif df["vectors"].shape[0] == 1:
#         vec += df["vectors"].tolist()
#     else:
#         temp = np.stack(df["vectors"])
#         vec += [np.logical_or.reduce(temp)]
# vec = np.stack(vec)
# print(vec.shape)

# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(vec, label_set)

In [5]:
import graphviz 
dot_data = tree.export_graphviz(clf, out_file=None, 
                     feature_names=feature_names,  
                     class_names=label_set,  
                     filled=True, rounded=True,  
                     special_characters=True)  
graph = graphviz.Source(dot_data)
graph.render(filename='tree_all')
# graph
# tree.plot_tree(clf)

'tree_all.pdf'

In [6]:
def get_prediction(clf, test_data, vectors=None, show_err=False):
#     test_data = test_data[test_data["labels"] != "133_2"]
#     test_data = test_data[test_data["labels"] != "31_2"]
#     test_data = test_data[test_data["labels"] != "31_3"]
#     test_data = test_data[test_data["labels"] != "31_4"]
#     test_data = test_data[test_data["labels"] != "31_5"]

    if vectors is None:
        vectors = test_data.tf.apply(ast.literal_eval).tolist()
        vectors = np.stack(vectors)
    pred = clf.predict(vectors)
    
    flag = 0
    if isinstance(pred[0], np.int64):
        print("type convert")
        flag = 1
        pred = np.array([label_set[l] for l in pred])
    
    ans = test_data.labels.tolist()
    ans = np.array(ans)
    
#     print(type(pred[0]), ans)
    
    correct = np.count_nonzero(pred==ans)
    acc = correct / len(test_data["labels"])
    print(acc)

    
    if show_err:
        c = [0 for _ in range(63)]
        print("| ans | pred | texts | keyword |  |")
        print("|-|-|-|-|-|")
        if flag: # seg
            for idx in np.nonzero(pred!=ans)[0]:
                print(f"| {ans[idx]} | {pred[idx]} | {data_test['corpus'][idx]} | {[feature_names[i] for i in np.nonzero(data_test['tf'][idx])[0] ]} |  |")
        else:
            for idx in np.nonzero(pred!=ans)[0]:
                c[label_set.index(ans[idx])] += 1
                print(f"| {ans[idx]} | {pred[idx]} | {data_test['texts'][idx]} | { [feature_names[i] for i in np.nonzero(vectors[idx])[0] ] } |  |")
#                 print(vectors[idx])
        print(c, sum(c))
    return acc

In [7]:
test_data = pd.read_csv("data/IVR/test.tsv", sep='\t')
# seg
# vec = np.stack(data_test["tf"])
# get_prediction(clf, test_data, vec, show_err=True)

# all
get_prediction(clf, test_data, show_err=True)

0.44343065693430656
| ans | pred | texts | keyword |  |
|-|-|-|-|-|


NameError: name 'data_test' is not defined