In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nlp_tools import vec_tool

In [61]:
from enum import Enum
class TextTypeLabel(Enum):
    Allege = "陈述"
    Identity = "认同"
    AskForCooperation = "寻求配合"
    Quiz = "提问"
    Sigh = "感叹"
    Prologue = "开场白"
    Closure = "结束语"

In [2]:
df = pd.read_csv("/home/wujinjie/TextPoseMatch/data/data.csv", encoding="gbk").dropna()

In [3]:
df = df[0:100]


In [4]:
df[df["text"]=="你来。"]

Unnamed: 0,type,text
18,寻求配合,你来。


In [5]:
target = df[df["type"]=="认同"]
target

Unnamed: 0,type,text
15,认同,真好，读得认真，听得仔细。
16,认同,老师为你们的互帮互学，点赞。
19,认同,小老师，声音真响亮。
22,认同,字音可真准。
27,认同,真是好办法。
41,认同,你们俩的见识可真多。
65,认同,很好。
89,认同,你觉得是这样的!


In [10]:
np.mat(np.array([1.0,2.0]).astype("float64")).shape

(1, 2)

In [11]:

def cos_similar(vec1, vec2):
    return cosine_similarity(vec1, vec2)
x1 = np.mat(np.array([1.0,2.0]).astype("float64"))
x2 = np.mat(np.array([2.0,1.0]).astype("float64"))
cosine_similarity(x1, x2)

array([[0.8]])

In [12]:
cosine_similarity(np.expand_dims(vec_tool.lookup("认同"),0), np.expand_dims(vec_tool.lookup("寻求配合"), 0))

array([[0.3152114]], dtype=float32)

In [62]:
# 生成cato_dicts
cato_dicts = {}
for l in [TextTypeLabel.Identity.value, TextTypeLabel.AskForCooperation.value, TextTypeLabel.Prologue.value]:
    target = df[df["type"]==l]
    vects = []
    for line in target["text"]:
        vects.append(np.expand_dims(vec_tool.lookup(line), 0))
    cato_dicts[l] = vects

In [63]:
def knn(cato_dicts, vec):
    scores = {}
    for k,vs in cato_dicts.items():
        scores[k] = 0.0
        for v in vs:
            s = cos_similar(v, vec)
            if s > 0.8:
                scores[k] += 1.0
            # scores[k] += cos_similar(v, vec)
        # scores[k] = scores[k]/len(vs)  # todo
    return scores

In [44]:
vec = np.expand_dims(vec_tool.lookup("机械钟。"),0)
knn(cato_dicts, vec)

{'认同': 0.0, '寻求配合': 0.0}

In [7]:
non_target = df[df["type"]!="认同"]
non_target.sample(len(target))

Unnamed: 0,type,text
56,陈述,“一分钟”里的“钟”字，表示的是时间的长度。
8,陈述,孩子们，生字都会读了吗？
94,陈述,好的，大家都有了自己的想法。
23,提问,第三行，谁来领读？
55,陈述,一分钟。
87,提问,课文里，为什么元yuan只多睡了一分钟，结果就迟到了二十分钟？
7,陈述,特别要注意带颜色的，生字的读音。
50,陈述,机械钟。
85,陈述,是的，还有好多好多。


In [60]:
t = df[df["type"]=="开场白"]
t

Unnamed: 0,type,text
0,开场白,上课！
1,开场白,起立！
2,开场白,老师好！
3,开场白,同学们好！
4,开场白,请坐。


In [26]:
len(df["type"].unique())

6

In [3]:
df.groupby("type").count()

Unnamed: 0_level_0,text
type,Unnamed: 1_level_1
寻求配合,30
开场白,5
感叹,2
提问,50
认同,12
陈述,105


In [10]:
from sklearn.model_selection import train_test_split
X = df["text"]
y = df["type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)
# y_test

In [20]:
import numpy as np
d = np.array(X_train.tolist())
d.shape

(102,)

In [65]:
def recog_test_type(text: str):
    # S0. 开场白，结束语
    # S1. 认同or寻求配合
    vec = np.expand_dims(vec_tool.lookup(text),0)
    knn_res = knn(cato_dicts, vec)
    # {'认同': 0.0, '寻求配合': 0.0}
    top =  sorted(knn_res.items(), key=lambda x:x[1], reverse=True)[0]
    if top[1]>0.0:
        return top[0]
    # S2. 提问，感叹
    if text.endswith("!"):
        return TextTypeLabel.Sigh.value
    if text.endswith("?"):
        return TextTypeLabel.Quiz.value

    # S3. 陈述
    return TextTypeLabel.Allege.value

In [76]:
recog_test_type("起立！")

'开场白'