# 规则验证模块

## 主要任务

规则验证模块的主要任务，是对硬匹配和软匹配的结果进行二次打标，以验证规则标注是否正确。由于规则的标注并非完全准确，原本需要人工校验，但人工校验成本过高，因此使用模型代替人工进行验证。

这个模块需要一些训练语料来训练模型，同时需要上一轮的软硬匹配结果来进行二次打标。

## 具体流程

1. 模型训练
2. 模型评估
3. 模型预测 / 句子打标
4. 冲突检验


# 实验数据准备

将 sent_train 数据集中的全部句子和标签用于训练模型，sent_veri 数据集用于评估模型

利用 Word2vec 将句子转换为向量：对句中所有词的词向量取平均，作为该句子的向量

In [31]:
# Load the experiment data.
import pandas as pd

# Both CSV files are read from absolute, machine-specific paths.
train_data = pd.read_csv("/Users/xuhaoshuai/GitHub/HumanIE_IPM/data/sent_train/brother.csv")
verify_data = pd.read_csv("/Users/xuhaoshuai/GitHub/HumanIE_IPM/data/sent_veri/brother.csv")

# Extract the sentence column ('sent') from each data set.
train_sent_set = train_data['sent']
verify_sent_set = verify_data['sent']

# Tokenize every sentence by splitting it on whitespace.
def split_by_space(sentence_set):
    """Split each sentence of *sentence_set* into a list of tokens.

    Args:
        sentence_set: An iterable of strings (for example a pandas Series
            of sentences).

    Returns:
        A list with one entry per sentence; each entry is the list of
        whitespace-separated tokens produced by ``str.split()``.
    """
    return [sentence.split() for sentence in sentence_set]

# Tokenize both sentence sets with split_by_space (this replaces an earlier
# inline loop that did the same thing for the training set only).

train_sent_word_list = split_by_space(train_sent_set)
verify_sent_word_list = split_by_space(verify_sent_set)


In [44]:
# Load Word2vec and use it to build the feature vectors for the classifier.
import numpy as np
from gensim.models import Word2Vec

# Previously saved Word2vec model; sent2vec reads it through the global `model`.
model = Word2Vec.load("/Users/xuhaoshuai/GitHub/HumanIE_IPM/src/softmatch/word2vec.model")

def sent2vec(sent_word_list, w2v=None):
    """Turn tokenized sentences into averaged word-vector features.

    Each sentence vector is the mean of the Word2vec vectors of its tokens.
    Tokens missing from the vocabulary are skipped instead of raising
    ``KeyError``. A sentence with no known token (including an empty
    sentence) yields an all-zero vector rather than a NaN vector from a
    division by zero.

    Args:
        sent_word_list: Iterable of token lists, one list per sentence.
        w2v: Optional Word2vec model exposing ``.wv``. When omitted, the
            module-level ``model`` is used, so existing calls keep working.

    Returns:
        A list of sentence vectors, each a plain list of floats whose length
        equals the model's ``vector_size``.
    """
    if w2v is None:
        # Fall back to the Word2vec model loaded at module level.
        w2v = model
    keyed_vectors = w2v.wv
    # Read the dimension from the model instead of hard-coding 100.
    dim = keyed_vectors.vector_size

    X = []
    for sent_word in sent_word_list:
        word_vec_sum = np.zeros(dim)
        known_words = 0
        for word in sent_word:
            # Skip out-of-vocabulary tokens; one unknown word should not
            # abort the whole batch.
            if word in keyed_vectors:
                word_vec_sum += keyed_vectors[word]
                known_words += 1
        if known_words:
            # Average over the tokens that actually contributed a vector.
            word_vec_sum /= known_words
        X.append(word_vec_sum.tolist())
    return X

# Sentence-level feature vectors for the training and verification sets.
X_train = sent2vec(train_sent_word_list)
X_verify = sent2vec(verify_sent_word_list)

# Labels from the 'label' column, converted to NumPy arrays.
y_train = np.array(train_data['label'].tolist())
y_verify = np.array(verify_data['label'].tolist())

# Notebook display: shape of the verification label array.
y_verify.shape

(1999,)

In [45]:
# Train the SVM classifiers and evaluate them on the verification set.
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import f1_score

# The random train/test split below is disabled; the separate verification
# set (X_verify / y_verify) is used for evaluation instead.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Model training and evaluation.
# NOTE(review): both classifiers use default parameters and are fitted on the
# same X_train / y_train, so they are presumably interchangeable — confirm
# whether separate training data per matcher was intended.
hard_svm = svm.SVC()
soft_svm = svm.SVC()
hard_svm.fit(X_train, y_train)
soft_svm.fit(X_train, y_train)

# Predict labels for the verification sentences.
y_verify_hard = hard_svm.predict(X_verify)
y_verify_soft = soft_svm.predict(X_verify)


# F1 against the verification labels; average='binary' assumes the labels are
# binary — TODO confirm. The scores are stored but not displayed in this cell.
hard_f1 = f1_score(y_verify, y_verify_hard, average='binary')
soft_f1 = f1_score(y_verify, y_verify_soft, average='binary')

1999
(1999,)


In [47]:
# Prepare the hard-match / soft-match output for prediction.

# Load the hard-match and soft-match results.
pred_data_hard = pd.read_csv("/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/hard_match/pos/brother/1.csv")
pred_data_soft = pd.read_csv("/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/softmatch/brother/pos/1.csv")

# Extract the sentence column ('sent') from each result set.
pred_sent_set_hard = pred_data_hard['sent']
pred_sent_set_soft = pred_data_soft['sent']

# Tokenize the sentences on whitespace.
pred_sent_word_list_hard = split_by_space(pred_sent_set_hard)
pred_sent_word_list_soft = split_by_space(pred_sent_set_soft)

# Convert the token lists into sentence vectors.
X_pred_hard = sent2vec(pred_sent_word_list_hard)
X_pred_soft = sent2vec(pred_sent_word_list_soft)

# Notebook display of the hard-match feature vectors.
X_pred_hard

[[-0.020137305038848093,
  0.043726476175444465,
  0.1808818323271615,
  -0.0380706787109375,
  0.11038175597786903,
  -0.431908871446337,
  0.08266687712499074,
  0.39036673094545093,
  -0.2444248657141413,
  -0.25339982339314054,
  0.09780645576704826,
  -0.35914087508405956,
  -0.09815782428319965,
  0.08451564555122916,
  0.06743282851363931,
  -0.15188791735896043,
  0.09475421333419425,
  0.047908858529159,
  -0.16961682082286902,
  -0.19031275702374323,
  -0.00705515846077885,
  -0.05545891235981669,
  0.2821055129170418,
  -0.045303628713424714,
  0.1818062516727618,
  -0.002492914308926889,
  -0.10656786443931716,
  0.0424214281208281,
  -0.1333129033446312,
  -0.13013673799910716,
  0.06601778441940301,
  -0.08842433669737407,
  0.19901022953646524,
  -0.35517099340047154,
  -0.15081017091870308,
  0.13749365934303828,
  -0.014465714039813195,
  -0.12947610872132437,
  -0.10327985197571772,
  -0.08979268851024765,
  0.12575122341513634,
  -0.22742313465901784,
  -0.3486839660

In [59]:
# Predict a label for every matched sentence.
y_pred_hard = hard_svm.predict(X_pred_hard)
y_pred_soft = soft_svm.predict(X_pred_soft)

# Append the predictions to the match results as a new column.
hard_merge = pd.concat(
    [pred_data_hard, pd.Series(y_pred_hard, name='hard_svm')],
    axis=1,
)
soft_merge = pd.concat(
    [pred_data_soft, pd.Series(y_pred_soft, name='soft_svm')],
    axis=1,
)

In [60]:
# Write the merged prediction results to CSV; index=None leaves out the row index.
hard_merge.to_csv("/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/verify/merge/brother/hard/1.csv", index=None)
soft_merge.to_csv("/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/verify/merge/brother/soft/1.csv", index=None)

In [86]:
# Filter and save the rows where the match label and the SVM label disagree.
conflic_hard = hard_merge.loc[hard_merge['hard_match'].ne(hard_merge['hard_svm'])]
conflic_soft = soft_merge.loc[soft_merge['softmatch'].ne(soft_merge['soft_svm'])]

conflic_hard.to_csv(
    "/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/verify/conflic/brother/hard/1.csv",
    index=None,
)
conflic_soft.to_csv(
    "/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/verify/conflic/brother/soft/1.csv",
    index=None,
)

In [91]:
# Filter and save the rows where the match label and the SVM label agree.
consistent_hard = hard_merge.loc[hard_merge['hard_match'].eq(hard_merge['hard_svm'])]
consistent_soft = soft_merge.loc[soft_merge['softmatch'].eq(soft_merge['soft_svm'])]

consistent_hard.to_csv(
    "/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/verify/consistent/brother/hard/1.csv",
    index=None,
)
consistent_soft.to_csv(
    "/Users/xuhaoshuai/GitHub/HumanIE_IPM/output/verify/consistent/brother/soft/1.csv",
    index=None,
)