### pipeline 使用

In [None]:
from pycorrector.macbert.macbert_corrector import MacBertCorrector
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
import re

# Load the model: build a MacBertCorrector from the
# "shibing624/macbert4csc-base-chinese" checkpoint name and keep its bound
# `macbert_correct` method as `nlp`, the callable used for correction below.
nlp = MacBertCorrector("shibing624/macbert4csc-base-chinese").macbert_correct





### 规则 + pipeline

In [33]:
# Load the evaluation set; the first CSV column is used as the row index.
test_data = pd.read_csv('../Data/Dataset/test_data.csv', index_col=0)
# Pull the raw sentences and their reference corrections out as plain lists.
raw_data, right_data = (
    test_data[column].tolist() for column in ('raw_data', 'right_data')
)

# 函数定义
# Translation table from full-width Chinese punctuation to ASCII punctuation.
# Built once at definition time instead of on every call. Opening AND closing
# curly quotes are covered; both map to the same straight ASCII quote.
_C_TO_E_TABLE = str.maketrans(
    '，。！？【】（）《》“”‘’：；',
    ',.!?[]()<>""\'\':;',
)


def C_trans_to_E(string):
    """Return *string* with Chinese punctuation replaced by ASCII punctuation.

    Characters handled: ， 。 ！ ？ 【 】 （ ） 《 》 “ ” ‘ ’ ： ；
    Every other character is returned unchanged.

    Args:
        string: The text to normalise.

    Returns:
        A new string of the same length with the punctuation converted.
    """
    return string.translate(_C_TO_E_TABLE)

# One pre-compiled pattern covering everything this helper strips: any
# whitespace character and the ASCII square brackets. A raw string is used so
# that `\s` reaches the regex engine instead of being an invalid str escape.
_REMOVE_PUN_PATTERN = re.compile(r'[\s\[\]]+')


def remove_pun(string):
    """Return *string* with all whitespace and ASCII square brackets removed.

    Despite the name, no other punctuation is touched: only whitespace
    (as matched by regex ``\\s``) and the characters ``[`` and ``]``.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    return _REMOVE_PUN_PATTERN.sub('', string)

# Corpus containers passed to corpus_bleu: `correct_list` holds one character
# list per corrected sentence, `right_list` holds one single-element list of
# reference character lists per sentence.
correct_list = []
right_list = []

# Walk the (raw, reference) pairs, normalising both sides the same way.
for source, target in zip(raw_data, right_data):
    # Run the corrector on the cleaned raw sentence; element 0 of its result
    # is used as the hypothesis (presumably the corrected text — confirm
    # against the pycorrector version in use).
    hypothesis = nlp(C_trans_to_E(remove_pun(source)))[0]
    correct_list.append(list(hypothesis))

    reference = C_trans_to_E(remove_pun(target))
    right_list.append([list(reference)])

# Compute character-level corpus BLEU and display it as the cell result.
bleuscore = corpus_bleu(right_list, correct_list)
bleuscore

0.7838280058893062

### 模型训练及测试

In [59]:
import sys
sys.path.append("..")
from data_loader import VariantNW
import json
import pandas as pd

# Aligner instance used when building the JSON records further down.
variantNW = VariantNW()

# Load the training set; the first CSV column is used as the row index.
train_data = pd.read_csv('../Data/Dataset/train_data.csv', index_col=0)
# Raw sentences and their reference corrections as plain lists.
raw_data, right_data = (
    train_data[column].tolist() for column in ('raw_data', 'right_data')
)
# Row index values, reused later as the per-record "id".
data_ids = train_data.index


# Pre-compiled pattern for everything `replace` strips: whitespace plus the
# ASCII square brackets. Written as a raw string so `\s` is a regex escape
# rather than an invalid string escape.
_REPLACE_STRIP_PATTERN = re.compile(r'[\s\[\]]+')


def replace(string):
    """Return *string* with all whitespace and ASCII square brackets removed.

    Args:
        string: The text to clean.

    Returns:
        The text without whitespace (regex ``\\s``) and without ``[`` / ``]``.
    """
    return _REPLACE_STRIP_PATTERN.sub('', string)

# 记录不同ID
def wrongIds(source, target):
    """Return the positions at which *source* and *target* differ.

    The two sequences are compared element by element; comparison stops at
    the end of the shorter one, so trailing elements of the longer sequence
    are never reported.

    Args:
        source: First sequence (e.g. the aligned original text).
        target: Second sequence (e.g. the aligned corrected text).

    Returns:
        A list of zero-based indices, in ascending order.
    """
    return [
        position
        for position, (left, right) in enumerate(zip(source, target))
        if left != right
    ]



# Walk every sentence pair and build one JSON record per pair.
records = []

# `data_id` (not `id`) so the builtin `id()` is not clobbered for later cells.
for data_id, source, target in zip(data_ids, raw_data, right_data):
    # Align the raw and the corrected sentence with the VariantNW helper
    # (presumably a Needleman-Wunsch style aligner, judging by the name —
    # confirm in data_loader).
    variantNW.set_seqs(source, target)
    variantNW.propagate()
    aligned_seq1, aligned_seq2 = variantNW.traceback()

    # Join the aligned tokens back into strings, rendering gap tokens as "-".
    original_text = "".join(aligned_seq1).replace("[GAP]", "-")
    correct_text = "".join(aligned_seq2).replace("[GAP]", "-")

    dic = {
        "id": data_id,
        "original_text": original_text,
        "correct_text": correct_text,
        # Positions where the two aligned strings disagree.
        "wrong_ids": wrongIds(original_text, correct_text),
    }
    records.append(dic)

# Serialise with non-ASCII characters kept readable, and echo to the cell output.
dics = json.dumps(records, indent=4, ensure_ascii=False)
print(dics)


# Save the JSON file. The encoding is pinned to UTF-8 because the payload
# contains raw Chinese text (ensure_ascii=False); relying on the platform
# default can fail or yield a non-UTF-8 file on some systems.
with open("../Data/correct_train.json", "w", encoding="utf-8") as fw:
    fw.write(dics)




# # 计算 BLEU
# bleuscore = corpus_bleu(right_list, correct_list)
# bleuscore

[
    {
        "id": 11091,
        "original_text": "权益服务：您领取的双突战法指标今天已有信号提醒啦，快登录大智慧b查看或现在打开链接h://t.v.e.cd/q使用。",
        "correct_text": "权益服务:您领取的双突战法指标今天已有信号提醒啦,快登录大智慧b查看或现在打开链接h://t.v.e.cd/q使用.",
        "wrong_ids": [
            4,
            24,
            57
        ]
    },
    {
        "id": 5082,
        "original_text": "【亲朋柒牌 】来就送68-668！牜ν   豆—帝主  扎金?更多好玩等你来 e5.q00k.com/?2963483 请点此申请",
        "correct_text": "【亲朋棋牌—】来就送68-668!牛————牛斗地主——炸金—更多好玩等你来—e5.q00k.com——2963483—请点此申请",
        "wrong_ids": [
            3,
            5,
            16,
            17,
            18,
            19,
            20,
            21,
            22,
            23,
            24,
            26,
            27,
            28,
            30,
            38,
            50,
            51,
            59
        ]
    },
    {
        "id": 3410,
        "original_text": "告诉黄某某,速速回家,明天有外访工作人员去家里核实  豆 豆 qian账— 单情况,微信：htsz6666",
        "correct_text": "告诉黄某某,速速回家,明天有外访工作人员去家里核实

In [65]:
# Scratch check: character count of a sample string. The commented-out call
# holds another variant of the same sample (presumably the raw text before
# correction/alignment — confirm).
# len("689277.com真崧 39圆.Ll3茺值就餸3%.笔bi侑—，自动到。nUv")
len("689277.com真送—39元.Ll3冲值就送3%.笔——必有,自动到.nUv")

40