In [2]:
### 文本比对

In [1]:
import difflib
import pandas as pd
import re

In [2]:

config = {
    "MAX_PARA_LEN":40
}
add_span = '<span class="add-text">%s</span>'
change_span = '<span class="mod-text">%s</span>'
delete_span = '<span class="del-text">%s</span>'

In [3]:
# 预处理
def E_trans_to_C(string):
    """
    中英文标点替换
    :param string:   “   “
    :return:
    """
    E_pun = u';:,.!?[]()<>"\''
    C_pun = u'；：，。！？【】（）《》“‘'
    table = {ord(f): ord(t) for f, t in zip(C_pun, E_pun)}
    return string.translate(table)

def string_filter(content):
    filter_strings = [" ", "#", "\n", "“", "”", "。"]
    
    for string in filter_strings:
        content = content.replace(string, "").strip()
    return content

def preprocess(content):
    content = E_trans_to_C(content)
    content = re.sub(r'[\n]{3,20000}', '\n\n', content)  # 换行符过滤
    content = content.replace("\n","\x02")
#     content = string_filter(content)
    return content

In [4]:
def convert_diffs(diffs):
    """
    转换diffs
    增加：(('', '\n'), (5, '\x00+能\x01'), True)              +
    修改：((49, '\x00-娱\x01'), (50, '\x00+舆\x01'), True)    ^
    删除：((53, '\x00-吧\x01'), ('', '\n'), True)             -
    不变：((2, '两'), (2, '两'), False)                       &
    :param diffs:
    :return:
    """
    froms = []
    tos = []
    flags = []
    fsflags = []
    for diff in diffs:
#         print(diff)
        if diff[2]:
            flags.append(True)
            f_char = diff[0][1]
            t_char = diff[1][1]
#             if "\x02" in f_char:
#                 fsflags.append("#")
            if "\x00-" in f_char and "\x00+" in t_char:
                fsflags.append("^")
            elif "\x00-" in f_char:
                fsflags.append("-")
            elif "\x00+" in t_char:
                fsflags.append("+")
        else:
            fsflags.append("&")
            flags.append(False)
        froms.append(diff[0][1])
        tos.append(diff[1][1])
#     print(len(froms))
    return froms,tos,fsflags,flags

In [5]:
class MyHtmlDiff(difflib.HtmlDiff):
    def __init__(self):
        super().__init__()
        self.MAX_PARA_LEN = config["MAX_PARA_LEN"]
        self.gs1 = None
        self.gs2 = None

    def _connect_char(self, str1, str2):
        """
        combine调用
        """
        str1 = str1.strip("\x01") + str2.strip("\x00+").strip("\x00^").strip("\x00-")
        return str1

    def _convert_flag(self,char,flag):
        """
        combine调用
        """
        if flag == "^":
            char = char.replace("\x00-", "\x00^").replace("\x00+", "\x00^")
        return char     

    def combine(self, diffs):
        """
        按字比对后进行合并,形成段落
        :param diffs:
        :return:
        ["",""]
        ["",""]
        [False,True]
        """
        froms, tos, fsflags, flags = convert_diffs(diffs)
        # if "\x00-" in froms[0] and "\x00+" in tos[0]:
        #     froms[0] = froms[0].replace("\x00-", "\x00^")
        #     tos[0] = tos[0].replace("\x00-", "\x00^")
        #     flags[0] = "^"
        f_paras = [froms[0]]
        t_paras = [tos[0]]
        f_flags = [flags[0]]
        for i in range(1, len(flags)):
            if len(f_paras[len(f_paras) - 1]) <= self.MAX_PARA_LEN:
                froms[i] = self._convert_flag(char=froms[i],flag=fsflags[i])
                tos[i] = self._convert_flag(char=tos[i], flag=fsflags[i])
                if fsflags[i] != fsflags[i - 1]:
                    f_paras[len(f_paras) - 1] = f_paras[len(f_paras) - 1] + froms[i]
                    t_paras[len(t_paras) - 1] = t_paras[len(t_paras) - 1] + tos[i]
                # elif i == 0:
                #     f_paras[0] = f_paras[len(f_paras) - 1] + froms[i]
                #     t_paras[0] = t_paras[len(t_paras) - 1] + tos[i]
                else:
                    # fsflags[i] == fsflags[i - 1]:
                    f_paras[len(f_paras) - 1] = self._connect_char(str1=f_paras[len(f_paras) - 1], str2=froms[i])
                    t_paras[len(t_paras) - 1] = self._connect_char(str1=t_paras[len(t_paras) - 1], str2=tos[i])

                # f_paras[len(f_paras)-1] = f_paras[len(f_paras)-1] + froms[i]
                # t_paras[len(t_paras) - 1] = t_paras[len(t_paras) - 1] + tos[i]
                f_flags[len(f_flags) - 1] &= flags[i]
            else:
                # f_paras[len(f_paras) - 1] = f_paras[len(f_paras) - 1].replace("\x01\x00-","").replace("\x01\x00^","")
                # t_paras[len(f_paras) - 1] = t_paras[len(t_paras) - 1].replace("\x01\x00+","").replace("\x01\x00^","")
                froms[i] = self._convert_flag(char=froms[i],flag=fsflags[i])
                tos[i] = self._convert_flag(char=tos[i], flag=fsflags[i])
                t_paras.append(tos[i])
                f_paras.append(froms[i])
                cur_flag = flags[i]
                f_flags.append(cur_flag)
        # f_paras[len(f_paras) - 1] = f_paras[len(f_paras) - 1].replace("\x01\x00-", "").replace("\x01\x00^", "")
        # t_paras[len(f_paras) - 1] = t_paras[len(t_paras) - 1].replace("\x01\x00+", "").replace("\x01\x00^", "")
        for i in range(len(f_flags)):
            yield ((i, f_paras[i].strip()), (i, t_paras[i].strip()), flags[i])

    def _collect_lines(self, diffs):
        """
        重写原生方法
        :param diffs:
        :return:
        """
        diffs = self.combine(diffs)
        return super()._collect_lines(diffs)

    def top_k(self,from_segs,to_segs,flag_segs,flag_str="^",k=3):
        """
        找到长度排第K的长度
        :param from_list:
        :param to_list:
        :return:
        """
        indexs = []
        from_targets = []
        to_targets = []
        targets = []
        for i in range(len(flag_segs)):
            if flag_segs[i] == flag_str:
                indexs.append([i,len(from_segs[i])])
        # indexs = sorted(indexs, key=(lambda x: x[1]))[-k:]
        indexs = sorted(indexs, key=(lambda x: -x[1]))[0:k]
        indexs = sorted(indexs, key=(lambda x: x[0]))
        for ind in indexs:
            t_ind = ind[0]
            pre_ind = t_ind - 1
            back_ind = t_ind + 1
            from_seg = from_segs[t_ind].replace("\x00-","").replace("\x00+","").replace("\x01","")
            to_seg = to_segs[t_ind].replace("\x00-","").replace("\x00+","").replace("\x01","")

            if flag_str == '+':
                to_seg = add_span % to_seg
            elif flag_str == '-':
                from_seg = delete_span % from_seg
            else:
                from_seg = change_span % from_seg
                to_seg = change_span % to_seg

            from_targets.append(from_seg) # TODO
            to_targets.append(to_seg)

            if pre_ind >=0 and flag_segs[pre_ind] == "&":
                from_targets[-1] = from_segs[pre_ind][-10:] + from_targets[-1]
                to_targets[-1] = to_segs[pre_ind][-10:] + to_targets[-1]
                pass
            if back_ind < len(flag_segs) and flag_segs[back_ind] == "&":
                from_targets[-1] =  from_targets[-1] + from_segs[back_ind][0:10]
                to_targets[-1] = to_targets[-1] + to_segs[back_ind][0:10]
        for i in range(len(from_targets)):
            targets.append({"from":from_targets[i],"to":to_targets[i]})
        return targets
        # print(target_indexs)
        # return [from_targets,to_targets]


    def get_top_k(self,froms, tos, fsflags, flags):
        """
        count_diff调用
        找到topK段落
        :param froms:
        :param tos:
        :param fsflags:
        :param flags:
        :return:
        """
        try:
            flags = fsflags
            from_segs = []
            to_segs = []
            flag_segs = []
            from_segs.append(froms[0])
            to_segs.append(tos[0])
            flag_segs.append(flags[0])
            pre_flag = flags[0]

            for i in range(1, len(flags)):
                now_page = flags[i]
                if now_page != pre_flag:
                    from_segs.append(froms[i])
                    to_segs.append(tos[i])
                    flag_segs.append(flags[i])
                else:
                    from_segs[len(from_segs)-1] = self._connect_char(from_segs[len(from_segs)-1],froms[i])
                    to_segs[len(to_segs) - 1] = self._connect_char(to_segs[len(to_segs) - 1],tos[i])
                pre_flag = now_page
            print(from_segs)
            print(to_segs)
            print(flag_segs)
            change_fromtos = self.top_k(from_segs,to_segs,flag_segs,flag_str="^",k=3)
            add_fromtos = self.top_k(from_segs, to_segs,flag_segs, flag_str="+", k=3)
            delete_fromtos = self.top_k(from_segs, to_segs,flag_segs, flag_str="-", k=3)
            return {"add": add_fromtos, "delete": delete_fromtos, "change": change_fromtos}
        except Exception as e:
            print(e)
        return {}

    def count_diff(self, text1, text2):
        """
        按字比对后统计
        :param text1:
        :param text2:
        :return:
        """
        htmldiffer = difflib.HtmlDiff()
        htmldiffer._make_prefix()
        text1, text2 = htmldiffer._tab_newline_replace(text1.replace(" ",""), text2.replace(" ",""))
        diffs = difflib._mdiff(fromlines=text1, tolines=text2)
        froms, tos, fsflags, flags = convert_diffs(diffs)
        # print(froms)
        # print(tos)
        # print(fsflags)
        # print(flags)
        top_k_from_tos = self.get_top_k(froms, tos, fsflags, flags)
        flags = fsflags
        if len(flags) == 0:
            stat = {"add": 0, "delete": 0, "change": 0}
            return stat, top_k_from_tos

        add_count = 0
        delete_count = 0
        change_count = 0
        pre_flag = flags[0]
        if pre_flag == "+":
            add_count += 1
        elif pre_flag == "-":
            delete_count += 1
        elif pre_flag == "^":
            change_count += 1
        for i in range(1, len(flags)):
            now_page = flags[i]
            if now_page == "+" and now_page != pre_flag:
                add_count += 1
            elif now_page == "-" and now_page != pre_flag:
                delete_count += 1
            elif now_page == "^" and now_page != pre_flag:
                change_count += 1
            pre_flag = now_page
        stat = {"add": add_count, "delete": delete_count, "change": change_count}
        return stat,top_k_from_tos

In [6]:
### markdown比对
m1 = open("./case4.md").read()
m2 = open("./case5.md").read()
from IPython.core.display import HTML

differ = difflib.Differ()
diff_result = differ.compare(m1.split("\n"), m2.split("\n")) 
list(diff_result)[0:7]


['- 3232假设Client端发起中断连接请求，也就是发送FIN报文。Server端接到FIN报文后，意思是说"*我Client端没有数据要发给你了*"，但是如果你还有数据没有发送完成，则不必急着关闭Socket，可以继续发送数据。所以你先发送ACK，"*告诉Client端，你的请求我收到了，但是我还没准备好，请继续你等我的消息*"。这个时候Client端就进入FIN\\_WAIT状态，继续等待Server端的FIN报文。当Server端确定数据已发送完成，则向Client端发送FIN报文，"*告诉Client端，好了，我这边数据发完了，准备好关闭连接了*"。Client端收到FIN报文后，"*就知道可以关闭连接了，但是他还是不相信网络，怕Server端不知道要关闭，所以发送ACK后进入TIME\\_WAIT状态，如果Server端没有收到ACK则可以重传*。“，Server端收到ACK后，"*就知道可以断开连接了*"。Client端等待了2MSL后依然没有收到回复，则证明*Server端已正常关闭，那好，我Client端也可以关闭连接了*。Ok，TCP连接就这样关闭了！',
 '? ^ ^\n',
 '+ 1212假设Client端发起中断连接请求，也就是发送FIN报文。Server端接到FIN报文后，意思是说"*我Client端没有数据要发给你了*"，但是如果你还有数据没有发送完成，则不必急着关闭Socket，可以继续发送数据。所以你先发送ACK，"*告诉Client端，你的请求我收到了，但是我还没准备好，请继续你等我的消息*"。这个时候Client端就进入FIN\\_WAIT状态，继续等待Server端的FIN报文。当Server端确定数据已发送完成，则向Client端发送FIN报文，"*告诉Client端，好了，我这边数据发完了，准备好关闭连接了*"。Client端收到FIN报文后，"*就知道可以关闭连接了，但是他还是不相信网络，怕Server端不知道要关闭，所以发送ACK后进入TIME\\_WAIT状态，如果Server端没有收到ACK则可以重传*。“，Server端收到ACK后，"*就知道可以断开连接了*"。Client端等待了2MSL后依然没有收到回复，则证明*Server端已正常关闭，那好，我Client端也可以关闭连接了*。Ok，TCP连接就这

In [7]:
differ = difflib.HtmlDiff()
diff_result = differ.make_file(m1.split("\n"), m2.split("\n"))
# 输出显示html
# display(HTML(diff_result))
diff_result

'\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<html>\n\n<head>\n    <meta http-equiv="Content-Type"\n          content="text/html; charset=utf-8" />\n    <title></title>\n    <style type="text/css">\n        table.diff {font-family:Courier; border:medium;}\n        .diff_header {background-color:#e0e0e0}\n        td.diff_header {text-align:right}\n        .diff_next {background-color:#c0c0c0}\n        .diff_add {background-color:#aaffaa}\n        .diff_chg {background-color:#ffff77}\n        .diff_sub {background-color:#ffaaaa}\n    </style>\n</head>\n\n<body>\n    \n    <table class="diff" id="difflib_chg_to0__top"\n           cellspacing="0" cellpadding="0" rules="groups" >\n        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>\n        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>\n        \n        <tbody>\n            <tr><td class="diff_next" id="di

### 多行比对

In [8]:
# mock data
gs1 = [{"nl":0},{"nl":0}]
gs2 = [{"nl":0},{"nl":0}]

text1="""两个可能
许吉如是在配合节目组炒热度，顺便自己虐粉，但是这种可能性很低，毕竟个人口碑受损，又被舆论绑架


第二个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）"""
text2="""三个可能


第一个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）"""

In [9]:
from IPython.core.display import HTML
html_differ = difflib.HtmlDiff()

diff_result = html_differ.make_file(text1.split("\n"), text2.split("\n")) # 带预预处理
# diff_result = my_html_differ.make_file("\x01" + text1.replace(" ", ""), "\x01" + text2.replace(" ", ""))
# 输出显示html
# diff_result
display(HTML(diff_result))

0,1,2,3,4,5
n,1,两个可能,n,1.0,三个可能
,2,许吉如是在配合节目组炒热度，顺便自己虐粉，但是这种可能性很低，毕竟个人口碑受损，又被舆论绑架,,,
,3,,,2.0,
,4,,,3.0,
t,5,第二个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）,t,4.0,第一个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


## V1.0
两个文件提取纯文本，拉成两个字符串进行比对，然后根据一定规则“分行”

In [10]:
# mock data
gs1 = [{"nl":0},{"nl":0}]
gs2 = [{"nl":0},{"nl":0}]

text1="""两个可，许吉如


是在配合节目组炒热度，顺便自己虐粉，但是这种可能性很低，毕竟个人口碑受损，又吧被娱论绑架
二维数组 可视化
第二个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）"""
text2="""两个可能，
许吉如是在配合节目组炒热度，顺便自己虐粉，但是这种可能性很低，毕竟个人口碑受损，又被舆论绑架


第二个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）"""

In [11]:
from IPython.core.display import HTML
my_html_differ = MyHtmlDiff()

diff_result = my_html_differ.make_file(preprocess("".join(text1)), preprocess("".join(text2))) # 带预预处理
# diff_result = my_html_differ.make_file("\x01" + text1.replace(" ", ""), "\x01" + text2.replace(" ", ""))
# 输出显示html
# diff_result
display(HTML(diff_result))

0,1,2,3,4,5
f,0,"两个可 , 许吉如是在配合节目组炒热度,顺便自己虐粉,但是这种可能性很低",f,0,"两个可能,许吉如 是在配合节目组炒热度,顺便自己虐粉,但是这种可能性很低"
,1,",毕竟个人口碑受损,又吧被娱论绑架二维数组 可视化第二个就是",,1,",毕竟个人口碑受损,又 被舆论绑架 第二个就是"
,2,",她被节目组玩了,当枪使也好,祭天也好,她最终背负着骂名离开了(离开当然是因为实力",,2,",她被节目组玩了,当枪使也好,祭天也好,她最终背负着骂名离开了(离开当然是因为实力"
t,3,"问题,但是并没有很洒脱的退出,这个观众都懂得)",t,3,"问题,但是并没有很洒脱的退出,这个观众都懂得)"

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


In [12]:
a = "<table><tr><td>12</td><td>\x02443</td></tr><tr><td>12</td><td>\x0234</td></tr></table>"
display(HTML(a))

0,1
12,443
12,34


### 比对信息统计

In [13]:
diff_stat = my_html_differ.count_diff(text1,text2)
diff_stat

['两个可', '\n', '，', '\n', '许吉如', '\x00-   \x01', '是在配合节目组炒热度，顺便自己虐粉，但是这种可能性很低，毕竟个人口碑受损，又', '\x00-吧\x01', '被', '\x00-娱\x01', '论绑架', '\x00-二\x01', '\x00-维数组可视化\x01', '第二个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）']
['两个可', '\x00+能\x01', '，', '\x00+ \x01', '许吉如', '\n\n\n', '是在配合节目组炒热度，顺便自己虐粉，但是这种可能性很低，毕竟个人口碑受损，又', '\n', '被', '\x00+舆\x01', '论绑架', '\x00+ \x01', '\n\n\n\n\n\n', '第二个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）']
['&', '+', '&', '+', '&', '-', '&', '-', '&', '^', '&', '^', '-', '&']


({'add': 2, 'delete': 3, 'change': 2},
 {'add': [{'from': '两个可\n，', 'to': '两个可<span class="add-text">能</span>，'},
   {'from': '，\n许吉如', 'to': '，<span class="add-text"> </span>许吉如'}],
  'delete': [{'from': '许吉如<span class="del-text">   </span>是在配合节目组炒热度',
    'to': '许吉如\n\n\n是在配合节目组炒热度'},
   {'from': '毕竟个人口碑受损，又<span class="del-text">吧</span>被',
    'to': '毕竟个人口碑受损，又\n被'},
   {'from': '<span class="del-text">维数组可视化</span>第二个就是，她被节目',
    'to': '\n\n\n\n\n\n第二个就是，她被节目'}],
  'change': [{'from': '被<span class="mod-text">娱</span>论绑架',
    'to': '被<span class="mod-text">舆</span>论绑架'},
   {'from': '论绑架<span class="mod-text">二</span>',
    'to': '论绑架<span class="mod-text"> </span>'}]})

### 比对结果还原原格式
问题：两两比对的文件本身包含格式，格式不参与比对，只进行字符串比对，比对结果丢失原文格式信息，譬如段落（换行）,....

解决方案：
- 比对前保存原文格式
- 比对后将原文的格式加到比对结果上

todo 比对会做预处理，去除某些字符

### 比对结果格式优化

In [14]:

from bs4 import BeautifulSoup
def prettify_diff_result(diff_result):
    style = """
        *{
         margin:0
         padding:0;
        }
        .diff{
         display:none;
        }
        table.diff{
         margin:0 auto;
         display: block;
        }

        tbody{
            font-size: 14px;
            color: #333333; 
        }
        .diff_add{
                background-color:#97D8FF;
        }
        .diff_sub{
         background-color:#FF9696;
        }
        .diff_chg{
         background-color:#FFCD7A;
        }
    """
    # TODO
    soup = BeautifulSoup(diff_result)
    # soup.find(name='head').find(name="style").string = style
    # soup.find(name='table', attrs={'summary': 'Legends'}).clear()
    table = soup.find(name='table', attrs={'class': 'diff'})
    for tr in table.find_all('tr'):
        tr.find('td').extract()
        tr.find('td', attrs={'class': 'diff_next'}).clear()
    # return soup.find(name='table', attrs={'class': 'diff'}).prettify()
    return str(table)
display(HTML(prettify_diff_result(diff_result)))

0,1,2,3,4
0,"两个可 , 许吉如是在配合节目组炒热度,顺便自己虐粉,但是这种可能性很低",,0,"两个可能,许吉如 是在配合节目组炒热度,顺便自己虐粉,但是这种可能性很低"
1,",毕竟个人口碑受损,又吧被娱论绑架二维数组 可视化第二个就是",,1,",毕竟个人口碑受损,又 被舆论绑架 第二个就是"
2,",她被节目组玩了,当枪使也好,祭天也好,她最终背负着骂名离开了(离开当然是因为实力",,2,",她被节目组玩了,当枪使也好,祭天也好,她最终背负着骂名离开了(离开当然是因为实力"
3,"问题,但是并没有很洒脱的退出,这个观众都懂得)",,3,"问题,但是并没有很洒脱的退出,这个观众都懂得)"


## V2.0
预处理阶段保留"\n",拉成两个字符串比对后，按照\n进行“分行”

- (1) 按字: 比较通用,不同格式的可以比较
- (2) 按行: word间比较

In [15]:
class NewHtmlDiff(MyHtmlDiff):
    def __init__(self):
        super().__init__()
        self.MAX_PARA_LEN = config["MAX_PARA_LEN"]
        self.gs1 = None
        self.gs2 = None
        
    def combine_by_format(self,diffs):
        """
        按字比对后进行合并,考虑原文格式
        :param diffs:
        :return:
        """
        froms, tos, fsflags, flags = convert_diffs(diffs)
        f_paras = [froms[0]]
        t_paras = [tos[0]]
        f_flags = [flags[0]]
        # fs_flags = [fsflags[0]]
        for i in range(1, len(flags)):
            if len(f_paras[len(f_paras) - 1]) <= self.MAX_PARA_LEN:
                froms[i] = self._convert_flag(char=froms[i],flag=fsflags[i])
                tos[i] = self._convert_flag(char=tos[i], flag=fsflags[i])
                if fsflags[i] != fsflags[i - 1]:
                    f_paras[len(f_paras) - 1] = f_paras[len(f_paras) - 1] + froms[i]
                    t_paras[len(t_paras) - 1] = t_paras[len(t_paras) - 1] + tos[i]
                # elif i == 0:
                #     f_paras[0] = f_paras[len(f_paras) - 1] + froms[i]
                #     t_paras[0] = t_paras[len(t_paras) - 1] + tos[i]
                else:
                    # fsflags[i] == fsflags[i - 1]:
                    f_paras[len(f_paras) - 1] = self._connect_char(str1=f_paras[len(f_paras) - 1], str2=froms[i])
                    t_paras[len(t_paras) - 1] = self._connect_char(str1=t_paras[len(t_paras) - 1], str2=tos[i])

                # f_paras[len(f_paras)-1] = f_paras[len(f_paras)-1] + froms[i]
                # t_paras[len(t_paras) - 1] = t_paras[len(t_paras) - 1] + tos[i]
                f_flags[len(f_flags) - 1] &= flags[i]
            else:
                # f_paras[len(f_paras) - 1] = f_paras[len(f_paras) - 1].replace("\x01\x00-","").replace("\x01\x00^","")
                # t_paras[len(f_paras) - 1] = t_paras[len(t_paras) - 1].replace("\x01\x00+","").replace("\x01\x00^","")
                froms[i] = self._convert_flag(char=froms[i],flag=fsflags[i])
                tos[i] = self._convert_flag(char=tos[i], flag=fsflags[i])
                t_paras.append(tos[i])
                f_paras.append(froms[i])
                cur_flag = flags[i]
                f_flags.append(cur_flag)
        # f_paras[len(f_paras) - 1] = f_paras[len(f_paras) - 1].replace("\x01\x00-", "").replace("\x01\x00^", "")
        # t_paras[len(f_paras) - 1] = t_paras[len(t_paras) - 1].replace("\x01\x00+", "").replace("\x01\x00^", "")
#         print(f_paras)
#         print(t_paras)
#         print(flags)
        for i in range(len(f_flags)):
            yield ((i, f_paras[i].strip()), (i, t_paras[i].strip()), flags[i])
        
        
    def format_diff(self,fromlines, tolines, fromdesc='', todesc='',context=False, numlines=5, *, charset='utf-8'):
        self._make_prefix()

        # change tabs to spaces before it gets more difficult after we insert
        # markup
        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)

        # create diffs iterator which generates side by side from/to data
        if context:
            context_lines = numlines
        else:
            context_lines = None
        diffs = difflib._mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
                      charjunk=self._charjunk)
#         for dif in diffs:
#             print(dif)
        # set up iterator to wrap lines that exceed desired width
        if self._wrapcolumn:
            diffs = self._line_wrapper(diffs)
        froms, tos, fsflags, flags = convert_diffs(diffs)
        f_paras = [froms[0]]
        t_paras = [tos[0]]
        f_flags = [flags[0]]
#         print(fsflags)
        for i in range(1, len(flags)):
            if fsflags[i] == "^":
                froms[i] = froms[i].replace("\x00-","\x00^")
                tos[i] = tos[i].replace("\x00+","\x00^")
            if fsflags[i] in ["-","&","^"]:
                if "\x02" in froms[i]:
                    f_paras.append("")
                else:
                    f_paras[len(f_paras) - 1] = f_paras[len(f_paras) - 1] + froms[i]
            if fsflags[i] in ["+","&","^"]:
                if "\x02" in tos[i]:
                    t_paras.append("")
                else:
                    t_paras[len(t_paras) - 1] = t_paras[len(t_paras) - 1] + tos[i]
        return (f_paras,t_paras)

In [16]:
text1="""两个可能
许吉如是在配合节目组炒热度，顺便自己虐粉，但是这种可能性很低，毕竟个人口碑受损，
第二个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，
这个观众都懂得）"""
text2="""三个可能
第一个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）"""

In [17]:
text1="""三个可能
第一个就是，她被节目组玩了，
当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）"""
text2="""三个可能
第一个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）"""

In [18]:
new_html_differ = NewHtmlDiff()
# fs,ts = new_html_differ.format_diff(preprocess("".join(text1)), preprocess("".join(text2))) # 带预预处理
fs,ts = new_html_differ.format_diff(text1.split("\n"), text2.split("\n")) # 带预预处理
# print(fs)
# print(ts)
for f in fs:
    print(f)
print("------")
for t in ts:
    print(t)

三个可能 -第一个就是，她被节目组玩了，
------
三个可能 +第一个就是，她被节目组玩了，当枪使也好，祭天也好，她最终背负着骂名离开了（离开当然是因为实力问题，但是并没有很洒脱的退出，这个观众都懂得）


In [19]:
m1 = open("./case4.md").read()
m2 = open("./case5.md").read()
from IPython.core.display import HTML
new_html_differ = NewHtmlDiff()
# differ = difflib.Differ()
diff_result = new_html_differ.format_diff(m1.split("\n"), m2.split("\n")) 
list(diff_result)[0:7]


# print(fs)
# print(ts)
for f in fs:
    print(f)
print("------")
for t in ts:
    print(t)

IndexError: list index out of range

In [20]:
table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

display(HTML(table_template))

In [3]:
cs = '-二\x01\x00-维\x01\x00+数\x01\x00-组\x01\x00- \x01\x00-可\x01\x00-视\x01\x00-化\x01'
def connect(cs):
    ss = []
    cur = ""
    for c in cs:
        if c in ["-","+","^"]:
            if c == cur:
                ss[-1]= ""
                ss[-2] = ""
            else:
                ss.append(c)
                cur = c
        else:
            ss.append(c)
    return "".join(ss)
connect(cs)

'-二维\x01\x00+数\x01\x00-组 可视化\x01'

### <center>双输出</center>

In [22]:
import re
import zipfile
from xml.etree.cElementTree import XML

In [82]:
W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"  # Name Space
W_BODY = W + "body"
W_P = W + "p"
W_R = W + "r"
W_T = W + "t"
W_PR = W + "pPr"
W_JC = W + "jc"
W_IND = W + "ind"
W_FIRSTLINE = W + "firstLine"
W_B = W + "b"
W_I = W + "i"
W_RPR = W + "rPr"
W_FONTS = W + "rFonts"
W_HANSI = W + "hAnsi"
W_EASTASIA = W + "eastAsia"
W_CS = W + "cs"
W_ASCII = W + "ascii"
W_SZ = W + "sz"
W_SZCS = W + "szCs"
W_VAL = W + "val"
W_TBL = W + "tbl"
W_TR = W + "tr"
W_TC = W + "tc"

pattern_chinese_char = re.compile(u"[\u4e00-\u9fa5]")
def build_paragraph(p_node):
    paragraph = dict()
    paragraph["index"] = -1
    paragraph["type"] = "text"
    paragraph["indentation_first_line"] = None
    paragraph["horizon_alignment"] = ""
    paragraph["bold"] = False
    paragraph["italic"] = False
    paragraph["font_size"] = None
    paragraph["font_name"] = None
    paragraph["pattern_type"] = None
    paragraph["text"] = ""
    paragraph["is_table"] = False
    paragraph["is_attachment"] = False
    paragraph["level"] = 500
    paragraph["is_title"] = False

    r_list = p_node.findall(W_R)
    bold_count = 0
    italic_count = 0
    for i, r in enumerate(r_list):
        r_t_list = r.findall(W_T)
        if r_t_list:
            assert len(r_t_list) == 1
            r_t = r_t_list[0]
            if not r_t.text:
                continue
            italic_count += 1
            if r_t.text.strip():
#                 print(r_t.text)
#             if r_t.text:
                paragraph["text"] += r_t.text.strip().replace(" ","")
            if re.search(pattern_chinese_char, r_t.text):
                r_pr = r.find(W_RPR)
                if r_pr is not None and not paragraph["font_name"]:
                    r_pr_font = r_pr.find(W_FONTS)
                    if r_pr_font is not None:
                        if W_HANSI in r_pr_font.attrib:
                            paragraph["font_name"] = r_pr_font.attrib[W_HANSI]
                        elif W_EASTASIA in r_pr_font.attrib:
                            paragraph["font_name"] = r_pr_font.attrib[W_EASTASIA]
                        elif W_CS in r_pr_font.attrib:
                            paragraph["font_name"] = r_pr_font.attrib[W_CS]
                        elif W_ASCII in r_pr_font.attrib:
                            paragraph["font_name"] = r_pr_font.attrib[W_ASCII]
                if r_pr is not None and not paragraph["font_size"]:
                    r_pr_font_size = r_pr.find(W_SZCS)
                    if r_pr_font_size is not None:
                        paragraph["font_size"] = float(r_pr_font_size.attrib[W_VAL])
                    else:
                        r_pr_font_size = r_pr.find(W_SZ)
                        if r_pr_font_size is not None:
                            paragraph["font_size"] = float(r_pr_font_size.attrib[W_VAL])
        if bold_count == 0 and r.findall(".//" + W_B):
            paragraph["bold"] = True
        if italic_count == 0 and r.findall(".//" + W_I):
            paragraph["italic"] = True
        bold_count += 1

    p_pr = p_node.find(W_PR)
    if p_pr is not None:
        p_pr_ind = p_pr.find(W_IND)
        if p_pr_ind is not None:
            if W_FIRSTLINE in p_pr_ind.attrib:
                paragraph["indentation_first_line"] = float(p_pr_ind.attrib[W_FIRSTLINE])
        p_pr_alignment = p_pr.find(W_JC)
        if p_pr_alignment is not None:
            if p_pr_alignment.attrib[W_VAL] == "center":
                paragraph["horizon_alignment"] = "center"
    return paragraph

def build_table_paragraph(node):
    paragraph = dict()
    paragraph["index"] = -1
    paragraph["type"] = "table"
    paragraph["indentation_first_line"] = None
    paragraph["horizon_alignment"] = ""
    paragraph["bold"] = False
    paragraph["italic"] = False
    paragraph["font_size"] = None
    paragraph["font_name"] = None
    paragraph["pattern_type"] = None
    paragraph["text"] = ""
    paragraph["is_table"] = True
    paragraph["is_attachment"] = False
    paragraph["level"] = 500
    paragraph["is_title"] = False

    table_trs = node.findall(".//" + W_TR)
    table_html = "<table>"
    table_text = ""
    for tr in table_trs:
        table_html += "<tr>"
        tr_tcs = tr.findall(".//" + W_TC)
        for tc in tr_tcs:
            table_html += "<td>"
            tc_runs = tc.findall(".//" + W_R)
            for i, r in enumerate(tc_runs):
                r_t_list = r.findall(W_T)
                if r_t_list:
                    assert len(r_t_list) == 1
                    r_t = r_t_list[0]
                    print(r_t.text)
                    if not r_t.text.strip():
                        continue
                    if r_t.text.strip():
#                         table_html += r_t.text.strip()
                        table_text += r_t.text.strip() + "\x05"
#                         table_text += r_t.text.strip()
                        table_html += "%s"
#             table_text += "\x05"
            table_html += "</td>"
        table_html += "</tr>"
        
    table_html += "</table>"
    paragraph["text"] = table_text
    paragraph["trtd"] = table_html
    return paragraph

In [83]:
# docx 解析出段落及对应格式
def get_paraofdocx(doc_path):
    file =  open(doc_path, "rb")
    font_sizes = []
    font_names = []
    text_paras = []
    paragraphs = []
    zip_file = zipfile.ZipFile(file)
    document_xml = zip_file.read("word/document.xml")
    zip_file.close()
    tree = XML(document_xml)
    nodes = []
    for body in tree.getiterator(W_BODY):
        nodes = body.findall("*")
        break
    count = 0
    for node in nodes:
        if node.tag in [W_P,W_TBL]:
            if node.tag == W_P:
                paragraph = build_paragraph(node)
            elif node.tag == W_TBL:
                paragraph = build_table_paragraph(node)
            paragraph["index"] = count
            count += 1
            paragraphs.append(paragraph)
#             font_sizes.append(paragraph["font_size"])
#             font_names.append(paragraph["font_name"])
            text_paras.append(paragraph["text"])
    return paragraphs,text_paras,font_names,font_sizes
# self.text = self.text.strip()


In [84]:
demo1 = [((6, '\x00- \x01'), (6, '\x00+NEW\x01'), True), (('', '\n'), (7, '\x00+NEW\x01'), True), ((7, '保险代理合\x00-作协\x01议'), (8, '保险代理合议'), True)]

        

def left_spearte(item):
    l = item[0][1]
    r = item[1][1]
    if "\x00-" in l and "\00+" in r:
        l = l.replace("\x00-","\x04").replace("\x01","\x04")
    elif "\x00-" in l:
        l = l.replace("\x00-","\x03").replace("\x01","\x03")
    elif "\x00+" in l:
        l = l.replace("\x00+","\x02").replace("\x01","\x02")
    if "\x05" in l:
        l = "\x05"
    return (item[0][0]-1,l)

def right_spearte(item):
    l = item[0][1]
    r = item[1][1]
    if "\x00-" in l and "\00+" in r:
        r = r.replace("\x00+","\x04").replace("\x01","\x04")
    elif "\x00-" in r:
        r = r.replace("\x00-","\x03").replace("\x01","\x03")
    elif "\x00+" in r:
        r = r.replace("\x00+","\x02").replace("\x01","\x02")
    if "\x05" in r:
        r = "\x05"
    return (item[1][0]-1,r)

def left_isok(item):
    return item[0][0] != ""
def right_isok(item):
    return item[1][0] != ""

# list(map(left_spearte,(filter(left_isok,demo1))))
list(map(right_spearte,(filter(right_isok,demo1))))

[(5, '\x04NEW\x04'), (6, '\x02NEW\x02'), (7, '保险代理合议')]

In [85]:
class DocxDiff(MyHtmlDiff):
    def __init__(self):
        super().__init__()
        self.MAX_PARA_LEN = config["MAX_PARA_LEN"]
        
    def format_diff(self,fromlines, tolines, fromdesc='', todesc='',context=False, numlines=5, *, charset='utf-8'):
        self._make_prefix()

        # change tabs to spaces before it gets more difficult after we insert
        # markup
        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)

        # create diffs iterator which generates side by side from/to data
        if context:
            context_lines = numlines
        else:
            context_lines = None
        diffs = difflib._mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
                      charjunk=self._charjunk)
        
        return list(diffs)
#         for dif in diffs:
#             print(dif)
#     def merge_left(self,item,):
#         """
#         ((1, ''), (1, ''), False)
#         (0, '')
#         (0, '')
#         """
#         left = item[0]
#         for i in left[1]:
            
#         return (left[0]-1,)
        
            

In [2]:
paragraphs1,text_paras1,font_names1,font_sizes1 = get_paraofdocx("./case3.docx")
paragraphs2,text_paras2,font_names2,font_sizes2 = get_paraofdocx("./case4.docx")
# paragraphs2
# len(text_paras1[10])
# paragraphs1[0:10]

NameError: name 'get_paraofdocx' is not defined

In [37]:
def get_lenofline(text_paras):
    """
    获取每个段落的长度
    """
    lenoflines = []
    return [len(text_paras[i]) for i in range(len(text_paras))]
right_lens = get_lenofline(text_paras2)
left_lens = get_lenofline(text_paras1)
# left_lens

In [38]:
import json
differ = DocxDiff()
# 行List对比
# diff_result = differ.format_diff(json.loads(json.dumps(text_paras1)), 
#                                  json.loads(json.dumps(text_paras2))
#                                 ) 

# 字string对比
diff_result = differ.format_diff("".join(text_paras1), 
                                 "".join(text_paras2)
                                ) 
# diff_result[10:30]
right_diffs = list(map(right_spearte,(filter(right_isok,diff_result))))
left_diffs = list(map(left_spearte,(filter(left_isok,diff_result))))
# right_diffs
# right_diffs = list(map(right_spearte,(filter(right_isok,diff_result))))
# right_diffs
# right_diffs[0][1][-3]
# list(map(left_spearte,(filter(left_isok,diff_result))))
# for f in fs:
#     print(f)
# print("------")
# for t in ts:
#     print(t)

In [69]:
from functools import reduce
def mergeline(item0,item1):
    left = item0[1]
    right = item1[1]
#     print(left[-1])
#     print(right[0])
#     for 
    if len(right)>1 and len(left)>1:
#         if left[-1] == right[0] and left[-2]!="\x05" and right[1]!="\x05":
        if left[-1] == right[0]:
            return (0,left[0:-1]+right[1:])
    return (0,left+right)


chgs = """<span class="mod-text">"""
adds = """<span class="add-text">"""
dels = """<span class="del-text">"""
ends = """</span>"""

# def process_table(formats,lines):
#     for i in lines

def xuanran(line):
    new_line = []
    cur = 0
    for c in line:
        if c in ["\x02","\x03","\x04"]:
            if c != cur:
                cur = c
                c = c.replace("\x02",adds).replace("\x03",dels).replace("\x04",chgs)
            else:
                c = ends
                cur = 0
#         if c == "\x05":
#             c = "|"
        new_line.append(c)
    return "".join(new_line)
            
def format_merge_byline(lenoflines,diffs,formats=None):
    start = 0
    new_lines = []
    for index in range(len(lenoflines)):
        cur_line_diffs = diffs[start:start+lenoflines[index]]
        if len(cur_line_diffs)>0:
            cur_line = reduce(mergeline, cur_line_diffs)[1]
            cur_line = cur_line.replace("\x02\x02","\x02").replace("\x03\x03","\x03").replace("\x04\x04","\x04")
        else:
            cur_line = ""
#         print(cur_line) 
        start += lenoflines[index]
#         print(formats[index])
        if formats and formats[index]["is_table"]:
            cur_line = formats[index]["trtd"] % tuple(cur_line.split("\x05")[0:-1])
        new_lines.append(cur_line)

    return list(map(xuanran,new_lines))

In [1]:
# format_merge_byline(right_lens,right_diffs)
format_merge_byline(left_lens,left_diffs,paragraphs1)

### todo
1. 返回结果需要重新生成html，两边行数不一样

In [21]:
!jupyter nbconvert --to html --template full text_diff.ipynb

[NbConvertApp] Converting notebook text_diff.ipynb to html
[NbConvertApp] Writing 372350 bytes to text_diff.html


In [None]:
"""
+:   \x02 \x02
-:   \x03 \x03
^:   \x04 \x04
"""


def mergeflag(line):
#     cs = reversed(list(line))
#     cs.index("\x00-"),cs.index("\x00+"),cs.index("\x00^")
    keeps = [1]
    cur = ""
    for i in range(1,len(line)-1): # -2:0
        cc = line[i:i+2]
        pre = line[i-1]
        if cc in ["\x00-","\x00+","\x00+"]:
            if cur == cc: # 
                keeps.append(0)
            cur = cc
        keeps.append(1)
    keeps.append(1)
    return keeps
    
line = "\x02N\x02\x02E\x02\x02W\x02"
print(list(line))
mergeflag(line)

In [77]:
from IPython.core.display import HTML
hh = """
<table><tr><td><span class="del-text">项目</span></td><td><span class="del-text">项目准备阶段</span></td><td><span class="del-text">项目设计阶段</span></td><td><span class="del-text">项目开发阶段</span></td><td><span class="del-text">项目测试阶段</span></td><td><span class="del-text">项目验收</span></td></tr><tr><td><span class="del-text">项目1</span></td><td></td><td></td><td></td><td></td><td></td></tr><tr><td><span class="del-text">项目二</span></td><td></td><td></td><td></td><td></td><td></td></tr><tr><td><span class="del-text">序号</span></td><td><span class="del-text">需求名称</span></td><td><span class="del-text">测试人员</span></td><td><span class="del-text">测试结果</span></td></tr><tr><td></td><td></td><td></td><td><span class="del-text">测试总轮次</span></td><td><span class="del-text">执行用例总条数</span></td><td><span class="del-text">用例通过总条数</span></td><td><span class="del-text">遗留缺陷系数</span></td><td><span class="del-text">是否通过</span></td></tr><tr><td><span class="del-text">1</span></td><td><span class="del-text">cyprex1.7.9</span></td><td><span class="del-text">何秦川</span></td><td><span class="del-text">3</span></td><td><span class="del-text">72</span></td><td><span class="del-text">72</span></td><td><span class="del-text">0</span></td><td><span class="del-text">是</span></td></tr><tr><td><span class="del-text">2</span></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr></table>
"""
HTML(hh)

0,1,2,3,4,5,6,7
项目,项目准备阶段,项目设计阶段,项目开发阶段,项目测试阶段,项目验收,,
项目1,,,,,,,
项目二,,,,,,,
序号,需求名称,测试人员,测试结果,,,,
,,,测试总轮次,执行用例总条数,用例通过总条数,遗留缺陷系数,是否通过
1,cyprex1.7.9,何秦川,3,72,72,0,是
2,,,,,,,
