# 从中文文本中抽取所有含指定关键字的语句

首先加载指定的扩展包：python-docx，re，json。

In [1]:
import re
import json
try:
    from docx import Document
except:
    !pip install python-docx
    from docx import Document

利用python-docx将docx文档读取后分段，生成段落字典。

In [2]:
def page_divider(doc_file):
    """Read a .docx document and number its non-empty paragraphs.

    Args:
        doc_file: Value handed straight to ``Document``; the callers in this
            notebook pass a path string.

    Returns:
        A ``(doc_dict, count)`` tuple. ``doc_dict`` maps the 1-based paragraph
        number (as a string) to the paragraph text with leading/trailing
        whitespace and newline characters removed; ``count`` is the number of
        entries. Paragraphs whose raw text is exactly ``''`` are skipped; a
        whitespace-only paragraph is still numbered and stored as ``''``.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape that
    # newer Python versions warn about. Compiled once instead of per paragraph.
    edge_space_or_newline = re.compile(r'^\s+|\s+$|\n')
    document = Document(doc_file)
    doc_dict = {}
    count = 0
    for par in document.paragraphs:
        if par.text != '':
            count += 1
            doc_dict[str(count)] = edge_space_or_newline.sub('', par.text)
    return doc_dict, count

利用正则表达式将各段落依照标点符号分句，并在句尾适当补充句号和引号。

In [3]:
# Sentence terminator, optionally followed by a closing quote that belongs to
# the sentence being ended.
_SENTENCE_END = re.compile('[。？！][’”』」]|。|？|！')

# (opening, closing) quote pairs, in the order closing marks are re-appended.
_QUOTE_PAIRS = (('‘', '’'), ('“', '”'), ('『', '』'), ('「', '」'))


def partext_tokenizer(partext):
    """Split a paragraph into sentences on Chinese end-of-sentence punctuation.

    The text is split on ``。``, ``？`` and ``！`` (together with a directly
    following closing quote). Each sentence is then terminated with ``。``,
    and a closing quote is appended for every quote type that has more
    opening than closing marks in that sentence.

    Args:
        partext: Paragraph text to split.

    Returns:
        A list of sentences. Empty fragments are dropped, so an empty
        paragraph yields ``[]`` and a paragraph ending in a terminator does
        not produce a trailing bare ``'。'`` entry.
    """
    phrase_list = []
    for fragment in _SENTENCE_END.split(partext):
        # re.split yields '' after a trailing terminator (and for empty
        # input); such fragments are not sentences.
        if not fragment:
            continue
        sentence = fragment + '。'
        for opening, closing in _QUOTE_PAIRS:
            if sentence.count(opening) > sentence.count(closing):
                sentence += closing
        phrase_list.append(sentence)
    return phrase_list

将所有段落分句后逐句按关键字查找，将含有关键字的语句按段落编号整理成一个字典，并根据选项输出为json文件和（或）txt文件。

In [7]:
def text_seeker(doc_path, keyword, outpath, outjson=True, outtxt=True):
    """Extract every sentence containing ``keyword`` from a .docx document.

    The document is split into paragraphs with ``page_divider`` and each
    paragraph into sentences with ``partext_tokenizer``. Matching sentences
    are grouped by paragraph number; paragraphs without a match are omitted.
    The number of paragraphs with at least one match is printed.

    Args:
        doc_path: Path of the input .docx file.
        keyword: Substring a sentence must contain to be kept.
        outpath: Output path without extension; ``.json`` / ``.txt`` is
            appended for the respective output file.
        outjson: When true, write the paragraph-number -> sentences mapping
            to ``outpath + '.json'``.
        outtxt: When true, write all matching sentences, one per line, to
            ``outpath + '.txt'``.
    """
    doc_dict, _ = page_divider(doc_path)
    out_dict = {}
    for num, partext in doc_dict.items():
        hits = [s for s in partext_tokenizer(partext) if keyword in s]
        if hits:
            out_dict[num] = hits
    if outjson:
        with open(outpath + '.json', 'w', encoding='utf-8') as g:
            # ensure_ascii=False writes the Chinese text itself rather than
            # \uXXXX escapes; the file is already opened as UTF-8.
            json.dump(out_dict, fp=g, ensure_ascii=False, indent=4)
    if outtxt:
        out_list = [s for ss in out_dict.values() for s in ss]
        with open(outpath + '.txt', 'w', encoding='utf-8') as j:
            j.write('\n'.join(out_list))
    print(len(out_dict))

设定输入文件路径、待查找关键字和输出文件路径（不含扩展名）。

In [8]:
# Search both source documents for the same keyword; each run writes
# <output stem>.json and <output stem>.txt and prints its paragraph hit count.
for source_docx, output_stem in (
    ('1. 春秋左传原文.docx', 'output'),
    ('01. 春秋左传 简体原文+译文 (1).docx', 'output2'),
):
    text_seeker(source_docx, '在', output_stem, outjson=True, outtxt=True)

345
1745
