# 分句 分词

In [25]:
# Instantiate LTP with its default arguments; the `ltp` object is reused by later cells.
from ltp import LTP
ltp = LTP()
# sent_split receives a list of texts; the recorded output shows one flat list
# covering the sentences of all inputs (two texts -> three sentences).
sents = ltp.sent_split(["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"])
# seg returns a pair; only the first element (the per-sentence token lists) is kept.
segment, _ = ltp.seg(sents)
segment

[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣', '。'],
 ['汤姆', '生病', '了', '。'],
 ['他', '去', '了', '医院', '。']]

In [22]:
# Same input as the first demo cell, but the sentence list is printed before segmenting.
# NOTE(review): the output recorded for this cell holds a single sentence without
# its trailing "。", while the identical call in the first cell is recorded with
# three sentences. This looks like stale output from a different kernel state;
# re-run on a fresh kernel to confirm.
sents = ltp.sent_split(["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"])
print(sents)
segment, _ = ltp.seg(sents)
segment

['2021年他叫汤姆去拿外衣']


[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣']]

In [6]:
def tokenizer(text_list):
    """Sentence-split a list of texts with LTP and tokenize every sentence.

    Relies on the module-level ``ltp`` object created in an earlier cell.

    Parameters
    ----------
    text_list : list of str
        Texts handed to ``ltp.sent_split``.

    Returns
    -------
    The first element of the pair returned by ``ltp.seg`` (the token lists,
    one per sentence); the second element is discarded.
    """
    tokens_per_sentence, _ = ltp.seg(ltp.sent_split(text_list))
    return tokens_per_sentence

# Two input texts; the second one contains two sentences, and the recorded
# output shows one token list per sentence (three in total).
text_list = ["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]
tokenizer(text_list)

[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣', '。'],
 ['汤姆', '生病', '了', '。'],
 ['他', '去', '了', '医院', '。']]

In [36]:
import re
import jieba

def cut_sentence(text):
    """Split text at sentence-ending punctuation, keeping the delimiters.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        Text fragments interleaved with the delimiters that separated them
        (the pattern is a capturing group, so ``re.split`` keeps each
        delimiter as its own item). A delimiter at the very end of ``text``
        leaves a trailing empty string in the list.
    """
    # The six-dot ellipsis must come before the single-dot alternative:
    # regex alternation is tried left to right, so with `\.` first the
    # `\.{6}` branch could never match and "......" became six delimiters.
    sentence_list = re.split(r'(\.{6}|\.|\!|\?|。|！|？)', text)
    return sentence_list

text = "2021年他叫汤姆去拿外衣。汤姆生病了。他去了医院。"
sentences = cut_sentence(text)
print(sentences)

# jieba.cut() is called without cut_all=True, so it runs in jieba's default
# (accurate) mode; the printed label says so instead of claiming full mode.
for sent in sentences:
    segs = jieba.cut(sent)
    print("Default Mode: ", list(segs))  # accurate (default) mode


['2021年他叫汤姆去拿外衣', '。', '汤姆生病了', '。', '他去了医院', '。', '']
Full Mode:  ['2021', '年', '他', '叫', '汤姆', '去', '拿', '外衣']
Full Mode:  ['。']
Full Mode:  ['汤姆生', '病', '了']
Full Mode:  ['。']
Full Mode:  ['他', '去', '了', '医院']
Full Mode:  ['。']
Full Mode:  []


# 去除中英文标点符号

In [None]:
import string
from zhon.hanzi import punctuation

def remove_punctuation(words_list):
    """Drop tokens that consist solely of Chinese and/or ASCII punctuation.

    Parameters
    ----------
    words_list : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that contain at least one non-punctuation character,
        in their original order. Empty strings are dropped as well.
    """
    # Compare character by character. A plain `word not in punctuation` is a
    # substring test against one long string, so multi-character tokens such
    # as "……" or "..." slipped through because they are not contiguous
    # substrings of it.
    punct_chars = set(punctuation) | set(string.punctuation)
    new_words = [
        word for word in words_list
        if not all(char in punct_chars for char in word)
    ]
    return new_words

# 去除文本中的数字

In [None]:
def remove_numbers(words_list):
    """Drop numeric tokens from a token list.

    A token is removed when ``str.isnumeric()`` is true for it, or when it
    contains a minus sign immediately followed by digits (e.g. ``'-5'``).

    Parameters
    ----------
    words_list : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    negative_number = re.compile(r'-\d+')
    new_words = [
        word for word in words_list
        if not word.isnumeric() and not negative_number.search(word)
    ]
    return new_words

In [5]:
import string
from zhon.hanzi import punctuation
import re
import jieba

class Custom_segment:
    """Regex sentence splitter plus jieba tokenizer with punctuation and stop-word filtering.

    Parameters
    ----------
    stopwords_path : str, optional
        UTF-8 text file with one stop word per line. Defaults to
        ``'baidu_stopwords.txt'``.
    """

    def __init__(self, stopwords_path='baidu_stopwords.txt'):
        self.stopwords_path = stopwords_path
        self.stopwords = self.get_stopwords()

    def get_stopwords(self):
        """Read the stop-word file and return its stripped lines as a list."""
        with open(self.stopwords_path, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords

    def cut_sentence(self, text):
        """Split ``text`` at sentence-ending punctuation, keeping the delimiters.

        Returns a list in which text fragments alternate with the delimiters
        that separated them; a delimiter at the end of ``text`` leaves a
        trailing empty string.
        """
        # `\.{6}` has to precede `\.`: alternation is tried left to right, so
        # with the single dot first the six-dot ellipsis branch never matched.
        sentence_list = re.split(r'(\.{6}|\.|\!|\?|。|！|？)', text)
        return sentence_list

    def tokenizer(self, sent):
        """Tokenize one sentence with jieba (``cut_all=False``) and filter the tokens.

        Returns a list of tokens with punctuation-only tokens and stop words removed.
        """
        segs = jieba.cut(sent, cut_all=False)
        rem_p_segments = self.remove_punctuation(segs)
        rem_sw_segments = self.del_stopwords(rem_p_segments)
        return list(rem_sw_segments)

    def remove_punctuation(self, words):
        """Lazily drop tokens made up solely of Chinese/ASCII punctuation.

        Returns a generator over the remaining tokens.
        """
        # Character-level check: `word not in punctuation` is a substring test
        # on one long string, which let tokens such as "……" or "..." through.
        punct_chars = set(punctuation) | set(string.punctuation)
        new_words = (
            word for word in words
            if not all(char in punct_chars for char in word)
        )
        return new_words

    def del_stopwords(self, words):
        """Lazily drop tokens listed in ``self.stopwords``.

        Returns a generator over the remaining tokens.
        """
        # One hashed snapshot per call instead of a linear list scan per token.
        stopword_set = frozenset(self.stopwords)
        new_words = (word for word in words if word not in stopword_set)
        return new_words

    def __call__(self, text):
        """Split ``text`` into fragments and tokenize each of them.

        Returns
        -------
        (sents, segments)
            ``sents`` is the raw output of ``cut_sentence`` (delimiters and
            empty fragments included); ``segments`` holds only the non-empty
            token lists. The two lists are therefore NOT index-aligned.
        """
        sents = self.cut_sentence(text)
        segments = list(map(self.tokenizer, sents))
        segments = [segment for segment in segments if segment]
        return sents, segments


if __name__=="__main__":
    # Demo input: three sentences, one of them containing 《》 book-title marks.
    text = "2021年他叫汤姆去拿《外衣》。汤姆生病了。他去了医院的住院部。"
    custom_segment = Custom_segment()
    # sents keeps the delimiter and empty fragments, while segments holds only
    # the non-empty token lists (see the recorded output below).
    sents, segments = custom_segment(text)
    
    print(f'sents:{sents}, segments:{segments}')
    # tokenizer can also be called directly on a single sentence.
    segment = custom_segment.tokenizer('2021年他叫汤姆去拿《外衣》。')
    print(segment)

sents:['2021年他叫汤姆去拿《外衣》', '。', '汤姆生病了', '。', '他去了医院的住院部', '。', ''], segments:[['2021', '年', '汤姆', '去', '外衣'], ['汤姆生', '病'], ['去', '医院', '住院部']]
['2021', '年', '汤姆', '去', '外衣']


In [12]:
import string
from zhon.hanzi import punctuation
import re
from ltp import LTP
# Fresh LTP instance for this cell; it replaces any `ltp` object created earlier.
ltp = LTP()
# Company names registered with the segmenter as custom words
# (presumably so that they are kept as single tokens — verify against the LTP docs).
full_name = '江苏鸿基节能新技术股份有限公司'
short_name = '鸿基节能'
ltp.add_words(words=[full_name,short_name])

class Custom_segment:
    """LTP-based sentence splitter and tokenizer with punctuation and stop-word filtering.

    Uses the module-level ``ltp`` object for sentence splitting and segmentation.

    Parameters
    ----------
    stopwords_path : str, optional
        UTF-8 text file with one stop word per line. Defaults to
        ``'../text_process/baidu_stopwords.txt'``.
    """

    def __init__(self, stopwords_path='../text_process/baidu_stopwords.txt'):
        self.stopwords_path = stopwords_path
        self.stopwords = self.get_stopwords()

    def get_stopwords(self):
        """Read the stop-word file and return its stripped lines as a list."""
        with open(self.stopwords_path, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords

    def cut_sentence(self, text):
        """Return the result of ``ltp.sent_split`` for the single text ``text``."""
        sents = ltp.sent_split([text])
        return sents

    def tokenizer(self, sent):
        """Segment one sentence with LTP and filter the tokens.

        Returns a list of tokens with punctuation-only tokens and stop words removed.
        """
        # ltp.seg returns a pair; [0] selects the token lists and the second
        # [0] the list belonging to the only sentence passed in.
        segs = ltp.seg([sent])[0][0]
        rem_p_segments = self.remove_punctuation(segs)
        rem_sw_segments = self.del_stopwords(rem_p_segments)
        return list(rem_sw_segments)

    def remove_punctuation(self, words):
        """Lazily drop tokens made up solely of Chinese/ASCII punctuation.

        Returns a generator over the remaining tokens.
        """
        # Character-level check: `word not in punctuation` is a substring test
        # on one long string, which let tokens such as "……" or "..." through.
        punct_chars = set(punctuation) | set(string.punctuation)
        new_words = (
            word for word in words
            if not all(char in punct_chars for char in word)
        )
        return new_words

    def del_stopwords(self, words):
        """Lazily drop tokens listed in ``self.stopwords``.

        Returns a generator over the remaining tokens.
        """
        # One hashed snapshot per call instead of a linear list scan per token.
        stopword_set = frozenset(self.stopwords)
        new_words = (word for word in words if word not in stopword_set)
        return new_words

    def __call__(self, text):
        """Split ``text`` into sentences and tokenize each one.

        Returns
        -------
        (sents, segments)
            ``sents`` is the sentence list from ``cut_sentence`` and
            ``segments`` the token list of each sentence, in the same order
            (one entry per sentence, possibly empty).
        """
        sents = self.cut_sentence(text)
        segments = list(map(self.tokenizer, sents))
        return sents, segments


if __name__=="__main__":
    # Demo input: three sentences, one of them containing 《》 book-title marks.
    text = "2021年他叫汤姆去拿《外衣》。汤姆生病了。他去了医院的住院部。"
    # `custom_segment` is also used by the document-processing cells further down.
    custom_segment = Custom_segment()
    # With the LTP splitter the recorded output shows one token list per sentence.
    sents, segments = custom_segment(text)
    print(f'sents:{sents}, segments:{segments}')
    # tokenizer can also be called directly on a single sentence.
    segments = custom_segment.tokenizer('2021年他叫汤姆去拿《外衣》。')
    print(segments)

sents:['2021年他叫汤姆去拿《外衣》。', '汤姆生病了。', '他去了医院的住院部。'], segments:[['2021年', '汤姆', '去', '外衣'], ['汤姆', '生病'], ['去', '医院', '住院部']]
['2021年', '汤姆', '去', '外衣']


## Test

In [13]:
from docx import Document
import re
import multiprocessing as mp


class Docx_extract:
    """Wraps a ``docx.Document`` and exposes its non-empty body paragraphs."""

    def __init__(self, fp):
        # fp is passed straight to docx.Document.
        self.document = Document(fp)

    def get_paragraph(self):
        '''
        Iterate over the body paragraphs of the loaded document.

        A paragraph is reported when its style name is exactly 'Normal' or
        contains '正文', and its text is not blank after stripping.
        Paragraphs with any other style (e.g. headings) are skipped.

        Returns
        ----------
        out: generator
        Each element is a tuple (paragraph_number, stripped_text), where
        paragraph_number is the paragraph's index in document.paragraphs.
        '''
        for index, para in enumerate(self.document.paragraphs):
            style_name = para.style.name
            if style_name != 'Normal' and '正文' not in style_name:
                continue
            stripped = para.text.strip()
            if stripped:
                yield index, stripped
    




if __name__=="__main__":

    fp = '../file_process/docx_process/data/Template.docx'
    docx_extract = Docx_extract(fp)
    paragraph_texts = docx_extract.get_paragraph()

    def left_sents_info(fp=fp):
        """Yield (paragraph_number, sentence, tokens) for every sentence of a docx file.

        Relies on the global ``custom_segment`` object created in an earlier
        cell to split each paragraph into sentences and token lists.

        Parameters
        ----------
        fp : str, optional
            Path of the docx file; defaults to the template path defined above.
            The document is opened anew on every call.
        """
        extractor = Docx_extract(fp)
        for paragraph_number, paragraph_text in extractor.get_paragraph():
            left_sents, left_segments = custom_segment(paragraph_text)
            # Pairs each sentence with its token list; zip stops at the
            # shorter of the two sequences.
            for left_sentence, left_segment in zip(left_sents, left_segments):
                yield paragraph_number, left_sentence, left_segment

In [15]:
# Collect the token list of every sentence in the document.
# left_segment is already the result of custom_segment.tokenizer(left_sentence)
# (that is how Custom_segment.__call__ builds it), so it is reused here instead
# of running the segmenter a second time on each sentence.
L = []
for paragraph_number, left_sentence, left_segment in left_sents_info():
    L.append(left_segment)

In [16]:
# Display the collected token lists (one list per sentence).
L

[['江苏', '鸿基节能', '新', '技术', '股份', '有限公司'],
 ['住所',
  '南京',
  '浦口区',
  '高新技术',
  '开发区',
  '星火路',
  '11',
  '号',
  '动漫',
  '大厦',
  'A',
  '座',
  '9',
  '层'],
 ['首', '次', '公开', '发行', '股票', '创业板', '上市'],
 ['招股', '说明书'],
 ['申报', '稿'],
 ['保荐人', '主', '承销商'],
 ['住所', '贵州省', '贵阳市', '中华北路', '216', '号'],
 ['公司', '发行', '申请', '尚', '需', '深圳', '证券', '交易所', '中国', '证监会', '履行', '程序'],
 ['招股', '说明书', '不', '发行', '股票', '法律', '效力', '仅', '供', '预先', '披露'],
 ['投资者', '正式', '公告', '招股', '说明书', '投资', '依据'],
 ['中国',
  '证监会',
  '交易所',
  '本次',
  '发行',
  '作',
  '意见',
  '均',
  '不',
  '注册',
  '申请',
  '文件',
  '披露',
  '信息',
  '真实性',
  '准确性',
  '完整性',
  '作出',
  '保证',
  '不',
  '发行人',
  '盈利',
  '能力',
  '投资',
  '价值',
  '投资者',
  '收益',
  '作出',
  '实质性',
  '判断',
  '保证'],
 ['声明', '均', '属', '虚假', '不', '实', '陈述'],
 ['证券法',
  '股票',
  '依法',
  '发行',
  '后',
  '发行人',
  '经营',
  '收益',
  '变化',
  '发行人',
  '自行',
  '负责',
  '投资者',
  '自主',
  '判断',
  '发行人',
  '投资',
  '价值',
  '自主',
  '作出',
  '投资',
  '决策',
  '自行',
  '承担',
  '股票',
  '依法',
  '发行',
  '