# 分句 分词

In [3]:
from ltp import LTP
# Create the LTP instance that the following cells use as the global `ltp`.
ltp = LTP()
# Split the two sample texts into sentences, then tokenize every sentence.
sents = ltp.sent_split(["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"])
# ltp.seg returns two values; only the first (the token lists) is kept here.
segment, _ = ltp.seg(sents)
# Bare expression: the notebook displays the token lists as the cell output.
segment

[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣', '。'],
 ['汤姆', '生病', '了', '。'],
 ['他', '去', '了', '医院', '。']]

In [6]:
def tokenizer(text_list):
    """Sentence-split and tokenize a list of texts with the global ``ltp``.

    Args:
        text_list: List of text strings; passed unchanged to ``ltp.sent_split``.

    Returns:
        The first value returned by ``ltp.seg`` for the split sentences
        (one token list per sentence). The second returned value is discarded.
    """
    tokens_per_sentence, _unused = ltp.seg(ltp.sent_split(text_list))
    return tokens_per_sentence

# Run tokenizer on two sample texts; the bare call makes the notebook
# display the returned token lists as the cell output.
text_list = ["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]
tokenizer(text_list)

[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣', '。'],
 ['汤姆', '生病', '了', '。'],
 ['他', '去', '了', '医院', '。']]

In [1]:
import re
import jieba

def cn_cut_sentence(text):
    """Split ``text`` at sentence-ending punctuation, keeping the delimiters.

    The pattern is wrapped in a capturing group, so ``re.split`` returns the
    matched delimiters as separate list items between the text fragments, and
    a trailing empty string when ``text`` ends with a delimiter.

    Recognised delimiters: ASCII ``.``, ``!``, ``?``; full-width ``。``, ``！``,
    ``？``; and a run of six ASCII dots (``......``) treated as one delimiter.

    Args:
        text: The string to split.

    Returns:
        list of str: Fragments and delimiters in their original order, e.g.
        ``"他去了医院。"`` -> ``["他去了医院", "。", ""]``.
    """
    # The six-dot branch must come before the single-dot branch: alternation
    # takes the first alternative that matches, so with `\.` listed first the
    # `\.{6}` alternative could never match and "......" was split into six
    # separate "." delimiters with empty fragments between them.
    # NOTE(review): a single ASCII "." is still a delimiter, so "3.14" is split
    # too — confirm whether that is intended.
    sentence_list = re.split(r'(\.{6}|\.|\!|\?|。|！|？)', text)
    return sentence_list

# Sample text: split it with cn_cut_sentence, then tokenize each piece with jieba.
text = "2021年他叫汤姆去拿外衣。汤姆生病了。他去了医院。"
sentences = cn_cut_sentence(text)
print(sentences)

# Every item of `sentences` is tokenized, including the delimiter items and
# the trailing empty string that cn_cut_sentence returns for this text.
for sent in sentences:
    # NOTE(review): jieba.cut is called without cut_all=True, so this is
    # presumably jieba's default mode rather than full mode, despite the
    # "Full Mode" label below — confirm and fix either the call or the label.
    seg_list = list(jieba.cut(sent))
    print("Full Mode: ", seg_list)  # full mode


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache


['2021年他叫汤姆去拿外衣', '。', '汤姆生病了', '。', '他去了医院', '。', '']


Loading model cost 0.863 seconds.
Prefix dict has been built successfully.


Full Mode:  ['2021', '年', '他', '叫', '汤姆', '去', '拿', '外衣']
Full Mode:  ['。']
Full Mode:  ['汤姆生', '病', '了']
Full Mode:  ['。']
Full Mode:  ['他', '去', '了', '医院']
Full Mode:  ['。']
Full Mode:  []


# 去除中英文标点符号

In [None]:
import string
from zhon.hanzi import punctuation

def remove_punctuation(words_list):
    """Return ``words_list`` without tokens made up solely of punctuation.

    Punctuation characters are taken from ``zhon.hanzi.punctuation`` (Chinese)
    and ``string.punctuation`` (ASCII).

    Args:
        words_list: Iterable of string tokens.

    Returns:
        list of str: The tokens that contain at least one non-punctuation
        character, in their original order. Empty strings are dropped.
    """
    # Compare character by character. `word not in punctuation` is a substring
    # test against the punctuation *string*, so multi-character tokens such as
    # "！！" or "..." were kept unless they happened to appear contiguously in it.
    punct_chars = set(punctuation) | set(string.punctuation)
    return [
        word for word in words_list
        if not all(ch in punct_chars for ch in word)
    ]

# 去除文本中的数字

In [None]:
def remove_numbers(words_list):
    """Drop numeric tokens from a list of words.

    A token is removed when ``str.isnumeric()`` is true for it, or when it
    contains a ``-`` immediately followed by one or more digits (e.g. "-5").

    Args:
        words_list: Iterable of string tokens.

    Returns:
        list of str: The remaining tokens, in their original order.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    # NOTE(review): the search is not anchored, so a token such as "a-12b" is
    # removed as well — confirm that this is intended.
    signed_digits = re.compile(r'-\d+')
    return [
        word for word in words_list
        if not word.isnumeric() and not signed_digits.search(word)
    ]

In [46]:
import string
from zhon.hanzi import punctuation
from ltp import LTP
# Shared LTP instance; the Custom_Segment methods below use it as the global `ltp`.
ltp = LTP()

class Custom_Segment:
    """Sentence splitter and tokenizer built on the module-level ``ltp`` instance.

    Calling an instance with a text returns the split sentences together with,
    for each sentence, its tokens with punctuation-only tokens removed.
    """

    def remove_punctuation(self, words_list):
        """Return ``words_list`` without tokens made up solely of punctuation.

        Punctuation characters are taken from ``zhon.hanzi.punctuation`` and
        ``string.punctuation``.

        Args:
            words_list: Iterable of string tokens.

        Returns:
            list of str: Tokens containing at least one non-punctuation
            character, in their original order. Empty strings are dropped.
        """
        # Compare character by character. A plain `word not in punctuation` is
        # a substring test against the punctuation *string*, which lets
        # multi-character tokens such as "！！" or "..." slip through.
        punct_chars = set(punctuation) | set(string.punctuation)
        return [
            word for word in words_list
            if not all(ch in punct_chars for ch in word)
        ]

    def sentence_segment(self, text):
        """Split one text into sentences.

        Args:
            text: A single text string; it is wrapped in a list before being
                passed to ``ltp.sent_split``.

        Returns:
            The value returned by ``ltp.sent_split``.
        """
        return ltp.sent_split([text])

    def segment(self, sents):
        """Tokenize sentences and strip punctuation tokens from each one.

        Args:
            sents: Sentences to pass to ``ltp.seg``.

        Returns:
            list of list of str: One punctuation-free token list per sentence.
        """
        # ltp.seg returns two values; only the token lists are used.
        segments, _ = ltp.seg(sents)
        return [self.remove_punctuation(words) for words in segments]

    def __call__(self, text):
        """Split ``text`` into sentences and tokenize them.

        Args:
            text: A single text string.

        Returns:
            tuple: ``(sents, segments)`` — the split sentences and the
            punctuation-free token list of each sentence.
        """
        # Reuse sentence_segment instead of repeating its body here.
        sents = self.sentence_segment(text)
        segments = self.segment(sents)
        return sents, segments


if __name__=="__main__":
    # Demo 1: call the instance on one text to get both the sentences and
    # the punctuation-free tokens of each sentence.
    text = "2021年他叫汤姆去拿《外衣》。汤姆生病了。他去了医院。"
    custom_segment = Custom_Segment()
    sents, segments = custom_segment(text)
    print(f'sents:{sents}, segments:{segments}')
    # Demo 2: segment() can also be called directly on already-split sentences.
    segments = custom_segment.segment(['2021年他叫汤姆去拿《外衣》。', '汤姆病了。', '他去了医院。'])
    print(segments)

sents:['2021年他叫汤姆去拿《外衣》。', '汤姆生病了。', '他去了医院。'], segments:[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '生病', '了'], ['他', '去', '了', '医院']]
[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣'], ['汤姆', '病', '了'], ['他', '去', '了', '医院']]


In [3]:
# Scratch check: test a sample file name for the '.xls' suffix with str.endswith.
contents_table = 'cccccccc.xls'
contents_table.endswith('.xls')

True