# 分句 分词

In [25]:
from ltp import LTP
# Create the LTP instance with its default constructor arguments; later cells reuse `ltp`.
ltp = LTP()
# Split the two input strings into individual sentences.
sents = ltp.sent_split(["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"])
# seg() returns a pair; only its first element (the per-sentence word lists) is kept.
segment, _ = ltp.seg(sents)
# Bare expression so the notebook displays the segmentation result.
segment

[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣', '。'],
 ['汤姆', '生病', '了', '。'],
 ['他', '去', '了', '医院', '。']]

In [22]:
# Split the inputs into sentences and print the intermediate list before segmenting.
# NOTE(review): the recorded output of this cell shows a single sentence without its
# final "。", which does not match this input -- it looks stale; re-run to confirm.
sents = ltp.sent_split(["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"])
print(sents)
# Keep only the first element of the pair returned by seg().
segment, _ = ltp.seg(sents)
# Bare expression so the notebook displays the segmentation result.
segment

['2021年他叫汤姆去拿外衣']


[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣']]

In [6]:
def tokenizer(text_list):
    """Sentence-split the given texts and segment every sentence into words.

    Uses the module-level ``ltp`` instance.

    Args:
        text_list: List of strings handed to ``ltp.sent_split``.

    Returns:
        The first element of the pair returned by ``ltp.seg`` for the split
        sentences (in the recorded example: one list of words per sentence).
    """
    # The second element returned by seg() is not needed here.
    words_per_sentence, _ = ltp.seg(ltp.sent_split(text_list))
    return words_per_sentence

# Example input: two strings, the second of which contains two sentences.
text_list = ["2021年他叫汤姆去拿外衣。", "汤姆生病了。他去了医院。"]
# Bare call so the notebook displays the per-sentence word lists.
tokenizer(text_list)

[['2021年', '他', '叫', '汤姆', '去', '拿', '外衣', '。'],
 ['汤姆', '生病', '了', '。'],
 ['他', '去', '了', '医院', '。']]

In [36]:
import re
import jieba

def cut_sentence(text):
    """Split *text* at sentence-ending punctuation, keeping the delimiters.

    Recognised delimiters are a six-dot ellipsis (``......``) and the single
    characters ``.``, ``!``, ``?``, ``。``, ``！``, ``？``.

    Args:
        text: The string to split.

    Returns:
        A list alternating between text fragments and the delimiters that
        followed them. Because the pattern uses a capturing group, each
        delimiter is its own list item, and a text that ends with a delimiter
        yields a trailing empty string.
    """
    # The ellipsis alternative must be tried BEFORE the single dot; otherwise
    # the single dot always wins and "......" is split into six delimiters
    # with empty strings in between.
    sentence_list = re.split(r'(\.{6}|[.!?。！？])', text)
    return sentence_list

text = "2021年他叫汤姆去拿外衣。汤姆生病了。他去了医院。"
# Split into fragments; delimiters come back as separate items because the
# splitting regex uses a capturing group.
sentences = cut_sentence(text)
print(sentences)

# Segment every fragment with jieba, including delimiter-only and empty fragments.
for sent in sentences:
    segs = jieba.cut(sent)
    # NOTE(review): the label says "Full Mode", but cut() is called without
    # cut_all=True -- presumably this runs jieba's default (accurate) mode;
    # confirm and relabel if so.
    print("Full Mode: ", list(segs))  # full mode


['2021年他叫汤姆去拿外衣', '。', '汤姆生病了', '。', '他去了医院', '。', '']
Full Mode:  ['2021', '年', '他', '叫', '汤姆', '去', '拿', '外衣']
Full Mode:  ['。']
Full Mode:  ['汤姆生', '病', '了']
Full Mode:  ['。']
Full Mode:  ['他', '去', '了', '医院']
Full Mode:  ['。']
Full Mode:  []


# 去除中英文标点符号

In [None]:
import string
from zhon.hanzi import punctuation

def remove_punctuation(words_list):
    """Drop tokens that consist solely of Chinese or ASCII punctuation.

    Args:
        words_list: Iterable of string tokens.

    Returns:
        A new list with the punctuation-only tokens (and empty strings)
        removed; all other tokens are kept in their original order.
    """
    # ``word in punctuation`` on a str is a *substring* test, so a multi-character
    # token such as "……" or "..." would slip through. Checking each character
    # against a set removes every punctuation-only token.
    marks = frozenset(punctuation) | frozenset(string.punctuation)
    # all() is True for an empty token, which matches the previous behaviour
    # (an empty string is a substring of any string and was therefore dropped).
    new_words = [word for word in words_list if not all(ch in marks for ch in word)]
    return new_words

# 去除文本中的数字

In [None]:
def remove_numbers(words_list):
    """Drop numeric tokens from a list of words.

    A token is removed when ``str.isnumeric()`` is true for it (this includes
    Unicode numerics such as CJK numerals), or when it contains a hyphen
    immediately followed by one or more digits anywhere in the token
    (e.g. ``-5``).

    Args:
        words_list: Iterable of string tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    # re.search is enough here -- only the existence of a match matters.
    return [
        word
        for word in words_list
        if not word.isnumeric() and not re.search(r'-\d+', word)
    ]

In [47]:
import string
from zhon.hanzi import punctuation
import re
import jieba

class Custom_segment:
    """Sentence splitter plus jieba tokenizer with punctuation and stopword filtering.

    Calling an instance with a text returns the sentence fragments produced by
    ``cut_sentence`` together with one filtered token list per fragment.

    Args:
        stopwords_path: Path of a UTF-8 text file with one stopword per line.
            Defaults to ``'baidu_stopwords.txt'`` in the working directory.
    """

    # The six-dot ellipsis is listed first so that it is consumed as ONE
    # delimiter; if the single dot came first it would always win and the
    # ellipsis alternative could never match.
    _SENTENCE_DELIMITERS = re.compile(r'(\.{6}|[.!?。！？])')

    def __init__(self, stopwords_path='baidu_stopwords.txt'):
        self.stopwords = self.get_stopwords(stopwords_path)

    def get_stopwords(self, path='baidu_stopwords.txt'):
        """Read the stopword file and return its lines, stripped, as a list."""
        with open(path, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords

    def cut_sentence(self, text):
        """Split *text* at sentence-ending punctuation, keeping the delimiters.

        Returns a list in which every delimiter is its own item (the pattern
        uses a capturing group); a text ending with a delimiter yields a
        trailing empty string.
        """
        sentence_list = self._SENTENCE_DELIMITERS.split(text)
        return sentence_list

    def tokenizer(self, sent):
        """Segment *sent* with jieba and return the tokens that are neither
        punctuation nor stopwords, as a list."""
        segs = jieba.cut(sent)
        rem_p_segments = self.remove_punctuation(segs)
        rem_sw_segments = self.del_stopwords(rem_p_segments)
        return list(rem_sw_segments)

    def remove_punctuation(self, words):
        """Lazily yield the tokens of *words* that are not punctuation-only."""
        # ``word in punctuation`` on a str is a substring test, so multi-character
        # punctuation tokens (e.g. "……") would slip through; test per character.
        marks = frozenset(punctuation) | frozenset(string.punctuation)
        # all() is True for an empty token, so empty strings are still dropped.
        new_words = (word for word in words if not all(ch in marks for ch in word))
        return new_words

    def del_stopwords(self, words):
        """Lazily yield the tokens of *words* that are not in ``self.stopwords``."""
        new_words = (word for word in words if word not in self.stopwords)
        return new_words

    def __call__(self, text):
        """Return ``(sents, segments)``: the fragments from ``cut_sentence`` and,
        aligned with them, the filtered token list of each fragment."""
        sents = self.cut_sentence(text)
        segments = list(map(self.tokenizer, sents))
        return sents, segments


if __name__=="__main__":
    # Demo: run the whole pipeline on a three-sentence text, then tokenize one sentence directly.
    text = "2021年他叫汤姆去拿《外衣》。汤姆生病了。他去了医院的住院部。"
    # Constructing the segmenter reads the stopword list from baidu_stopwords.txt.
    custom_segment = Custom_segment()
    # Calling the instance returns the sentence fragments and one filtered token list per fragment.
    sents, segments = custom_segment(text)
    print(f'sents:{sents}, segments:{segments}')
    # tokenizer() can also be called on its own for a single sentence.
    segments = custom_segment.tokenizer('2021年他叫汤姆去拿《外衣》。')
    print(segments)

sents:['2021年他叫汤姆去拿《外衣》', '。', '汤姆生病了', '。', '他去了医院的住院部', '。', ''], segments:[['2021', '年', '汤姆', '去', '外衣'], [], ['汤姆生', '病'], [], ['去', '医院', '住院部'], [], []]
['2021', '年', '汤姆', '去', '外衣']
