Skip to content

Commit

Permalink
更新切分算法,没有最长字符的强制切分,读句子会丝滑,但是千万要断句啊,怕你电脑吃不消
Browse files Browse the repository at this point in the history
  • Loading branch information
Ikaros-521 committed Sep 13, 2023
1 parent b5c7f9e commit 70f522c
Showing 1 changed file with 41 additions and 2 deletions.
43 changes: 41 additions & 2 deletions utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,8 @@ def split_sentences1(self, text):
return result


# 文本切分算法
def split_sentences(self, text):
# 文本切分算法 旧算法,有最大长度限制
def split_sentences2(self, text):
# 最大长度限制,超过后会强制切分
max_limit_len = 40

Expand Down Expand Up @@ -263,6 +263,45 @@ def split_sentences(self, text):
return result2


# 文本切分算法
def split_sentences(self, text, min_len=10):
    """Split text into sentence-sized chunks, merging very short sentences.

    The text is cut immediately after every sentence-ending mark: the
    Chinese full stop and both the full-width and ASCII forms of the
    exclamation and question marks. Line breaks are removed from each
    piece. Consecutive pieces are then concatenated until the accumulated
    chunk is at least ``min_len`` characters long, so that tiny sentences
    are not emitted on their own.

    There is deliberately no upper length limit: text that contains no
    sentence-ending mark is returned as one single chunk.

    Args:
        text: The string to split.
        min_len: Minimum chunk length. A chunk shorter than this absorbs
            the following piece instead of being emitted. Defaults to 10.

    Returns:
        A list of non-empty chunks in their original order. An empty
        input yields an empty list.
    """
    # Terminators are spelled as escapes so the full-width forms cannot be
    # silently folded to ASCII by an editor or re-encoding step:
    # \u3002 = ideographic full stop, \uff01 / \uff1f = full-width "!" / "?".
    pieces = re.split(r'(?<=[\u3002\uff01\uff1f!?])', text)

    chunks = []
    pending = ""

    for piece in pieces:
        # Strip line breaks only (including the CR of CRLF input). Spaces
        # are kept on purpose so Latin-script text stays readable.
        piece = piece.replace('\r', '').replace('\n', '')
        if not piece:
            continue

        if pending and len(pending) >= min_len:
            # `pending` is built solely from pieces that end in a
            # terminator (only the final piece can lack one, and nothing
            # follows it), so it is always safe to emit as-is here.
            chunks.append(pending)
            pending = piece
        else:
            # Still too short: merge with the next piece.
            pending += piece

    # Emit whatever is left, even if it is shorter than min_len.
    if pending:
        chunks.append(pending)

    return chunks


# 字符串匹配算法来计算字符串之间的相似度,并选择匹配度最高的字符串作为结果
def find_best_match(self, substring, string_list, similarity=0.5):
"""字符串匹配算法来计算字符串之间的相似度,并选择匹配度最高的字符串作为结果
Expand Down

0 comments on commit 70f522c

Please sign in to comment.