更新切分算法，没有最长字符的强制切分，读句子会丝滑，但是千万要断句啊，怕你电脑吃不消

Ikaros-521 · Sep 13, 2023 · 70f522c · 70f522c
1 parent b5c7f9e
commit 70f522c
Showing 1 changed file with 41 additions and 2 deletions.
diff --git a/utils/common.py b/utils/common.py
@@ -209,8 +209,8 @@ def split_sentences1(self, text):
         return result
 
 
-    # 文本切分算法
-    def split_sentences(self, text):
+    # 文本切分算法 旧算法，有最大长度限制
+    def split_sentences2(self, text):
         # 最大长度限制，超过后会强制切分
         max_limit_len = 40
 
@@ -263,6 +263,45 @@ def split_sentences(self, text):
         return result2
 
 
+    # Text splitting algorithm: cuts after sentence-ending punctuation, with no maximum-length forced split
+    def split_sentences(self, text):
+        # Split with a zero-width lookbehind on 。！？!? so each mark stays at the end of its piece
+        sentences = re.split(r'(?<=[。！？!?])', text)
+        result = []
+        current_sentence = ""
+
+        for sentence in sentences:
+            # Strip newline characters (only '\n' is removed; spaces are left as they are)
+            sentence = sentence.replace('\n', '')
+
+            # Skip pieces that are empty after stripping
+            if not sentence:
+                continue
+
+            # While the accumulated buffer is shorter than 10 characters, keep appending pieces to it
+            if len(current_sentence) < 10:
+                current_sentence += sentence
+            else:
+                # Buffer has reached 10+ characters: check whether it ends with sentence punctuation
+                if current_sentence[-1] in ["。", "！", "？", ".", "!", "?"]:
+                    result.append(current_sentence)
+                    current_sentence = sentence
+                else:
+                    # Otherwise re-split the buffer after , ， ; ； and carry the last part forward. NOTE(review): looks unreachable, pieces appended earlier end in 。！？!? - confirm
+                    split_sentences = re.split(r'(?<=[,，;；])', current_sentence)
+                    if len(split_sentences) > 1:
+                        result.extend(split_sentences[:-1])
+                        current_sentence = split_sentences[-1] + sentence
+                    else:
+                        current_sentence += sentence
+
+        # Flush whatever remains in the buffer as the last sentence
+        if current_sentence:
+            result.append(current_sentence)
+
+        return result
+
+
     # 字符串匹配算法来计算字符串之间的相似度，并选择匹配度最高的字符串作为结果
     def find_best_match(self, substring, string_list, similarity=0.5):
         """字符串匹配算法来计算字符串之间的相似度，并选择匹配度最高的字符串作为结果