# In [22]:
import re

def process_corpus(corpus):
    """Split a raw corpus string into chapters.

    Every line is stripped of surrounding whitespace and blank lines are
    ignored.  A line that starts with "译文：" or contains "卷" is treated as
    a chapter boundary: it closes the chapter collected so far (if any) and
    is itself discarded.  All other lines are collected into the current
    chapter.

    Args:
        corpus: The full corpus text.

    Returns:
        A list of chapters, each being its lines joined with "\\n".
    """
    chapters = []
    pending = []

    for raw in corpus.splitlines():
        text = raw.strip()
        if not text:
            # Blank lines carry no content and never act as boundaries.
            continue

        is_boundary = text.startswith("译文：") or "卷" in text
        if not is_boundary:
            pending.append(text)
        elif pending:
            # A boundary closes the chapter in progress; the boundary line
            # itself is not kept.
            chapters.append("\n".join(pending))
            pending = []

    # Flush whatever follows the last boundary.
    if pending:
        chapters.append("\n".join(pending))

    return chapters

# A sentence ends at a run of terminators (。！？ or the colon ：).  Closing
# quotes/brackets that directly follow the run belong to the sentence they
# close, so they are kept with it instead of becoming a fragment of their own.
# The second alternative picks up trailing text that has no terminator.
_SENTENCE_PATTERN = re.compile(r'[^。！？：]*[。！？：]+[”’」』）》】]*|[^。！？：]+')


def split_into_sentences(paragraph):
    """Split a paragraph into sentences.

    A sentence ends after "。", "！", "？" or "：".  Consecutive terminators
    (e.g. "？！") and closing quotes/brackets immediately after them stay
    attached to the sentence rather than being emitted as separate pieces.
    Text after the last terminator is returned as a final sentence.

    Args:
        paragraph: The text to split.

    Returns:
        A list of sentences with surrounding whitespace removed; pieces that
        are empty after stripping are dropped.
    """
    pieces = (piece.strip() for piece in _SENTENCE_PATTERN.findall(paragraph))
    return [piece for piece in pieces if piece]

def process_and_number_sentences(corpus):
    """Split a corpus into chapters and number the sentences of each paragraph.

    The corpus is divided into chapters with process_corpus.  Every non-blank
    line of a chapter is treated as a paragraph and split with
    split_into_sentences; its sentences are numbered starting from 1 (the
    count restarts for every paragraph) and joined with tabs on one line.

    Args:
        corpus: The full corpus text.

    Returns:
        A list with one string per chapter, containing one line per
        paragraph in the form "1:...\\t2:...".
    """
    def number_paragraph(paragraph):
        # Prefix each sentence with its 1-based position in the paragraph.
        parts = split_into_sentences(paragraph)
        return "\t".join(f"{idx}:{text}" for idx, text in enumerate(parts, start=1))

    result = []
    for chapter in process_corpus(corpus):
        numbered_lines = [
            number_paragraph(paragraph)
            for paragraph in chapter.splitlines()
            if paragraph.strip()
        ]
        result.append("\n".join(numbered_lines))

    return result

# Read the corpus from a file
def read_corpus_from_file(file_path):
    """Return the entire content of the UTF-8 text file at file_path."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text

# Write the results to a file
def write_results_to_file(numbered_sentences, output_file_path):
    """Write every chapter under a numbered "篇章 N:" header.

    Each chapter is written as a header line, followed by the chapter text
    and then one blank line, so consecutive chapters are separated by an
    empty line.  The output file is created or overwritten and encoded as
    UTF-8.

    Args:
        numbered_sentences: Iterable of chapter strings, in output order.
        output_file_path: Path of the file to write.
    """
    # Plain 'w' is enough: the file is only written, never read back.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for chapter_count, sentences in enumerate(numbered_sentences, start=1):
            file.write(f"篇章 {chapter_count}:\n")
            file.write(f"{sentences}\n")
            # Blank line after each chapter; this must be inside the loop,
            # otherwise only a single blank line ends up at the end of the file.
            file.write("\n")

# Example file paths
input_file_path = '二十四史纯译文版.txt'
output_file_path = 'processed_output_v6.txt'

# Read the corpus and process it (these statements run as soon as the
# module/cell is executed)
corpus = read_corpus_from_file(input_file_path)
numbered_sentences = process_and_number_sentences(corpus)

# Write the results to the output file
write_results_to_file(numbered_sentences, output_file_path)
