In [None]:
import re

# Sentence boundary: whitespace that follows '.', '!' or '?' and is followed by
# something that looks like a sentence start (ASCII capital, digit, opening
# quote or opening bracket). This is a heuristic; abbreviations such as
# "Dr. Smith" are split as well.
_SENT_SPLIT_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z0-9"“(\[])')
_WHITESPACE_RUN_RE = re.compile(r'\s+')
# A bullet item starts with optional spaces/tabs followed by the "●" glyph.
_BULLET_START_RE = re.compile(r'^[ \t]*●')


def _collapse_whitespace(text: str) -> str:
    """Replace every whitespace run with one space and trim both ends."""
    return _WHITESPACE_RUN_RE.sub(' ', text).strip()


def _split_into_sentences(paragraph: str) -> list:
    """Split a paragraph into its non-empty, stripped sentences."""
    parts = _SENT_SPLIT_RE.split(_collapse_whitespace(paragraph))
    return [part.strip() for part in parts if part.strip()]


def _append_entry(result: list, text: str) -> None:
    """Append one output line, setting questions apart with a blank line."""
    # A question gets a blank line in front of it unless it is the very first
    # entry or the previous entry already is a blank line.
    if text.endswith("?") and result and result[-1] != "":
        result.append("")
    result.append(text)


def _flush_paragraph(buffer: list, result: list) -> None:
    """Emit the pending ordinary lines as one sentence per output line."""
    if not buffer:
        return
    for sentence in _split_into_sentences(" ".join(buffer)):
        _append_entry(result, sentence)
    buffer.clear()


def _flush_bullet(buffer: list, result: list) -> None:
    """Emit the pending bullet item as a single output line."""
    if not buffer:
        return
    _append_entry(result, _collapse_whitespace(" ".join(buffer)))
    buffer.clear()


def _normalize_lines(lines) -> list:
    """Turn raw input lines into the list of normalized output lines."""
    result = []
    paragraph = []  # pending lines of ordinary running text
    bullet = []     # pending lines of the bullet item being collected

    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            # Blank lines carry no meaning here; paragraphs continue across them.
            continue

        if _BULLET_START_RE.match(raw):
            # A new bullet closes whatever was being collected before it.
            # At most one of the two buffers is non-empty at this point.
            _flush_bullet(bullet, result)
            _flush_paragraph(paragraph, result)
            bullet.append(raw.lstrip())
            continue

        # A non-empty bullet buffer means we are inside a bullet item.
        if bullet:
            if stripped.endswith("?"):
                # A question line ends the bullet and is emitted on its own.
                _flush_bullet(bullet, result)
                paragraph.append(stripped)
                _flush_paragraph(paragraph, result)
            elif raw.startswith((" ", "\t")):
                # Indented line: continuation of the current bullet.
                bullet.append(stripped)
            else:
                # Unindented line: the bullet is over, ordinary text resumes.
                _flush_bullet(bullet, result)
                paragraph.append(stripped)
            continue

        paragraph.append(stripped)

    _flush_bullet(bullet, result)
    _flush_paragraph(paragraph, result)
    return result


def normalize_text(input_path: str, output_path: str) -> None:
    """Normalize the text of a file and write the result to another file.

    Rules applied, in input order:

    * Whitespace-only lines are dropped.
    * A line whose first non-blank character is "●" starts a bullet item.
      Following indented lines are joined onto it, so each bullet ends up on
      one output line. A bullet ends at the next bullet, at an unindented
      line, or at any line ending with "?".
    * All other consecutive lines are merged into one paragraph, which is
      then written as one sentence per line.
    * Every output line ending with "?" is preceded by an empty line, unless
      it is the first line or already follows an empty line.

    The output lines are joined with "\\n"; no trailing newline is written.
    The input is read completely before the output is opened, so both paths
    may refer to the same file.

    Args:
        input_path (str): Path to the input text file (UTF-8, with or
            without a byte-order mark).
        output_path (str): Path to the output text file (written as UTF-8).
    """
    # "utf-8-sig" drops a leading BOM if present; otherwise the BOM would stick
    # to the first line, hide a leading bullet and leak into the output.
    with open(input_path, "r", encoding="utf-8-sig") as f:
        lines = [line.rstrip("\n") for line in f]

    result = _normalize_lines(lines)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(result))


In [17]:
# Run the normalizer: read "raw.txt" and write the normalized text to "resource.txt".
normalize_text("raw.txt", "resource.txt")