In [1]:
######## DEMOCRATS 2000
import re
from pathlib import Path

def _title_case_heading(heading: str) -> str:
    """Return *heading* in Title Case without the artefacts of ``str.title``.

    ``str.title`` treats every non-letter as a word boundary, so
    "AMERICA'S" becomes "America'S" and "21ST" becomes "21St". Here a word is
    a run of letters/digits that may contain internal apostrophes, and each
    word is capitalised as a whole (first character upper, rest lower).
    """
    return re.sub(
        r"[^\W_]+(?:['’][^\W_]+)*",
        lambda match: match.group(0).capitalize(),
        heading,
    )


def clean_manifesto_final(input_path: str, output_path: str) -> None:
    """Clean the raw 2000 Democratic platform text and write the result.

    Steps applied to the file at ``input_path``:

    1. Normalise line endings and turn form feeds into newlines.
    2. Blank out running page headers such as
       "2000 Democratic National Platform — 2".
    3. Blank out lines that contain only a page number.
    4. Join consecutive non-empty lines into single-line paragraphs and
       convert fully upper-case lines (section titles) to Title Case.
    5. Collapse runs of blank lines and end the text with one newline.

    Args:
        input_path: Path of the raw UTF-8 text file. Undecodable bytes are
            ignored.
        output_path: Path where the cleaned UTF-8 text is written.
    """
    text = Path(input_path).read_text(encoding="utf-8", errors="ignore")

    # Basic normalisation: one newline convention, no form feeds.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\f", "\n")

    # 1. Remove leftover page headers like
    #    "2000 Democratic National Platform — 2".
    text = re.sub(
        r"^\s*2000 Democratic National Platform\s*[—-]\s*\d+\s*$",
        "",
        text,
        flags=re.MULTILINE,
    )

    # 2. Remove lines that contain nothing but a page number.
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)

    # 3. Rebuild paragraphs and normalise titles.
    new_lines = []
    buffer = []

    def flush_buffer():
        """Emit the buffered lines as one paragraph and reset the buffer."""
        if buffer:
            paragraph = " ".join(buffer)
            paragraph = re.sub(r"\s{2,}", " ", paragraph)
            new_lines.append(paragraph.strip())
            buffer.clear()

    for line in text.split("\n"):
        stripped = line.strip()

        # Blank line => end of the current paragraph.
        if stripped == "":
            flush_buffer()
            # Never emit two blank separators in a row (or a leading one).
            if new_lines and new_lines[-1] != "":
                new_lines.append("")
            continue

        # Fully upper-case line => section title, rendered in Title Case,
        # e.g. "PROTECTING AMERICA'S CONSUMERS" -> "Protecting America's Consumers".
        if stripped.isupper():
            flush_buffer()
            new_lines.append(_title_case_heading(stripped))
            continue

        # General case: ordinary text, accumulated until the paragraph ends.
        buffer.append(stripped)

    # Last paragraph (file may not end with a blank line).
    flush_buffer()

    # 4. Final clean-up: at most one blank line between blocks, single
    #    trailing newline.
    text_clean = "\n".join(new_lines)
    text_clean = re.sub(r"\n{3,}", "\n\n", text_clean).strip() + "\n"

    Path(output_path).write_text(text_clean, encoding="utf-8")


# Script entry point for the Democrats 2000 cell: cleans one specific raw file
# and writes the result next to it.
# NOTE(review): both paths are absolute and machine-specific; they must be
# edited before running this cell on another machine.
if __name__ == "__main__":
    clean_manifesto_final(
        input_path="/Users/xavierfoidart/Documents/M1/Data Management/Projet/elections-nlp-project/data/raw/manifestos/2000-Democrats.txt",
        output_path="/Users/xavierfoidart/Documents/M1/Data Management/Projet/elections-nlp-project/data/raw/manifestos/2000-Democrats_clean_final.txt",
    )


In [None]:
###### REPUBLICANS 2000
import argparse
import re
from pathlib import Path
from typing import List


# ---------- Heuristics ----------

def remove_toc_heuristic(lines: List[str], max_scan_lines: int = 200) -> List[str]:
    """
    Drop a leading Table of Contents block, if one can be located.

    The TOC start is the first line, among the first ``max_scan_lines`` lines,
    whose stripped lower-cased text is exactly 'contents' or
    'table of contents'. The TOC end is the first later line (searched within
    ``max_scan_lines`` of the start) that is longer than 40 characters once
    stripped and does not end with digits, i.e. does not look like a TOC entry
    with a page number.

    Returns the input list unchanged when no TOC start or no safe restart
    line is found; otherwise returns the lines before the TOC followed by the
    lines from the restart line onwards.
    """
    toc_titles = {"contents", "table of contents"}
    toc_start = next(
        (
            pos
            for pos, raw in enumerate(lines[:max_scan_lines])
            if raw.strip().lower() in toc_titles
        ),
        None,
    )
    if toc_start is None:
        # No TOC heading in the scanned window.
        return lines

    scan_stop = min(len(lines), toc_start + max_scan_lines)
    for pos in range(toc_start + 1, scan_stop):
        entry = lines[pos].strip()
        if len(entry) <= 40:
            # Too short (or empty) to be a body paragraph.
            continue
        if re.search(r"\d+\s*$", entry):
            # Ends with a number: most likely a TOC entry with its page.
            continue
        # First body-like line: splice the TOC block out.
        return lines[:toc_start] + lines[pos:]

    # Failsafe: no safe restart point found, leave the document untouched.
    return lines


def is_noise(line: str) -> bool:
    """
    Tell whether a line looks like OCR garbage.

    A line is noise when, after stripping, it is longer than 5 characters and
    fewer than half of its characters are alphabetic. Empty lines and lines of
    5 characters or fewer are never flagged (they could be section numbers,
    etc.).
    """
    content = line.strip()

    # Covers both the empty case and the "too short to judge" case.
    if len(content) <= 5:
        return False

    alpha_count = len([ch for ch in content if ch.isalpha()])
    return alpha_count / len(content) < 0.5


def is_header_candidate(line: str) -> bool:
    """
    Tell whether a line is probably a section header.

    After stripping, a non-empty line is a header candidate when any of these
    holds:
    - it is fully upper-case and at most 80 characters long
      (e.g. "PROSPERITY", "RETIREMENT SECURITY");
    - it is in Title Case and has at most 10 words
      (e.g. "Old Truths For The New Economy");
    - it starts with a Roman-numeral-like marker followed by '.' and
      whitespace (I., II., III., IV., ...);
    - it starts with '|.' followed by optional whitespace and a letter.
    """
    candidate = line.strip()
    if not candidate:
        return False

    all_caps = candidate.isupper() and len(candidate) <= 80
    short_title = candidate.istitle() and len(candidate.split()) <= 10
    roman_numbered = re.match(r"^[IVXLC]+\.\s+", candidate) is not None
    pipe_bulleted = re.match(r"^\|\.\s*[A-Za-z]", candidate) is not None

    return all_caps or short_title or roman_numbered or pipe_bulleted


def normalize_header(line: str) -> str:
    """
    Normalize a header line:
    - Strip a leading Roman-numeral marker like 'III. '
    - Strip a leading '|.' bullet
    - Convert to Title Case

    Title-casing is done word by word rather than with ``str.title``, which
    treats apostrophes and digits as word boundaries and would turn
    "AMERICA'S" into "America'S" and "21ST" into "21St".
    """
    stripped = line.strip()

    # Remove leading roman numeral markers: "III. " -> ""
    stripped = re.sub(r"^[IVXLC]+\.\s+", "", stripped)

    # Remove leading bullet like "|."
    stripped = re.sub(r"^\|\.\s*", "", stripped)

    # A word is a run of letters/digits with optional internal apostrophes;
    # capitalize() upper-cases its first character and lower-cases the rest.
    return re.sub(
        r"[^\W_]+(?:['’][^\W_]+)*",
        lambda match: match.group(0).capitalize(),
        stripped,
    )


# ---------- Core cleaning ----------

def clean_manifesto_text(text: str, remove_toc: bool = True) -> str:
    """
    Return a cleaned version of a raw manifesto text.

    Pipeline:
    - unify line endings and turn form feeds into newlines;
    - when ``remove_toc`` is true, pass the lines through
      ``remove_toc_heuristic``;
    - discard lines that are only digits, lines matching the page
      header/footer pattern ("... — 4", "... - 12"), and lines flagged by
      ``is_noise``;
    - merge consecutive ordinary lines into single-line paragraphs, with
      blank lines acting as separators;
    - emit lines accepted by ``is_header_candidate`` on their own line after
      ``normalize_header``;
    - collapse runs of blank lines and finish with exactly one newline.
    """
    page_marker = re.compile(r"^\s*.*\s[—-]\s*\d+\s*$")

    unified = text.replace("\r\n", "\n").replace("\r", "\n").replace("\f", "\n")
    raw_lines = unified.split("\n")
    if remove_toc:
        raw_lines = remove_toc_heuristic(raw_lines)

    # Filtering pass: page numbers, page headers/footers, OCR noise.
    kept: List[str] = [
        raw
        for raw in raw_lines
        if not (
            re.fullmatch(r"\d+", raw.strip())
            or page_marker.match(raw)
            or is_noise(raw)
        )
    ]

    output: List[str] = []
    pending: List[str] = []

    def emit_pending() -> None:
        # Turn the accumulated lines into one paragraph; skip it if it ends
        # up empty after whitespace squeezing.
        if not pending:
            return
        merged = re.sub(r"\s{2,}", " ", " ".join(pending)).strip()
        if merged:
            output.append(merged)
        pending.clear()

    for raw in kept:
        content = raw.strip()
        if content == "":
            # Paragraph boundary; keep at most one blank separator and never
            # start the output with one.
            emit_pending()
            if output and output[-1] != "":
                output.append("")
        elif is_header_candidate(content):
            emit_pending()
            output.append(normalize_header(content))
        else:
            pending.append(content)

    # The text may end without a trailing blank line.
    emit_pending()

    joined = "\n".join(output)
    return re.sub(r"\n{3,}", "\n\n", joined).strip() + "\n"


def clean_manifesto_file(input_path: Path, output_path: Path, remove_toc: bool = True) -> None:
    """
    Clean one manifesto file.

    Reads ``input_path`` as UTF-8 (undecodable bytes are ignored), cleans the
    text with ``clean_manifesto_text``, creates the parent directory of
    ``output_path`` if needed, and writes the cleaned text there as UTF-8.
    """
    raw_text = input_path.read_text(encoding="utf-8", errors="ignore")
    result = clean_manifesto_text(raw_text, remove_toc=remove_toc)

    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    output_path.write_text(result, encoding="utf-8")


def clean_manifesto_folder(input_dir: Path, output_dir: Path, remove_toc: bool = True) -> None:
    """
    Clean every ``*.txt`` file found directly in ``input_dir``.

    Each cleaned file is written to ``output_dir`` under the same file name,
    and one progress line per file is printed.
    """
    for source in input_dir.glob("*.txt"):
        target = output_dir / source.name
        print(f"Cleaning {source} -> {target}")
        clean_manifesto_file(source, target, remove_toc=remove_toc)


# ---------- CLI ----------

def main():
    """
    Command-line entry point.

    Runs in single-file mode when both --input-file and --output-file are
    given, otherwise in batch mode when both --input-dir and --output-dir are
    given; exits with a usage message when neither pair is complete.
    --no-toc disables the table-of-contents heuristic.
    """
    parser = argparse.ArgumentParser(description="Clean manifesto text files for NLP.")

    # The four path options share the same shape; declare them from a table.
    path_options = (
        ("--input-file", "Path to a single raw manifesto .txt file."),
        ("--output-file", "Output path for the cleaned single file."),
        ("--input-dir", "Directory containing raw manifesto .txt files."),
        ("--output-dir", "Directory to store cleaned manifesto files."),
    )
    for flag, description in path_options:
        parser.add_argument(flag, type=str, help=description)
    parser.add_argument(
        "--no-toc",
        action="store_true",
        help="Disable heuristic removal of table of contents.",
    )

    options = parser.parse_args()
    strip_toc = not options.no_toc

    # Single-file mode takes precedence when both pairs are supplied.
    if options.input_file and options.output_file:
        clean_manifesto_file(
            Path(options.input_file),
            Path(options.output_file),
            remove_toc=strip_toc,
        )
        return

    # Batch mode.
    if options.input_dir and options.output_dir:
        clean_manifesto_folder(
            Path(options.input_dir),
            Path(options.output_dir),
            remove_toc=strip_toc,
        )
        return

    raise SystemExit(
        "Specify either:\n"
        "  --input-file and --output-file\n"
        "or\n"
        "  --input-dir and --output-dir"
    )


# Run the command-line interface only when this code is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
