In [1]:
# Root directory that is walked recursively for the source .txt files.
INPUT_DIR = 'data'
# Root directory that receives the processed files, mirroring INPUT_DIR's sub-directory layout.
OUTPUT_DIR = 'data_mod'

In [2]:
import os

In [4]:
import re

# Patterns are compiled once at definition time and applied in order by process_text().
# U+2012 (figure dash) and U+2212 (minus sign) are included so they become '-'
# instead of falling through to the final filter and turning into spaces.
_DASH_VARIANTS = re.compile(r'[‐‑–—―\u2012\u2212]')
_SPACE_VARIANTS = re.compile(r'[ \t\r\f\v\u2002\u2005\u2008\u200a\u200b\u3000\ufeff]')
# U+201F (double high-reversed-9 quotation mark) completes the curly double-quote family.
_DOUBLE_QUOTE_VARIANTS = re.compile(r'[“”„«»‹›\u201f]')
# U+201B (single high-reversed-9 quotation mark) completes the curly single-quote family.
_APOSTROPHE_VARIANTS = re.compile(r'[‘’‚`´ʼʾ\u201b]')
_DISALLOWED_CHARS = re.compile(r'[^A-Za-zА-Яа-яЁё0-9\n!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~ ]')
_SPACE_RUN = re.compile(r'[ ]+')


def process_text(text: str) -> str:
    """Normalise typography and strip characters outside the allowed alphabet.

    The steps are applied in this order:

    1. Dash-like characters are replaced with an ASCII hyphen (``-``).
    2. Listed non-newline whitespace characters are replaced with a space.
    3. Typographic double quotes and guillemets are replaced with ``"``.
    4. Typographic apostrophes, the backtick and the acute accent are
       replaced with ``'``.
    5. Every character that is not a Latin or Russian letter, a digit, a
       newline, a space or ASCII punctuation is replaced with a space.
    6. Runs of spaces are collapsed into a single space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Newlines are preserved; leading and trailing
        spaces on a line are collapsed but not removed.
    """
    # 1. Replace all variants of dashes with a hyphen
    text = _DASH_VARIANTS.sub('-', text)

    # 2. Replace whitespace characters (except for newlines) with a space.
    #    Whitespace that is not listed here is still turned into a space by step 5.
    text = _SPACE_VARIANTS.sub(' ', text)

    # 3. Replace all variants of quotes with a double quote (")
    text = _DOUBLE_QUOTE_VARIANTS.sub('"', text)

    # 4. Replace all variants of apostrophes with a single quote (')
    text = _APOSTROPHE_VARIANTS.sub("'", text)

    # 5. Keep only English, Russian alphabets, basic punctuation, and numbers
    text = _DISALLOWED_CHARS.sub(' ', text)

    # 6. Steps 2 and 5 can leave several spaces in a row; squeeze them
    return _SPACE_RUN.sub(' ', text)


In [5]:
import os

def process_all_files(INP_DIR: str, OUT_DIR: str):
    """Run process_text() over every ``.txt`` file found under ``INP_DIR``.

    The directory tree is walked recursively. Each processed file is written
    to the same relative location under ``OUT_DIR``; output directories are
    created on demand, and only for directories that contain at least one
    ``.txt`` file. Files are read and written as UTF-8.

    Args:
        INP_DIR: Root directory to search for ``.txt`` files.
        OUT_DIR: Root directory that receives the processed copies.
    """
    for current_dir, _subdirs, filenames in os.walk(INP_DIR):
        txt_names = [name for name in filenames if name.endswith(".txt")]
        if not txt_names:
            # Nothing to convert here, so do not create an output directory either.
            continue

        # Mirror the position of the current directory relative to the input root.
        target_dir = os.path.join(OUT_DIR, os.path.relpath(current_dir, INP_DIR))
        os.makedirs(target_dir, exist_ok=True)

        for name in txt_names:
            with open(os.path.join(current_dir, name), "r", encoding="utf-8") as src:
                raw_text = src.read()

            cleaned_text = process_text(raw_text)

            with open(os.path.join(target_dir, name), "w", encoding="utf-8") as dst:
                dst.write(cleaned_text)


In [6]:

# Process every .txt file under INPUT_DIR and write the results to the mirrored path under OUTPUT_DIR.
process_all_files(INPUT_DIR, OUTPUT_DIR)

In [7]:
# Collect every distinct character that occurs in the files under OUTPUT_DIR,
# so the resulting alphabet can be inspected.
chars = set()

for root, dirs, files in os.walk(OUTPUT_DIR):
    for file in files:
        # Read explicitly as UTF-8: without `encoding` the platform's locale
        # encoding is used, which can fail or mis-decode Cyrillic text.
        with open(os.path.join(root, file), 'r', encoding='utf-8') as inp:
            chars.update(inp.read())

In [8]:
# Show the collected characters in code-point order, separated by spaces.
' '.join(sorted(chars))

'\n   ! " # $ % & \' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \\ ] ^ _ a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ Ё А Б В Г Д Е Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я ё'