In [34]:
import re
import unicodedata

def isCharacterPrintable(c: str) -> bool:
  """Decide whether a single character should be kept as printable.

  Rules, checked in this order:
    1. U+200B, U+200C and U+200D (zero-width space / non-joiner / joiner)
       are rejected.
    2. U+000A (line feed) is accepted even though it is a control character.
    3. Anything else is rejected only when its Unicode general category
       is 'Cc' (control).

  Args:
    c: A string of exactly one character (``ord`` raises TypeError otherwise).

  Returns:
    True when the character is kept, False when it should be dropped.
  """
  codePoint = ord(c)
  if codePoint in (0x200B, 0x200C, 0x200D): # Zero-width space, non-joiner, joiner
    return False
  if codePoint == 0x000A: # Line feed is the one control character that is kept
    return True
  # Remaining characters are printable unless they are control characters.
  return unicodedata.category(c) != 'Cc'

def strip(text: str) -> str:
  """Remove leading and trailing whitespace (newlines included) from text.

  Only the two ends of the whole string are trimmed; whitespace inside the
  text, including whitespace-only lines, is left untouched.

  Args:
    text: The string to trim.

  Returns:
    The trimmed string.
  """
  # str.strip() already removes every trailing whitespace character, so an
  # additional "trailing non-newline whitespace" regex pass could never match
  # and has been dropped.
  return text.strip()

def normalizeLineEndings(text: str) -> str:
  """Convert CRLF and lone CR line endings to LF.

  Args:
    text: The string whose line endings should be unified.

  Returns:
    The text with every '\\r\\n' pair and every remaining '\\r' replaced
    by a single '\\n'.
  """
  # Order matters: CRLF pairs are handled first so that they collapse to a
  # single LF instead of being turned into two.
  for lineEndingPattern in (r'\r\n', r'\r'):
    text = re.sub(lineEndingPattern, '\n', text)
  return text

def compressWhitespace(text: str) -> str:
  """Collapse each run of non-newline whitespace into a single space.

  Newline characters are never part of a run, so the line structure of the
  text is preserved.

  Args:
    text: The string to compress.

  Returns:
    The text with every run of whitespace other than '\\n' replaced by ' '.
  """
  # [^\S\n] reads as "whitespace, but not a newline".
  horizontalRun = re.compile(r'[^\S\n]+')
  return horizontalRun.sub(' ', text)

def normalizeUnicode(text: str, unicodeNormalizeCategory: str = 'NFKC') -> str:
  """Return text in the requested Unicode normalization form.

  Args:
    text: The string to normalize.
    unicodeNormalizeCategory: Normalization form handed to
      ``unicodedata.normalize``; defaults to 'NFKC'.

  Returns:
    The normalized string.
  """
  return unicodedata.normalize(unicodeNormalizeCategory, text)

def clearUnprintable(text: str) -> str:
  """Drop every character that isCharacterPrintable rejects.

  Args:
    text: The string to filter.

  Returns:
    A new string made of the characters of text, in their original order,
    for which isCharacterPrintable returns True.
  """
  keptCharacters = [character for character in text if isCharacterPrintable(character)]
  return ''.join(keptCharacters)

def compressCommas(text: str) -> str:
  """Collapse runs of two or more commas into a single ", ".

  A run is at least two commas, optionally separated and surrounded by
  whitespace (newlines included). A lone comma is left exactly as written.

  Args:
    text: The string to clean up.

  Returns:
    The text with each such comma run replaced by ", ".
  """
  repeatedCommas = re.compile(r"\s*,[\s,]*,\s*")
  return repeatedCommas.sub(", ", text)

def compressLines(text: str, joiner: str = " ") -> str:
  """Replace each run of one or more newlines with joiner.

  Args:
    text: The string whose lines should be joined.
    joiner: Text inserted in place of every newline run; defaults to a
      single space. It is inserted literally, so backslashes in it are
      kept as-is.

  Returns:
    The text with every run of '\\n' characters replaced by joiner.
  """
  # A plain-string replacement is treated by re.sub as a template, so a
  # joiner containing a backslash would be read as an escape or group
  # reference (or raise re.error). A callable replacement is used verbatim.
  return re.sub("\n+", lambda _match: joiner, text)

def checkOutput(output):
  """Print a string with its length, code points and Unicode categories.

  Four lines are written to stdout: the text wrapped in curly quotes, its
  length, the hexadecimal code point of every character (no '0x' prefix),
  and the Unicode general category of every character.

  Args:
    output: The string to inspect.
  """
  print('Output: “' + output + "”")
  print('Length:', len(output))
  # format(n, 'x') yields the same digits as hex(n) without the '0x' prefix.
  codePoints = ' '.join(format(ord(character), 'x') for character in output)
  print('Hex:', codePoints)
  categories = ' '.join(map(unicodedata.category, output))
  print('Categories:', categories)

In [35]:
# Sample input for the cleaning helpers: it starts with a newline, contains a
# CRLF line ending, a tabs-only line, an indented line with full-width CJK
# punctuation, and ends with blank lines plus a trailing space.
text = """
  Hello, world!\r\nabc
\t\t
     你好，世界！

 """
checkOutput(text)

Output: “
  Hello, world!
abc
		
     你好，世界！

 ”
Length: 39
Hex: a 20 20 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 d a 61 62 63 a 9 9 a 20 20 20 20 20 4f60 597d ff0c 4e16 754c ff01 a a 20
Categories: Cc Zs Zs Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Cc Cc Ll Ll Ll Cc Cc Cc Cc Zs Zs Zs Zs Zs Lo Lo Po Lo Lo Po Cc Cc Zs


In [36]:
# Step 1: unify line endings so that CRLF and lone CR both become LF.
# Note: this cell rebinds `text`, so cells must be run in order.
text = normalizeLineEndings(text)
checkOutput(text)

Output: “
  Hello, world!
abc
		
     你好，世界！

 ”
Length: 38
Hex: a 20 20 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 a 61 62 63 a 9 9 a 20 20 20 20 20 4f60 597d ff0c 4e16 754c ff01 a a 20
Categories: Cc Zs Zs Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Cc Ll Ll Ll Cc Cc Cc Cc Zs Zs Zs Zs Zs Lo Lo Po Lo Lo Po Cc Cc Zs


In [37]:
# Step 2: trim whitespace (including newlines) from both ends of the whole
# text; whitespace inside the text is not affected by this step.
text = strip(text)
checkOutput(text)

Output: “Hello, world!
abc
		
     你好，世界！”
Length: 32
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 a 61 62 63 a 9 9 a 20 20 20 20 20 4f60 597d ff0c 4e16 754c ff01
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Cc Ll Ll Ll Cc Cc Cc Cc Zs Zs Zs Zs Zs Lo Lo Po Lo Lo Po


In [38]:
# Step 3: collapse every run of non-newline whitespace (spaces, tabs) into a
# single space while keeping the newlines themselves.
text = compressWhitespace(text)
checkOutput(text)

Output: “Hello, world!
abc
 
 你好，世界！”
Length: 27
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 a 61 62 63 a 20 a 20 4f60 597d ff0c 4e16 754c ff01
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Cc Ll Ll Ll Cc Zs Cc Zs Lo Lo Po Lo Lo Po


In [39]:
# Step 4: apply the default NFKC normalization; for this sample it turns the
# full-width comma (U+FF0C) and exclamation mark (U+FF01) into ASCII ',' and '!'.
text = normalizeUnicode(text)
checkOutput(text)

Output: “Hello, world!
abc
 
 你好,世界!”
Length: 27
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 a 61 62 63 a 20 a 20 4f60 597d 2c 4e16 754c 21
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Cc Ll Ll Ll Cc Zs Cc Zs Lo Lo Po Lo Lo Po


In [40]:
# Step 5: drop zero-width characters and control characters other than line
# feed. The sample contains none at this point, so the text is unchanged.
text = clearUnprintable(text)
checkOutput(text)

Output: “Hello, world!
abc
 
 你好,世界!”
Length: 27
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 a 61 62 63 a 20 a 20 4f60 597d 2c 4e16 754c 21
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Cc Ll Ll Ll Cc Zs Cc Zs Lo Lo Po Lo Lo Po
