In [108]:
import unicodedata
import re

# Unicode general-category codes for uppercase ('Lu') and lowercase ('Ll')
# letters. NOTE(review): no visible cell reads this set — confirm whether
# it is still needed.
printableUnicodeCategories = {'Lu', 'Ll'}
# Normalization form name handed to unicodedata.normalize() in a later cell.
unicodeNormalizeCategory = 'NFKC'

def checkOutput(output):
  """Print a diagnostic dump of ``output`` to stdout.

  Four lines are written: the text wrapped in curly quotes, its length in
  code points, every code point as lowercase hex without the ``0x`` prefix,
  and every character's Unicode general category.
  """
  print('Output: “' + output + "”")
  print('Length:', len(output))

  # Walk the string once, building both per-character views side by side.
  codePoints = []
  categories = []
  for ch in output:
    codePoints.append(format(ord(ch), 'x'))
    categories.append(unicodedata.category(ch))

  print('Hex:', ' '.join(codePoints))
  print('Categories:', ' '.join(categories))

In [109]:
# Sample input: leading spaces, a long internal run of spaces, and fullwidth
# CJK punctuation (U+FF0C, U+FF01) for the following cells to clean up.
text = "  Hello, world!      你好，世界！"
checkOutput(text)

Output: “  Hello, world!      你好，世界！”
Length: 27
Hex: 20 20 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 20 20 20 20 20 20 4f60 597d ff0c 4e16 754c ff01
Categories: Zs Zs Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Zs Zs Zs Zs Zs Zs Lo Lo Po Lo Lo Po


In [110]:
# Trim leading and trailing whitespace from the sample.
text = text.strip()
checkOutput(text)

Output: “Hello, world!      你好，世界！”
Length: 25
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 20 20 20 20 20 20 4f60 597d ff0c 4e16 754c ff01
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Zs Zs Zs Zs Zs Zs Lo Lo Po Lo Lo Po


In [111]:
# Collapse every run of whitespace into a single ASCII space.
text = re.sub(r'\s+', ' ', text)
checkOutput(text)

Output: “Hello, world! 你好，世界！”
Length: 20
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 20 4f60 597d ff0c 4e16 754c ff01
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Zs Lo Lo Po Lo Lo Po


In [112]:
# Normalize using the form named by unicodeNormalizeCategory. In the recorded
# run this folded the fullwidth comma and exclamation mark to ASCII ',' / '!'.
text = unicodedata.normalize(unicodeNormalizeCategory, text)
checkOutput(text)

Output: “Hello, world! 你好,世界!”
Length: 20
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 20 4f60 597d 2c 4e16 754c 21
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Zs Lo Lo Po Lo Lo Po


In [113]:
def isCharacterPrintable(c: str) -> bool:
  """Return False for characters that should be stripped, True otherwise.

  Rejected characters are U+200B (zero-width space), U+200C (zero-width
  non-joiner), U+200D (zero-width joiner), and anything whose Unicode
  general category is 'Cc' (control characters). All others are accepted.
  """
  isControl = unicodedata.category(c) == 'Cc'
  isZeroWidth = ord(c) in (0x200B, 0x200C, 0x200D)
  return not (isZeroWidth or isControl)

# Keep only the characters that isCharacterPrintable accepts.
# NOTE(review): this runs after the whitespace-collapsing cell, so dropping a
# zero-width character that sat between two spaces would leave a double
# space — confirm the intended order of the steps.
text = ''.join(filter(isCharacterPrintable, text))
checkOutput(text)


Output: “Hello, world! 你好,世界!”
Length: 20
Hex: 48 65 6c 6c 6f 2c 20 77 6f 72 6c 64 21 20 4f60 597d 2c 4e16 754c 21
Categories: Lu Ll Ll Ll Ll Po Zs Ll Ll Ll Ll Ll Po Zs Lo Lo Po Lo Lo Po
