In [3]:
import unicodedata
s1 = 'café'  # presumably 'é' is a single precomposed code point here (the comparison with s2 is False) -- verify
s2 = 'cafe\u0301'  # plain 'e' followed by U+0301, written as an explicit escape

In [2]:
# The two strings are built from different code point sequences (s2 spells
# its accent as a separate U+0301), so plain equality reports False.
s1 == s2

False

In [4]:
# http://itnan.ru/post.php?c=1&p=579868

In [8]:
# BEGIN SHAVE_MARKS
import unicodedata
import string


def shave_marks(txt):
    """Strip every diacritic (combining mark) from ``txt``.

    The text is decomposed, all combining characters are dropped, and the
    remainder is recomposed before being returned.
    """
    # NFD splits accented characters into base char + combining mark(s).
    decomposed = unicodedata.normalize('NFD', txt)
    # combining() is 0 for non-combining characters; keep only those.
    base_chars = [ch for ch in decomposed
                  if unicodedata.combining(ch) == 0]
    # Recompose whatever can still be composed once the marks are gone.
    return unicodedata.normalize('NFC', ''.join(base_chars))
# END SHAVE_MARKS

# BEGIN SHAVE_MARKS_LATIN
def shave_marks_latin(txt):
    """Strip diacritics only where they follow a Latin base character.

    Combining marks attached to any other base character (for example a
    Greek letter) are preserved.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    after_latin = False  # is the most recent base char an ASCII letter?
    kept = []
    for ch in decomposed:
        if not unicodedata.combining(ch):
            # A non-combining char is a new base: keep it and remember
            # whether the marks that follow it should be dropped.
            after_latin = ch in string.ascii_letters
            kept.append(ch)
        elif not after_latin:
            # Combining mark on a non-Latin base: keep it.
            kept.append(ch)
        # Otherwise: combining mark on a Latin base, so it is discarded.
    return unicodedata.normalize('NFC', ''.join(kept))
# END SHAVE_MARKS_LATIN

# BEGIN ASCIIZE
# One-to-one table: the two literals are aligned position by position, so
# each symbol in the first maps to the ASCII char at the same index in the
# second (code point -> code point).
single_map = {
    ord(symbol): ord(replacement)
    for symbol, replacement in zip("""‚ƒ„†ˆ‹‘’“”•–—˜›""",
                                   """'f"*^<''""---~>""")
}

# One-to-many table (code point -> replacement string), merged with the
# one-to-one entries so a single translate() call applies both.
multi_map = {
    **str.maketrans({
        '€': '<euro>',
        '…': '...',
        'Œ': 'OE',
        '™': '(TM)',
        'œ': 'oe',
        '‰': '<per mille>',
        '‡': '**',
    }),
    **single_map,
}


def dewinize(txt):
    """Swap Win1252 symbols in ``txt`` for ASCII characters or sequences."""
    # multi_map carries both the one-to-one and the one-to-many entries.
    translated = txt.translate(multi_map)
    return translated


def asciize(txt):
    """Fold ``txt`` toward ASCII in four steps.

    Win1252 symbols are replaced, diacritics on Latin base characters are
    removed, 'ß' is expanded to 'ss', and the result is NFKC-normalized.
    """
    folded = dewinize(txt)              # symbol replacement first
    folded = shave_marks_latin(folded)  # then drop marks on Latin letters
    folded = folded.replace('ß', 'ss')  # eszett has no decomposition to shave
    # NFKC swaps compatibility characters for their preferred equivalents.
    return unicodedata.normalize('NFKC', folded)
# END ASCIIZE

In [9]:
"""
Radical folding and text sanitizing.
Handling a string with `cp1252` symbols:
    >>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
    >>> shave_marks(order)
    '“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
    >>> shave_marks_latin(order)
    '“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
    >>> dewinize(order)
    '"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
    >>> asciize(order)
    '"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'
Handling a string with Greek and Latin accented characters:
    >>> greek = 'Ζέφυρος, Zéfiro'
    >>> shave_marks(greek)
    'Ζεφυρος, Zefiro'
    >>> shave_marks_latin(greek)
    'Ζέφυρος, Zefiro'
    >>> dewinize(greek)
    'Ζέφυρος, Zéfiro'
    >>> asciize(greek)
    'Ζέφυρος, Zefiro'
"""

'\nRadical folding and text sanitizing.\nHandling a string with `cp1252` symbols:\n    >>> order = \'“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”\'\n    >>> shave_marks(order)\n    \'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”\'\n    >>> shave_marks_latin(order)\n    \'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”\'\n    >>> dewinize(order)\n    \'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."\'\n    >>> asciize(order)\n    \'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."\'\nHandling a string with Greek and Latin accented characters:\n    >>> greek = \'Ζέφυρος, Zéfiro\'\n    >>> shave_marks(greek)\n    \'Ζεφυρος, Zefiro\'\n    >>> shave_marks_latin(greek)\n    \'Ζέφυρος, Zefiro\'\n    >>> dewinize(greek)\n    \'Ζέφυρος, Zéfiro\'\n    >>> asciize(greek)\n    \'Ζέφυρος, Zefiro\'\n'