Character issues, Unicode

In [1]:
s = 'cafe'
len(s)

4

In [2]:
b = s.encode('utf-8')
b

b'cafe'

In [3]:
len(b)

4

In [4]:
b.decode('utf-8')

'cafe'

In [21]:
cafe = bytes('cafe', encoding='utf-8')
cafe

b'cafe'

In [8]:
cafe[0]

99

In [14]:
cafe[2]

102

bytes: immutable  
bytearray: mutable

In [20]:
bytes.fromhex('63616665')

b'cafe'

In [19]:
b'cafe'.hex()

'63616665'

In [22]:
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'cafe')

In [23]:
cafe_arr[-1:]

bytearray(b'e')

In [24]:
cafe_arr[1]

97

In [25]:
import array
# Type code 'h' stores signed short integers; the raw bytes use the machine's native byte order.
numbers = array.array('h', range(-2, 3))
octets = numbers.tobytes()
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

In [26]:
import struct
fmt = '<3s3sHH'   # '<' little-endian; '3s3s' two 3-byte strings; 'HH' two unsigned 16-bit integers
with open('wizard.gif', 'rb') as fp:
    # Wrap the whole file in a memoryview so the slice below shares the buffer instead of copying it.
    img = memoryview(fp.read())  

# First 10 bytes of the file, matching the 3 + 3 + 2 + 2 bytes described by `fmt`.
header  = img[:10]
bytes(header)  # converted to bytes only so the cell displays the raw header

b'GIF89a\x90\x01^\x01'

In [27]:
struct.unpack(fmt, header)

(b'GIF', b'89a', 400, 350)

In [28]:
del header, img

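Basic encoders/decoders. Encoding problems surface as UnicodeEncodeError (str → bytes), UnicodeDecodeError (bytes → str), or SyntaxError (a source file in an unexpected encoding); the first two are subclasses of UnicodeError.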

In [29]:
# Encode the same text with three codecs to compare the resulting byte sequences.
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = 'El Niño'.encode(codec)
    print(f'{codec}\t{encoded}')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [30]:
city = 'São Paulo'
city.encode('utf_8')

b'S\xc3\xa3o Paulo'

In [31]:
city.encode('utf-16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [32]:
# Strict encoding to cp437 fails on this text. Catch the error and print its
# message so the demonstration is kept without stopping a Restart & Run All.
try:
    city.encode('cp437')
except UnicodeEncodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [33]:
city.encode('cp437', errors='ignore')

b'So Paulo'

In [34]:
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [35]:
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [36]:
# These bytes are not valid UTF-8. Catch the error and print its message so the
# demonstration is kept without stopping a Restart & Run All.
try:
    octets.decode('utf_8')
except UnicodeDecodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [37]:
octets.decode('utf-8', errors='replace')

'Montr�al'

Handle text files

In [39]:
# Write inside a `with` block so the file is flushed and closed before any later
# cell reads it; the last line still displays the number of characters written.
with open('cafe.txt', 'w', encoding='utf_8') as out_file:
    chars_written = out_file.write('café')
chars_written

4

In [40]:
# No `encoding` argument is passed, so the platform default encoding is used to
# decode the file. The `with` block closes the file as soon as it has been read.
with open('cafe.txt') as in_file:
    text = in_file.read()
text

'café'

In [41]:
# Open without an `encoding` argument to inspect the default chosen by open().
# NOTE(review): mode 'w' truncates cafe.txt, erasing the text written earlier;
# read mode would be enough to inspect the encoding — confirm 'w' is intended.
fp = open('cafe.txt', 'w')

In [42]:
fp

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='UTF-8'>

In [43]:
fp.close()

In [44]:
fp.encoding

'UTF-8'

In [54]:
#Encoding defaults: a madhouse
#default_encodings.py 
import sys
import locale

# One expression per line; each reports an encoding default or whether a
# standard stream is attached to a terminal.
expressions = """
    locale.getpreferredencoding()
    my_file.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
"""

# `my_file` has to exist before `my_file.encoding` is evaluated, otherwise the
# cell raises NameError on a fresh kernel. It is opened in text mode without an
# `encoding` argument so it reports the default picked by open(). This creates a
# scratch file named 'dummy' in the working directory; the `with` block closes it.
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() is acceptable here only because every expression is a
        # hard-coded literal from the string above, never external input.
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))
    

 locale.getpreferredencoding() -> 'UTF-8'
              my_file.encoding -> 'UTF-8'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'UTF-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


Normalizing Unicode

In [47]:
s1 = 'café'
s2 = 'cafe\u0301'
s1, s2

('café', 'café')

In [48]:
len(s1), len(s2)

(4, 5)

In [49]:
s1 == s2

False

In [50]:
from unicodedata import normalize
len(normalize('NFC', s1)), len(normalize('NFC', s2))

(4, 4)

In [51]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)

In [52]:
normalize('NFC', s1) == normalize('NFC', s2) 

True

In [53]:
normalize('NFD', s1) == normalize('NFD', s2) 

True

In [55]:
from unicodedata import normalize, name
ohm = '\u2126'
name(ohm)

'OHM SIGN'

In [57]:
ohm_c = normalize('NFC', ohm)
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

In [58]:
ohm == ohm_c

False

In [59]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)

True

In [61]:
from unicodedata import normalize

def nfc_equal(str1, str2):
    """Return True when both strings are equal after NFC normalization."""
    composed1, composed2 = (normalize('NFC', text) for text in (str1, str2))
    return composed1 == composed2

def fold_equal(str1, str2):
    """Return True when both strings are equal after NFC normalization and case folding."""
    def folded(text):
        # Normalize first, then case-fold, exactly in that order.
        return normalize('NFC', text).casefold()

    return folded(str1) == folded(str2)

s1 = 'café'
s2 = 'cafe\u0301'
nfc_equal(s1, s2)

True

In [2]:
# sanitize.py
import unicodedata
import string

def shave_marks(txt):
    """Strip every combining mark (diacritic) from `txt`.

    The text is decomposed with NFD so that marks become separate characters,
    the combining characters are dropped, and the remainder is recomposed
    with NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters.

    A base character counts as Latin when it is an ASCII letter
    (`string.ascii_letters`). Combining marks that follow any other base
    character, such as a Greek letter, are kept.

    Args:
        txt: the string to process.

    Returns:
        A new NFC-normalized string with the marks on Latin base characters
        removed.
    """
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue  # drop a mark attached to a Latin base character
        keepers.append(c)
        # A non-combining character starts a new base; remember whether it is
        # an ASCII letter so the marks that follow it can be dropped.
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)

order = '“Herr Voß: • 1⁄2 cup of ŒtkerTM caffè latte • bowl of açaí.”'
shave_marks(order)

'“Herr Voß: • 1⁄2 cup of ŒtkerTM caffe latte • bowl of acai.”'

In [6]:
# Print a completion marker.
print('done')

done
