## Characters

In [2]:
# Encoding and decoding

s = "café"
print(len(s))
b = s.encode("utf8")
print(b)
print(len(b))
b.decode("utf8")

4
b'caf\xc3\xa9'
5


'café'

## Bytes

In [3]:
# bytes and bytearray

cafe = "café".encode("utf8")  # immutable bytes
cafe_arr = bytearray(cafe)  # mutable copy of the same byte values
# Indexing bytes yields an int (99 is ord("c")); slicing yields bytes again,
# even for a slice of length one.
print(cafe, cafe[0], cafe[:1], cafe_arr, sep="\n")
cafe_arr[-1:]  # a slice of a bytearray is itself a bytearray

b'caf\xc3\xa9'
99
b'c'
bytearray(b'caf\xc3\xa9')


bytearray(b'\xa9')

In [4]:
# Initializing bytes from the raw data of an array

import array

# Typecode "h" stores each item as a signed short.
numbers = array.array("h", range(-2, 3))
# Copy the array's underlying machine representation into an immutable bytes
# object; the byte order therefore follows the platform.
octets = numbers.tobytes()
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

### Structs and memory views

In [None]:
# Using memoryview and struct to inspect a GIF image header

import struct

# "<" = little-endian, "3s3s" = two 3-byte strings, "HH" = two unsigned
# 16-bit integers: 10 bytes in total.
fmt = "<3s3sHH"
with open("filter.gif", "rb") as fp:
    img = memoryview(fp.read())
# Slicing a memoryview yields another memoryview over the same buffer,
# so taking the first 10 bytes does not copy the image data.
header = img[:10]
print(bytes(header), struct.unpack(fmt, header), sep="\n")
# Drop both views explicitly once the header has been inspected.
del header, img

### Basic Encoders/decoders

In [2]:
# Same string encoded with three different codecs

# One output line per codec: the codec name, a tab, then the encoded bytes.
for codec in ("latin_1", "utf_8", "utf_16"):
    print(f'{codec}\t{"El Nin~o".encode(codec)}')

latin_1	b'El Nin~o'
utf_8	b'El Nin~o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00n\x00~\x00o\x00'


Understanding Encode/Decode problems

### Handling text bytes
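Use the "Unicode sandwich": decode bytes to `str` as early as possible on input, work only with `str` inside the program, and encode back to bytes as late as possible on output.

Never rely on the default encoding: always pass an explicit `encoding=` argument when opening text files.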

In [1]:
# Never rely on default encoders
# Write with an explicit codec. The context manager guarantees the data is
# flushed and the handle closed before the file is read back.
with open("cafe.txt", "w", encoding="utf_8") as fp:
    fp.write("café")

# Read back WITHOUT an encoding argument on purpose: the platform default is
# used, so where that default is not UTF-8 (e.g. cp1252) the text comes back
# garbled.
with open("cafe.txt") as fp:
    default_decoded = fp.read()
default_decoded

'cafÃ©'

In [2]:
# Compare reading the same file with the default encoding, an explicit
# encoding, and in binary mode. Every handle is closed by its `with` block.
import os

# Write with an explicit codec so the bytes on disk are platform-independent.
with open("cafe.txt", "w", encoding="utf_8") as fp:
    print(fp)
    fp.write("café")

# Four characters but five bytes on disk: UTF-8 encodes "é" as two bytes.
assert os.stat("cafe.txt").st_size == 5

# No encoding argument: the platform default is used, which may not match the
# codec used for writing and then yields garbled text.
with open("cafe.txt") as fp2:
    print(fp2)
    print(fp2.encoding)
    print(fp2.read())

# Same codec as used for writing: the text round-trips correctly.
with open("cafe.txt", encoding="utf_8") as fp3:
    print(fp3.read())

# Binary mode performs no decoding and returns the raw bytes.
with open("cafe.txt", "rb") as fp4:
    print(fp4)
    raw_bytes = fp4.read()
raw_bytes

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>
<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>
cp1252
cafÃ©
café
<_io.BufferedReader name='cafe.txt'>


b'caf\xc3\xa9'

Default encodings: do not rely on them

In [3]:
# Print the encoding-related defaults of the current environment.
import locale
import sys

# One expression per line; each is evaluated and printed next to its value.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# Open a scratch text file without an encoding argument so the table shows
# which encoding open() picks by default; `with` closes the handle once all
# expressions have been evaluated.
with open("dummy", "w") as my_file:
    for expression in expressions.split():
        # eval() is acceptable here only because the expressions are the
        # hard-coded literals above, never external input.
        value = eval(expression)
        print(expression.rjust(30), "->", repr(value))

 locale.getpreferredencoding() -> 'cp1252'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp1252'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'utf-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


#### Normalizing unicode

In [5]:
# The same unicode can be represented in different ways

s1 = 'café'        # "é" typed directly
s2 = 'caf\u00E9'   # "é" written as an escape sequence
# Show both strings, then both lengths, as tuples.
print((s1, s2), tuple(map(len, (s1, s2))), sep="\n")
s1 == s2

('café', 'café')
(4, 4)


True

In [1]:
# Normalizing

from unicodedata import name, normalize

ohm = '\u2126'
ohm_c = normalize('NFC', ohm)  # NFC form of the same character
# Character name before and after normalization, then whether the raw strings
# compare equal.
print(name(ohm), name(ohm_c), ohm == ohm_c, sep="\n")
# Once both sides are normalized, the comparison succeeds.
normalize('NFC', ohm) == normalize('NFC', ohm_c)

OHM SIGN
GREEK CAPITAL LETTER OMEGA
False


True