# <center>流畅的Python</center>
## 第四章：文本和字节序列
**示例4-1 编码和解码**

In [1]:
# Round-trip a str through UTF-8: text -> bytes -> text.
s = 'café'
b = s.encode('utf8')
c = b.decode('utf8')

# One value per line: the character count, the encoded bytes, the byte count
# (the 'é' takes two bytes in UTF-8) and finally the decoded text.
print(len(s), b, len(b), c, sep='\n')

4
b'caf\xc3\xa9'
5
café


**示例4-2 包含五个字节的bytes和bytearray对象**

In [2]:
# Build an immutable bytes object from text plus an explicit encoding.
cafe = bytes('café', encoding='utf8')
first_item = cafe[0]     # indexing bytes yields an int
first_slice = cafe[:1]   # slicing bytes yields bytes, even for length 1
print(cafe, first_item, first_slice, sep='\n')

# A bytearray is the mutable counterpart; slicing it yields a bytearray.
cafe_arr = bytearray(cafe)
last_slice = cafe_arr[-1:]
print(cafe_arr, last_slice, sep='\n')

b'caf\xc3\xa9'
99
b'c'
bytearray(b'caf\xc3\xa9')
bytearray(b'\xa9')


**示例4-3 使用数组中的原始数据初始化bytes对象**

In [3]:
from array import array

# Typecode 'h' stores each value as a signed short integer.
numbers = array('h', range(-2, 3))

# bytes() copies the raw memory of the array, so the result holds the machine
# representation of the integers (its byte order depends on the platform).
octets = bytes(numbers)
print(octets)

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'


**示例4-5 使用三个编解码器编码**

In [5]:
# Encode the same text with three codecs and show the resulting byte sequences.
for codec in ('latin_1', 'utf_8', 'utf_16'):
    encoded = 'El Niño'.encode(codec)
    print(codec, encoded, sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


**示例4-6 编码成字节序列**

In [8]:
# Encode one string with several codecs, then show how the `errors` argument
# changes what happens when a codec cannot represent a character.
city = 'São Paulo'
# These three codecs can all represent 'ã'.
print(city.encode('utf-8'))
print(city.encode('utf-16'))
print(city.encode('iso8859-1'))

print('\n')
# cp437 cannot encode 'ã'; each handler below deals with that differently.
# 'ignore' silently drops the character.
print(city.encode('cp437', errors='ignore'))
# 'replace' substitutes '?'.
print(city.encode('cp437', errors='replace'))
# 'xmlcharrefreplace' substitutes an XML numeric character reference.
print(city.encode('cp437', errors='xmlcharrefreplace'))
# No handler given: this call raises UnicodeEncodeError, which is the
# intended end of the demonstration.
print(city.encode('cp437'))

b'S\xc3\xa3o Paulo'
b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'
b'S\xe3o Paulo'


b'So Paulo'
b'S?o Paulo'
b'S&#227;o Paulo'


UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

**示例4-7 把字节序列解码成字符串**

In [9]:
# Decode the same bytes with several codecs: a wrong codec either yields the
# wrong character silently or fails, depending on the codec and error handler.
octets = b'Montr\xe9al'
# Byte 0xE9 maps to a different character in each of these single-byte codecs.
print(octets.decode('cp1252'))
print(octets.decode('iso8859_7'))
print(octets.decode('koi8_r'))
# The bytes are not valid UTF-8; 'replace' substitutes U+FFFD for the bad byte.
print(octets.decode('utf-8', errors='replace'))
# No handler given: this call raises UnicodeDecodeError, which is the
# intended end of the demonstration.
print(octets.decode('utf-8'))

Montréal
Montrιal
MontrИal
Montr�al


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

**示例4-8 葡萄牙语版 你好世界**

In [14]:
# A non-ASCII literal in the source file, printed unchanged.
greeting = 'Olá, Mundo!'
print(greeting)

Olá, Mundo!


**示例4-9 一个平台上的编码问题**

In [15]:
# Write the text with an explicit encoding. The with-block closes the file,
# which guarantees the data is flushed to disk before the file is reopened
# (the original relied on the temporary file object being finalised).
with open('cafe.txt', 'w', encoding='utf-8') as out_file:
    out_file.write('café')

# Deliberately reopened WITHOUT an encoding argument: the text is decoded with
# the platform's default encoding, which is the point of this example. On a
# platform whose default is not UTF-8 the result is not 'café'.
with open('cafe.txt') as in_file:
    text = in_file.read()

# Bare expression so the notebook still displays the value that was read.
text

'café'

**示例4-10 仔细分析编码问题**

In [16]:
import os

# Write four characters as UTF-8; write() reports the number of characters.
# Every file below is opened in a with-block so that it is always closed
# (the original left fp2, fp3 and fp4 open).
with open('cafe.txt', 'w', encoding='utf-8') as fp:
    print(fp)
    print(fp.write('café'))

print('\n')
# The size on disk is counted in bytes, not characters.
print(os.stat('cafe.txt').st_size)
# No encoding argument: the platform default encoding is used for decoding.
with open('cafe.txt') as fp2:
    print(fp2)
    print(fp2.encoding)
    print(fp2.read())

print('\n')
# Decoding the UTF-8 bytes as cp1252 does not fail; it silently produces
# the wrong characters.
with open('cafe.txt', encoding='cp1252') as fp3:
    print(fp3)
    print(fp3.read())

print('\n')
# Binary mode returns the raw bytes with no decoding at all.
with open('cafe.txt', 'rb') as fp4:
    print(fp4)
    print(fp4.read())

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf-8'>
4


5
<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='UTF-8'>
UTF-8
café


<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>
cafÃ©


<_io.BufferedReader name='cafe.txt'>
b'caf\xc3\xa9'


**示例4-11 探索编码默认值**

In [19]:
import locale
import sys

# One expression per line; each is evaluated and printed next to its value.
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    my_file.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
"""

# The file is opened without an encoding argument so that my_file.encoding
# reports the default chosen by open(). The with-block closes it when the
# survey is done (the original never closed it).
with open('dummy', 'w') as my_file:
    for exp in expressions.split():
        # eval() is acceptable here only because `expressions` is a hard-coded
        # literal; never do this with text that comes from outside the program.
        value = eval(exp)
        print(exp.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'UTF-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'UTF-8'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'UTF-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


**café的构成方式**

In [20]:
# s1 uses a single precomposed character for the accented letter; s2 spells it
# as a plain 'e' followed by U+0301 (a combining mark).
s1, s2 = 'café', 'cafe\u0301'
print((s1, s2))

# The two strings have different lengths and therefore compare as unequal.
print(*(len(text) for text in (s1, s2)))
print(s1 == s2)

('café', 'café')
4 5
False


**两种规范化**

In [21]:
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\u0301'
print((len(s1), len(s2)))

# NFC composes to the shortest equivalent string; NFD decomposes into base
# characters plus combining marks. Either form makes the two strings equal.
nfc1, nfc2 = normalize('NFC', s1), normalize('NFC', s2)
nfd1, nfd2 = normalize('NFD', s1), normalize('NFD', s2)
print((len(nfc1), len(nfc2)))
print((len(nfd1), len(nfd2)))
print(nfc1 == nfc2)
print(nfd1 == nfd2)

(4, 5)
(4, 4)
(5, 5)
True
True


**NFKC的具体应用**

In [23]:
from unicodedata import normalize, name

# NFKC replaces compatibility characters with their preferred equivalents,
# which can change both the code point and the character's name.
symbol1 = '℀'
compat1 = normalize('NFKC', symbol1)
print((symbol1, compat1))

symbol2 = 'µ'
compat2 = normalize('NFKC', symbol2)
print((symbol2, compat2))
print((ord(symbol2), ord(compat2)))
print((name(symbol2), name(compat2)))

('℀', 'a/c')
('µ', 'μ')
(181, 956)
('MICRO SIGN', 'GREEK SMALL LETTER MU')


**大小写折叠**

In [27]:
# casefold() maps the micro sign to a different character with its own name.
micro = 'µ'
micro_cf = micro.casefold()
print(name(micro), name(micro_cf), (micro, micro_cf), sep='\n')

print('\n')
# casefold() may also change the length: the sharp s becomes two letters.
eszett = 'ß'
eszett_cf = eszett.casefold()
print(name(eszett), (eszett, eszett_cf), sep='\n')

MICRO SIGN
GREEK SMALL LETTER MU
('µ', 'μ')


LATIN SMALL LETTER SHARP S
('ß', 'ss')


**示例4-13 比较规范化Unicode字符串**

In [28]:
from unicodedata import normalize

def nfc_equal(str1, str2):
    """Return True if the two strings are equal once both are NFC-normalized.

    The comparison is case-sensitive.
    """
    nfc1, nfc2 = (normalize('NFC', text) for text in (str1, str2))
    return nfc1 == nfc2

def fold_equal(str1, str2):
    """Return True if the two strings match after NFC normalization and case folding.

    Each argument is NFC-normalized first and then case-folded, so the
    comparison ignores both composition differences and letter case.
    """
    folded1 = normalize('NFC', str1).casefold()
    folded2 = normalize('NFC', str2).casefold()
    return folded1 == folded2

# Same text, composed versus decomposed: unequal as-is, equal after NFC.
s1, s2 = 'café', 'cafe\u0301'
print((s1 == s2, nfc_equal(s1, s2)))
# NFC comparison is still case-sensitive.
print(nfc_equal('A', 'a'), '\n')

# These two only match once case folding is applied as well.
s3, s4 = 'Straße', 'strasse'
print((s3 == s4, nfc_equal(s3, s4)))
folded_results = (fold_equal(s1, s2), fold_equal(s3, s4))
print(folded_results)
print(fold_equal('A', 'a'))

(False, True)
False 

(False, False)
(True, True)
True


**示例4-14 去掉全部组合记号的函数**

In [35]:
import unicodedata
import string

def shave_marks(txt):
    """Remove all diacritic marks from ``txt``.

    The text is decomposed (NFD) so that every mark becomes a separate
    combining character, the combining characters are discarded, and the
    remaining characters are recomposed (NFC) before being returned.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    # combining() returns 0 for characters that are not combining marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

**示例4-15 使用示例**

In [36]:
# Sample inputs: accented letters lose their marks, while symbols that carry
# no combining mark are left as they are.
order = '“Herr Voß: · ℀ cup of Đtker caffè latte · bowl of açai”'
greek = 'Zℇü'

for sample in (order, greek):
    print(shave_marks(sample))

“Herr Voß: · ℀ cup of Đtker caffe latte · bowl of acai”
Zℇu
