# Unicode

In [1]:
import sys

sys.getdefaultencoding()

'utf-8'

In [4]:
# Round-trip between a character and its code point, then render that same
# code point (97) as hexadecimal, octal and binary text.
ord('a'), chr(97), hex(97), oct(97), bin(97)

(97, 'a', '0x61', '0o141', '0b1100001')

In [5]:
chr(196)

'Ä'

In [11]:
# Pure-ASCII text yields the same bytes under both single-byte codecs.
S = 'spam'
tuple(S.encode(codec) for codec in ('ascii', 'latin-1'))

(b'spam', b'spam')

In [12]:
# UTF-8 stores each of these ASCII characters in one byte: 4 chars -> 4 bytes.
S.encode('utf-8'), len(S.encode('utf-8'))

(b'spam', 4)

In [13]:
# The generic 'utf-16' codec prepends a 2-byte BOM (\xff\xfe in the output
# below) and then uses 2 bytes per character here: 2 + 4 * 2 = 10.
S.encode('utf-16'), len(S.encode('utf-16'))

(b'\xff\xfes\x00p\x00a\x00m\x00', 10)

In [14]:
# The generic 'utf-32' codec prepends a 4-byte BOM and then uses 4 bytes per
# character: 4 + 4 * 4 = 20.
S.encode('utf-32'), len(S.encode('utf-32'))

(b'\xff\xfe\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00', 20)

In [17]:
import encodings

help(encodings)

Help on package encodings:

NAME
    encodings - Standard "encodings" Package

MODULE REFERENCE
    https://docs.python.org/3.11/library/encodings.html
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
        Standard Python encoding modules are stored in this package
        directory.
    
        Codec modules must have names corresponding to normalized encoding
        names as defined in the normalize_encoding() function below, e.g.
        'utf-8' must be implemented by the module 'utf_8.py'.
    
        Each codec module must export the following interface:
    
        * getregentry() -> codecs.CodecInfo object
        The getregentry() API must return a CodecInfo object with encoder, decoder,

## 字符串

In [18]:
B = b'spam'
S = 'eggs'

In [19]:
type(B), type(S)

(bytes, str)

In [20]:
B, S

(b'spam', 'eggs')

In [22]:
# Indexing a bytes object yields an int; slicing yields another bytes object.
B[0], B[1], B[2:]

(115, 112, b'am')

In [28]:
# Iterating bytes produces ints, iterating str produces 1-character strings.
list(B), list(S)

([115, 112, 97, 109], ['e', 'g', 'g', 's'])

In [25]:
B.upper()

b'SPAM'

In [29]:
B = b"""
xxx
yyy
"""

In [30]:
B

b'\nxxx\nyyy\n'

In [31]:
B.upper()

b'\nXXX\nYYY\n'

In [32]:
# The u'' prefix is still accepted in Python 3 and produces an ordinary str.
U = u'spam'
type(U)

str

• `str.encode()` 和 `bytes(S, encoding)` 把一个字符串转换为其原始字节形式，并且在此过程中根据一个解码的 str 创建一个编码的 bytes。

• `bytes.decode()` 和 `str(B, encoding)` 把原始字节转换为其字符串形式，并且在此过程中根据一个编码的 bytes 创建一个解码的 str。

In [38]:
# ASCII-only text encodes identically under ASCII, Latin-1 and UTF-8.
S = 'eggs'
tuple(S.encode(codec) for codec in ('ascii', 'latin-1', 'utf-8'))

(b'eggs', b'eggs', b'eggs')

In [40]:
bytes(S, encoding='ascii')

b'eggs'

In [41]:
# ASCII-only bytes decode to the same text under ASCII, Latin-1 and UTF-8.
B = b'eggs'
tuple(B.decode(codec) for codec in ('ascii', 'latin-1', 'utf-8'))

('eggs', 'eggs', 'eggs')

In [43]:
str(B, encoding='ascii')

'eggs'

In [44]:
bin(15)

'0b1111'

In [52]:
chr(0xf1)

'ñ'

In [53]:
# In a str literal, \xNN names a code point (here U+00C4 and U+00F1), not a
# raw byte.
S = '\xc4\xf1'

In [58]:
S

'Äñ'

In [59]:
# The failure is the point of this cell: non-ASCII characters cannot be
# encoded as ASCII. Catch the error and print its message so that
# "Restart & Run All" is not halted by the traceback.
try:
    S.encode('ascii')
except UnicodeEncodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)

In [62]:
# Latin-1 stores each of these two characters in a single byte.
S.encode('latin-1'), len(S.encode('latin-1'))

(b'\xc4\xf1', 2)

In [63]:
# UTF-8 needs two bytes for each of these non-ASCII characters: 2 chars -> 4 bytes.
S.encode('utf-8'), len(S.encode('utf-8'))

(b'\xc3\x84\xc3\xb1', 4)

In [56]:
U = '\u00c4\u00f1'

In [57]:
U

'Äñ'

In [87]:
# \uXXXX takes 4 hex digits and \UXXXXXXXX takes 8; each escape is a single
# character, so this string has 5 characters in total.
S = 'A\u00c4B\U000000e8C'

In [88]:
len(S)

5

In [102]:
S.encode('latin1'), len(S.encode('latin1'))

(b'A\xc4B\xe8C', 5)

In [103]:
S.encode('utf8'), len(S.encode('utf8'))

(b'A\xc3\x84B\xc3\xa8C', 7)

In [90]:
# 2-byte BOM plus 2 bytes for each of the 5 characters: 2 + 5 * 2 = 12.
S.encode('utf-16'), len(S.encode('utf-16'))

(b'\xff\xfeA\x00\xc4\x00B\x00\xe8\x00C\x00', 12)

In [91]:
bytes(S, encoding='utf-8')

b'A\xc3\x84B\xc3\xa8C'

In [92]:
# \u and \U escapes are not interpreted inside a bytes literal: the
# backslash sequences are kept as literal characters (see the doubled
# backslashes in the repr below).
B = b'A\u00c4B\U000000e8C'
B

b'A\\u00c4B\\U000000e8C'

In [100]:
B.decode('latin1')

'A\\u00c4B\\U000000e8C'

In [101]:
B = b'A\xc3\x84B\xc3\xa8C'
B.decode('utf-8')

'AÄBèC'

In [105]:
# Formatting bytes into an f-string inserts its repr text (b'...'); it does
# not decode the bytes.
f'{B}'

"b'A\\xc3\\x84B\\xc3\\xa8C'"

## bytes

In [106]:
B = b'abc'
B

b'abc'

In [108]:
B = bytes('abc', encoding='ascii')
B

b'abc'

In [110]:
# bytes() accepts any iterable of small integers; these are the code points
# of 'a', 'b' and 'c'.
B = bytes(ord(ch) for ch in 'abc')
B

b'abc'

In [112]:
B = 'abc'.encode()
B

b'abc'

In [114]:
S = B.decode()
S

'abc'

In [117]:
B = b'spam'
# bytes methods reject str arguments. The error is the lesson here, so catch
# it and print the message instead of halting "Restart & Run All".
try:
    B.replace('a', 'b')
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: a bytes-like object is required, not 'str'

In [118]:
B.replace(b'a', b'b')

b'spbm'

In [121]:
B = b'spam'
# Encode both the search pattern and the replacement to bytes before calling
# bytes.replace().
old, new = bytes('pa', 'utf8'), bytes('b', 'utf8')
B.replace(old, new)

b'sbm'

In [122]:
b'ab' + b'ac'

b'abac'

In [123]:
# Decode the bytes first so that both operands of + are str.
b'ab'.decode() + 'ac'

'abac'

In [124]:
b'ab' + bytes('ac', 'utf8')

b'abac'

## bytearray

In [127]:
S = 'spam'
# bytearray() cannot be built from a str unless an encoding is given. The
# error is the lesson, so catch it and print the message instead of halting
# "Restart & Run All".
try:
    C = bytearray(S)
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: string argument without an encoding

In [128]:
# Supplying an encoding lets bytearray() build itself from a str.
S = 'spam'
C = bytearray(S, encoding='ascii')
C

bytearray(b'spam')

In [131]:
D = bytearray(b'abc')

In [132]:
D

bytearray(b'abc')

In [135]:
# As with bytes: indexing yields an int, slicing yields another bytearray.
C[1], C[2:], C[-1]

(112, bytearray(b'am'), 109)

In [134]:
list(C)

[115, 112, 97, 109]

In [136]:
# Item assignment on a bytearray needs an int, not a str. Catch the expected
# error and print it so "Restart & Run All" keeps going.
try:
    C[0] = 'a'
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: 'str' object cannot be interpreted as an integer

In [138]:
# A length-1 bytes object is still not an int, so this fails as well. Catch
# the expected error and print it so "Restart & Run All" keeps going.
try:
    C[0] = b'a'
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: 'bytes' object cannot be interpreted as an integer

In [139]:
# Item assignment takes an integer; ord() supplies the byte value of 'a'.
C[0] = ord('a')

In [140]:
C

bytearray(b'apam')

In [141]:
# Indexing a bytes literal yields the int that item assignment requires.
C[1] = b'Y'[0]

In [142]:
C

bytearray(b'aYam')

In [143]:
# append() takes a single int, not a str. Catch the expected error and print
# it so "Restart & Run All" keeps going.
try:
    C.append('abc')
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: 'str' object cannot be interpreted as an integer

In [145]:
# append() takes a single int, not a bytes object. Catch the expected error
# and print it so "Restart & Run All" keeps going.
try:
    C.append(b'A')
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: 'bytes' object cannot be interpreted as an integer

In [146]:
C.append(ord('A'))

In [147]:
C

bytearray(b'aYamA')

In [148]:
# extend() appends every byte of a bytes-like object, unlike append(),
# which takes one int.
C.extend(b'abc')

In [149]:
C

bytearray(b'aYamAabc')

In [150]:
# A bytearray cannot be concatenated with a str. Catch the expected error and
# print it so "Restart & Run All" keeps going.
try:
    C + 'ABC'
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: can't concat str to bytearray

In [151]:
# Concatenating with bytes works; the result shown below is a bytearray.
C + b'ABC'

bytearray(b'aYamAabcABC')

# 文件写入

In [165]:
# Binary mode writes the bytes verbatim; write() returns the byte count.
# The with-block closes (and flushes) the file before later cells read it.
with open('temp', 'wb') as fh:
    n_written = fh.write(b'abc\n\x1c')
n_written

5

In [166]:
# Text-mode read using the platform default encoding. The with-block closes
# the handle instead of leaving that to the garbage collector.
with open('temp', 'r') as fh:
    text = fh.read()
text

'abc\n\x1c'

In [167]:
# Binary-mode read returns the raw bytes; the handle is closed on exit.
with open('temp', 'rb') as fh:
    raw = fh.read()
raw

b'abc\n\x1c'

In [168]:
# Text mode encodes the str on the way out; write() returns the number of
# characters written. The with-block closes (and flushes) the file before
# later cells read it.
with open('temp', 'w') as fh:
    n_written = fh.write('abc\n\x1c')
n_written

5

In [169]:
# Read the text-mode file back as text; the handle is closed on exit.
with open('temp', 'r') as fh:
    text = fh.read()
text

'abc\n\x1c'

In [171]:
# Read the text-mode file back as raw bytes; the handle is closed on exit.
with open('temp', 'rb') as fh:
    raw = fh.read()
raw

b'abc\n\x1c'

## 文件

In [182]:
# Write the same text under two encodings. Each with-block closes (and
# flushes) its file so the reads in the following cells see complete data.
S = 'spamåø'
with open('latindata', 'w', encoding='latin-1') as fh:
    fh.write(S)
with open('utfdata', 'w', encoding='utf-8') as fh:
    n_written = fh.write(S)
# write() reports characters written, not encoded bytes.
n_written

6

In [183]:
# Raw bytes of the Latin-1 file: one byte per character. Handle closed on exit.
with open('latindata', 'rb') as fh:
    raw = fh.read()
raw

b'spam\xe5\xf8'

In [184]:
# Raw bytes of the UTF-8 file: the two non-ASCII characters take two bytes
# each. Handle closed on exit.
with open('utfdata', 'rb') as fh:
    raw = fh.read()
raw

b'spam\xc3\xa5\xc3\xb8'

In [186]:
# Decode with the same codec the file was written with. Handle closed on exit.
with open('latindata', 'r', encoding='latin-1') as fh:
    text = fh.read()
text

'spamåø'

In [187]:
# Decode with the same codec the file was written with. Handle closed on exit.
with open('utfdata', 'r', encoding='utf-8') as fh:
    text = fh.read()
text

'spamåø'

In [188]:
# Read the raw bytes (closing the handle on exit), then decode them by hand.
with open('latindata', 'rb') as fh:
    X = fh.read()
X.decode('latin-1')

'spamåø'

## 去除 BOM

In [189]:
# NOTE(review): no cell visible in this notebook creates bom_ascii.txt; it is
# assumed to already exist beside the notebook.
# Raw bytes of the ASCII file (no BOM present). Handle closed on exit.
with open('bom_ascii.txt', 'rb') as fh:
    raw = fh.read()
raw

b'spam\nspam'

In [191]:
# Text-mode read using the platform default encoding. Handle closed on exit.
with open('bom_ascii.txt', 'r') as fh:
    text = fh.read()
text

'spam\nspam'

In [207]:
# Same read with the encoding spelled out. Handle closed on exit.
with open('bom_ascii.txt', 'r', encoding='ascii') as fh:
    text = fh.read()
text

'spam\nspam'

In [200]:
# NOTE(review): no cell visible in this notebook creates bom_utf8.txt; it is
# assumed to already exist beside the notebook.
# Raw bytes: the file starts with the 3-byte UTF-8 BOM. Handle closed on exit.
with open('bom_utf8.txt', 'rb') as fh:
    raw = fh.read()
raw

b'\xef\xbb\xbfspam\nspam'

In [201]:
# With the default encoding (utf-8 on this machine) the BOM is not stripped;
# it comes back as a leading '\ufeff' character. Handle closed on exit.
with open('bom_utf8.txt', 'r') as fh:
    text = fh.read()
text

'\ufeffspam\nspam'

In [202]:
# 'utf-8-sig' skips the leading BOM while decoding. Handle closed on exit.
with open('bom_utf8.txt', 'r', encoding='utf-8-sig') as fh:
    text = fh.read()
text

'spam\nspam'

In [203]:
# NOTE(review): no cell visible in this notebook creates bom_utf16.txt; it is
# assumed to already exist beside the notebook.
# Raw bytes: a 2-byte BOM (\xfe\xff) followed by 2 bytes per character.
with open('bom_utf16.txt', 'rb') as fh:
    raw = fh.read()
raw

b'\xfe\xff\x00s\x00p\x00a\x00m\x00\n\x00s\x00p\x00a\x00m'

In [204]:
# Decoding UTF-16 data with the default codec fails on the very first byte.
# The error is the lesson, so catch it and print the message instead of
# halting "Restart & Run All"; the with-block still closes the handle.
try:
    with open('bom_utf16.txt', 'r') as fh:
        fh.read()
except UnicodeDecodeError as exc:
    print(f'{type(exc).__name__}: {exc}')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfe in position 0: invalid start byte

In [205]:
# The generic 'utf-16' codec consumes the BOM, so it does not appear in the
# decoded text. Handle closed on exit.
with open('bom_utf16.txt', 'r', encoding='utf-16') as fh:
    text = fh.read()
text

'spam\nspam'

In [206]:
# With the byte order spelled out ('utf-16-be') the BOM is treated as data and
# comes back as a leading '\ufeff' character. Handle closed on exit.
with open('bom_utf16.txt', 'r', encoding='utf-16-be') as fh:
    text = fh.read()
text

'\ufeffspam\nspam'

`utf-8` 写入时不会添加 BOM，读取时也不会去除 BOM（BOM 会作为 `'\ufeff'` 字符保留在结果中）；而 `utf-8-sig` 写入时会添加 BOM，读取时会自动跳过 BOM。

In [230]:
# Plain 'utf-8' writes no BOM. The with-block closes (and flushes) the file
# before the following cells read it back.
with open('temp.txt', 'w', encoding='utf-8') as fh:
    n_written = fh.write('spam\nSpam\n')
n_written

10

In [231]:
# Raw bytes confirm that no BOM was written. Handle closed on exit.
with open('temp.txt', 'rb') as fh:
    raw = fh.read()
raw

b'spam\nSpam\n'

In [232]:
# Reading the BOM-less file as 'utf-8'. Handle closed on exit.
with open('temp.txt', 'r', encoding='utf-8') as fh:
    text = fh.read()
text

'spam\nSpam\n'

In [233]:
# 'utf-8-sig' also reads a file that has no BOM. Handle closed on exit.
with open('temp.txt', 'r', encoding='utf-8-sig') as fh:
    text = fh.read()
text

'spam\nSpam\n'

In [234]:
# 'utf-8-sig' prepends a BOM when writing. The returned count is still the
# number of characters (10). The with-block closes (and flushes) the file
# before the following cells read it back.
with open('temp.txt', 'w', encoding='utf-8-sig') as fh:
    n_written = fh.write('spam\nSpam\n')
n_written

10

In [235]:
# Raw bytes now start with the 3-byte UTF-8 BOM. Handle closed on exit.
with open('temp.txt', 'rb') as fh:
    raw = fh.read()
raw

b'\xef\xbb\xbfspam\nSpam\n'

In [236]:
# Plain 'utf-8' does not strip the BOM; it comes back as a leading '\ufeff'.
# Handle closed on exit.
with open('temp.txt', 'r', encoding='utf-8') as fh:
    text = fh.read()
text

'\ufeffspam\nSpam\n'

In [237]:
# 'utf-8-sig' skips the BOM while decoding. Handle closed on exit.
with open('temp.txt', 'r', encoding='utf-8-sig') as fh:
    text = fh.read()
text

'spam\nSpam\n'

## Unicode 文件名和流

In [238]:
import sys

sys.getdefaultencoding(), sys.getfilesystemencoding()

('utf-8', 'utf-8')

# struct二进制模块

In [239]:
import struct

In [242]:
# The format describes three fields (i, 4s, h) but only one value is given.
# The error is the lesson, so catch it and print the message instead of
# halting "Restart & Run All".
try:
    struct.pack('>i4s h', 123)
except struct.error as exc:
    print(f'{type(exc).__name__}: {exc}')

error: pack expected 3 items for packing (got 1)

In [243]:
sys.byteorder

'little'

In [303]:
# Pack a 4-byte string, two shorts and a 5-byte string using native byte
# order and alignment ('@').
fields = (b'hel', 123, 79, b'No!')
A = struct.pack('@4s2h5s', *fields)

In [304]:
# Unpacking returns the string fields NUL-padded to their declared widths
# (4 and 5 bytes).
struct.unpack('@4s2h5s', A)

(b'hel\x00', 123, 79, b'No!\x00\x00')

In [309]:
# Big-endian layout ('>'): an int, a 4-byte string, a short and a zero-width
# string field.
fields = (123, b'No', 0, b'!')
B = struct.pack('>i4sh0s', *fields)

In [310]:
# The '0s' field unpacks to an empty bytes object; the b'!' passed to pack()
# does not appear in the result.
struct.unpack('>i4sh0s', B)

(123, b'No\x00\x00', 0, b'')