## 3.1 instance

In [3]:
from collections import abc

my_dict = {}
isinstance(my_dict, abc.Mapping)

True

In [4]:
# various ways of creating a new dictionary

a = dict(one = 1, two = 2, three = 3)
b = {'one':1, 'two':2, 'three':3}
c = dict(zip(['one', 'two', 'three'], [1, 2, 3]))
d = dict([('two',2), ('one', 1), ('three', 3)])
e = dict({'three':3, 'two':2, 'one':1})
a == b == c == d == e

True

## 3.2 Diccomp

In [6]:
DIAL_CODES = [
    (86, 'China'),
    (91, 'India'),
    (1, 'United States'),
    (62, 'Indonesia'),
    (55, 'Brazil'),
    (92, 'Pakistan'),
    (880, 'Bangladesh'),
    (234, 'Nigeria'),
    (7, 'Russia'),
    (81, 'Japan'),
]

country_code = {country:code for code, country in DIAL_CODES}
print(country_code)
print()
{code: country.upper() for country, code in country_code.items() if code <65}

{'China': 86, 'India': 91, 'United States': 1, 'Indonesia': 62, 'Brazil': 55, 'Pakistan': 92, 'Bangladesh': 880, 'Nigeria': 234, 'Russia': 7, 'Japan': 81}



{1: 'UNITED STATES', 62: 'INDONESIA', 55: 'BRAZIL', 7: 'RUSSIA'}

In [7]:
# create a word to the frequency of its value

import sys
import re
WORD_RE = re.compile(r'\w+')
index = {}
with open(sys.argv[1], encoding='utf-8') as fp:
    for line_no, line in enumerate(fp, 1):
        for match in WORD_RE.finditer(line):
            word = match.group()
            column_no = match.start() + 1
            location = (line_no, column_no)
            # this way isn't good
            occurrences = index.get(word, [])
            occurrences.append(location)
            index[word] = occurrences
            # print the result based on the sequence of alpha
for word in sorted(index, key = str.upper):
    print(word, index[word])

FileNotFoundError: [Errno 2] No such file or directory: '-f'

## 3.7 不可变映射类型

In [1]:
from types import MappingProxyType

d = {1:'A'}
d_proxy = MappingProxyType(d)
d_proxy

mappingproxy({1: 'A'})

In [2]:
d_proxy[1]

'A'

In [3]:
d_proxy[2]

KeyError: 2

In [4]:
d[2] = 'B'

In [5]:
d_proxy

mappingproxy({1: 'A', 2: 'B'})

In [6]:
d[2]

'B'

## 3.8.2 集合推导

In [8]:
from unicodedata import name

{chr(i) for i in range(32, 256) if 'SIGN' in name(chr(i), '')}

{'#',
 '$',
 '%',
 '+',
 '<',
 '=',
 '>',
 '¢',
 '£',
 '¤',
 '¥',
 '§',
 '©',
 '¬',
 '®',
 '°',
 '±',
 'µ',
 '¶',
 '×',
 '÷'}

## 4.1 字节概要

In [11]:
# 4.1 instance, encoding and decoding
s = 'café'
print(len(s))
b = s.encode('utf8')
print(b)
print(len(b))
b.decode('utf8')

4
b'caf\xc3\xa9'
5


'café'

In [18]:
# 4.2 bytes and bytearray object including 5 chars

cafe = bytes('café', encoding='utf_8')
print(cafe)
print(cafe[0])
print(cafe[1])
print(cafe[2])
print(cafe[:1])

cafe_arr = bytearray(cafe)
print(cafe_arr)
cafe_arr[-1:]

b'caf\xc3\xa9'
99
97
102
b'c'
bytearray(b'caf\xc3\xa9')


bytearray(b'\xa9')

In [19]:
# 4-3 use the original data in the array initial bytes object
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])
octets = bytes(numbers)
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

## 4.3 基本的编解码器

In [21]:
# 4.5 using 3 different en/decoder encode string "El Niño", the output char sequences
# are also different
for codec in ['latin_1', 'utf-8', 'utf-16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')

latin_1	b'El Ni\xf1o'
utf-8	b'El Ni\xc3\xb1o'
utf-16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [22]:
# 4.6 encoding to char sequence, success and failed
city = "São Paulo"
city.encode('utf_8')

b'S\xc3\xa3o Paulo'

In [23]:
city.encode('utf_16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [24]:
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [25]:
city.encode('cp437')

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [28]:
city.encode('cp437', errors='ignore')

b'So Paulo'

In [29]:
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [31]:
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

In [34]:
# 4.7 decoding chars, success and failed
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [35]:
octets.decode('iso8859_7')

'Montrιal'

In [36]:
octets.decode('koi8_r')

'MontrИal'

In [37]:
octets.decode('utf_8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [38]:
octets.decode('utf_16')

'潍瑮\ue972污'

In [39]:
octets.decode('utf_8', errors="replace")

'Montr�al'

In [40]:
octets.decode('utf_16', errors="replace")

'潍瑮\ue972污'

In [41]:
# 4-8 ola.py, "hello world" in Portugal
# coding: cp1252
print('olá, Mundo!')

olá, Mundo!


In [42]:
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\u0301'
print(len(s1), len(s2))

4 5


In [43]:
len(normalize('NFC', s1)), len(normalize('NFC', s2))

(4, 4)

In [44]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)

In [45]:
normalize('NFC', s1) == normalize('NFC', s2)

True

In [46]:
normalize('NFD', s1) == normalize('NFD', s2)

True

In [47]:
n1 = normalize('NFC', s1)
n2 = normalize('NFC', s2)
print(n1, n2)

café café


In [48]:
n1 = normalize('NFD', s1)
n2 = normalize('NFD', s2)
print(n1, n2)

café café


In [51]:
from unicodedata import normalize, name
ohm = '\u2126'
print(name(ohm))
ohm_c = normalize('NFC', ohm)
print(name(ohm_c))
print(ohm == ohm_c)
print(normalize('NFC', ohm) == normalize('NFC', ohm_c))

OHM SIGN
GREEK CAPITAL LETTER OMEGA
False
True


In [53]:
n1 = normalize('NFC', ohm)
n2 = normalize('NFC', ohm_c)
print(n1, n2)
print(name(n1), name(n2))

Ω Ω
GREEK CAPITAL LETTER OMEGA GREEK CAPITAL LETTER OMEGA


In [58]:
import unicodedata
import string

def shave_marks(txt):
    norm_txt = unicodedata.normalize('NFD', txt)
    shaved = ''.join(c for c in norm_txt
                     if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)

In [68]:
import unicodedata

# txt = "São Paulo"
txt = "café"
norm_txt = unicodedata.normalize('NFD', txt)
print(norm_txt)
for c in norm_txt:
    print(c)

café
c
a
f
e
́


In [69]:
for c in norm_txt:
    p = unicodedata.combining(c)
    print(p)

0
0
0
0
230
