In [None]:
# BEGIN FLAGS_PY
import os
import time
import sys

import requests  # <1>

POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
            'MX PH VN ET EG DE IR TR CD FR').split()  # <2>

BASE_URL = 'http://flupy.org/data/flags'  # <3>

DEST_DIR = 'downloads/'  # <4>


def save_flag(img, filename):  # <5>
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)


def get_flag(cc):  # <6>
    url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc=cc.lower())
    resp = requests.get(url)
    return resp.content


def show(text):  # <7>
    print(text, end=' ')
    sys.stdout.flush()


def download_many(cc_list):  # <8>
    for cc in sorted(cc_list):  # <9>
        image = get_flag(cc)
        show(cc)
        save_flag(image, cc.lower() + '.gif')

    return len(cc_list)


def main(download_many):  # <10>
    t0 = time.time()
    count = download_many(POP20_CC)
    elapsed = time.time() - t0
    msg = '\n{} flags downloaded in {:.2f}s'
    print(msg.format(count, elapsed))


if __name__ == '__main__':
    main(download_many)  # <11>
# END FLAGS_PY


In [None]:
'''
    使用单线程下载
    
'''

import os
import requests

POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
            'MX PH VN ET EG DE IR TR CD FR').split()  # <2>

BASE_URL = 'http://flupy.org/data/flags'  # <3>

DEST_DIR = 'downloads/'  # <4>

def save_flag(img, filename):
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)
    

def get_flag(cc):
    url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc = cc.lower())
    resp = requests.get(url)
    return resp.content



def download_flags(cc_list):
    for cc in cc_list:
        img = get_flag(cc)
        save_flag(img, cc.lower() + '.gif')
    return len(cc_list)

def main():
    import time
    t0 = time.time()
    download_flags(POP20_CC)
    t1 = time.time() - t0
    print('downloud time is {:.2f}s'.format(t1))


if __name__=='__main__':
    main()


In [None]:
'''
    使用多线程
'''

import os
import requests

from concurrent import futures

POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
            'MX PH VN ET EG DE IR TR CD FR').split()  # <2>

BASE_URL = 'http://flupy.org/data/flags'  # <3>

DEST_DIR = 'downloads/'  # <4>

def save_flag(img, filename):
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)
    

def get_flag(cc):
    url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc = cc.lower())
    resp = requests.get(url)
    return resp.content


def download_flags(cc_list):
    for cc in cc_list:
        img = get_flag(cc)
        save_flag(img, cc.lower() + '.gif')
    return len(cc_list)


def download_one(cc):
    img = get_flag(cc)
    save_flag(img, cc.lower() + '.gif')


def download_many_conc(cc_list):
    MAX_WORKERS = 20
    workers = min(MAX_WORKERS, len(cc_list))
    with futures.ThreadPoolExecutor(workers) as executor:
        res = executor.map(download_one, sorted(cc_list))
    return len(list(res))


def main(download_flags_func):
    import time
    t0 = time.time()
    download_flags_func(POP20_CC)
    t1 = time.time() - t0
    print('downloud time is {:.2f}s'.format(t1))


if __name__=='__main__':
    main(download_many_conc)

In [None]:
'''
为了从实用的角度理解future，我们可以使用concurrent.futures.as_completed函数重写示例17-3。
这个函数的参数是一个future列表，返回值是一个迭代器，在future运行结束后产出future。
'''

import os
import requests
import time
from concurrent import futures

POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
            'MX PH VN ET EG DE IR TR CD FR').split()  # <2>

BASE_URL = 'http://flupy.org/data/flags'  # <3>

DEST_DIR = 'downloads/'  # <4>

def save_flag(img, filename):
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)
    

def get_flag(cc):
    url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc = cc.lower())
    resp = requests.get(url)
    return resp.content


def download_flags(cc_list):
    for cc in cc_list:
        img = get_flag(cc)
        save_flag(img, cc.lower() + '.gif')
    return len(cc_list)


def download_one(cc):
    img = get_flag(cc)
    save_flag(img, cc.lower() + '.gif')
    return cc


def download_many_conc(cc_list):
    MAX_WORKERS = 20
    workers = min(MAX_WORKERS, len(cc_list))
    with futures.ThreadPoolExecutor(workers) as executor:
        res = executor.map(download_one, sorted(cc_list))
    return len(list(res))


def download_many_conc2(cc_list):
    cc_list = cc_list[:5]
    with futures.ThreadPoolExecutor(max_workers=3) as executor:
        to_do = []
        for cc in sorted(cc_list):
            future = executor.submit(download_one, cc)
            to_do.append(future)
            msg = 'Scheduled for {}: {}'
            print(msg.format(cc, future))
        results = []
        # time.sleep(10.0)
        for future in futures.as_completed(to_do):
            res = future.result()
            msg = '{} result: {!r}'
            print( msg.format(future, res))
            results.append(res)
    return len(results)


def main(download_flags_func):

    t0 = time.time()
    download_flags_func(POP20_CC)
    t1 = time.time() - t0
    print('downloud time is {:.2f}s'.format(t1))


if __name__=='__main__':
    main(download_many_conc2)

In [None]:
from concurrent import futures

MAX_WORKERS = 20

def downloud_one(cc):
    image = get_flag(cc)
    show(cc)
    save_flag(image, cc.lower() + ".gif")
    return cc

# ++++++++++++++++++++++++++++++++++++++++
def download_many_cc(cc_list):
    workers = min(MAX_WORKERS, len(cc_list))
    with futures.ThreadPoolExecutor(workers) as executor :
        res = executor.map(downloud_one, sorted(cc_list))
    
    return len(list(res))
# ----------------------------------------

def main(download_many):  # <10>
    t0 = time.time()
    count = download_many(POP20_CC)
    elapsed = time.time() - t0
    msg = '\n{} flags downloaded in {:.2f}s'
    print(msg.format(count, elapsed))


if __name__ == '__main__':
    main(download_many_cc)  # <11>

做个简单的比喻：进程（process）=火车，线程（thread）=车厢

- 线程在进程下行进（单纯的车厢无法运行）
- 一个进程可以包含多个线程（一辆火车可以有多个车厢）
- 不同进程间数据很难共享（一辆火车上的乘客很难换到另外一辆火车，比如站点换乘）
- 同一进程下不同线程间数据很易共享（A车厢换到B车厢很容易）
- 进程要比线程消耗更多的计算机资源（采用多列火车相比多个车厢更耗资源）
- 进程间不会相互影响，一个线程挂掉将导致整个进程挂掉（一列火车不会影响到另外一列火车，但是如果一列火车上中间的一节车厢着火了，将影响到所有车厢）
- 进程可以拓展到多机，进程最多适合多核（不同火车可以开在多个轨道上，同一火车的车厢不能在行进的不同的轨道上）
- 进程使用的内存地址可以上锁，即一个线程使用某些共享内存时，其他线程必须等它结束，才能使用这一块内存。（比如火车上的洗手间）－"互斥锁"
- 进程使用的内存地址可以限定使用量（比如火车上的餐厅，最多只允许多少人进入，如果满了需要在门口等，等有人出来了才能进去）－“信号量”


for循环中的enumerate函数会隐式调用next(results)，这个函数又会在（内部）表示第一个任务（loiter(0)）的_f future上调用_f.result（　）方法。result方法会阻塞，直到future运行结束，因此这个循环每次迭代时都要等待下一个结果做好准备。

严格来说，我们目前测试的并发脚本都不能并行下载。使用concurrent.futures库实现的那两个示例受GIL（GlobalInterpreter Lock，全局解释器锁）的限制，而fags_asyncio.py脚本在单个线程中运行。读到这里，你可能会对前面做的非正规基准测试有下述疑问。

- 既然Python线程受GIL的限制，任何时候都只允许运行一个线程，那么fags_threadpool.py脚本的下载速度怎么会比fags.py脚本快5倍？
- fags_asyncio.py脚本和fags.py脚本都在单个线程中运行，前者怎么会比后者快5倍？

第一个问题：

CPython解释器本身就不是线程安全的，因此有全局解释器锁（GIL），一次只允许使用一个线程执行Python字节码。因此，一个Python进程通常不能同时使用多个CPU核心。然而，标准库中所有执行阻塞型I/O操作的函数，在等待操作系统返回结果时都会释放GIL。这意味着在Python语言这个层次上可以使用多线程，而I/O密集型Python程序能从中受益：一个Python线程等待网络响应时，阻塞型I/O函数会释放GIL，再运行一个线程。

这个模块实现的是真正的并行计算，因为它使用ProcessPoolExecutor类把工作分配给多个Python进程处理。

ProcessPoolExecutor和ThreadPoolExecutor类都实现了通用的Executor接口，因此使用concurrent.futures模块能特别轻松地把基于线程的方案转成基于进程的方案。

ThreadPoolExecutor.__init__方法需要max_workers参数，指定线程池中线程的数量。在ProcessPoolExecutor类中，那个参数是可选的，而且大多数情况下不使用——默认值是os.cpu_count（　）函数返回的CPU数量。这样处理说得通，因为对CPU密集型的处理来说，不可能要求使用超过CPU数量的职程。而对I/O密集型处理来说，可以在一个ThreadPoolExecutor实例中使用10个、100个或1000个线程；最佳线程数取决于做的是什么事，以及可用内存有多少，因此要仔细测试才能找到最佳的线程数。

# ProcessPoolExecutor vs ThreadPoolExecutor

In [5]:
"""RC4 compatible algorithm"""

def arcfour(key, in_bytes, loops=20):

    kbox = bytearray(256)  # create key box
    for i, car in enumerate(key):  # copy key and vector
        kbox[i] = car
    j = len(key)
    for i in range(j, 256):  # repeat until full
        kbox[i] = kbox[i-j]

    # [1] initialize sbox
    sbox = bytearray(range(256))
    # repeat sbox mixing loop, as recommened in CipherSaber-2
    # http://ciphersaber.gurus.com/faq.html#cs2
    j = 0
    for k in range(loops):
        for i in range(256):
            j = (j + sbox[i] + kbox[i]) % 256
            sbox[i], sbox[j] = sbox[j], sbox[i]
    # main loop
    i = 0
    j = 0
    out_bytes = bytearray()

    for car in in_bytes:
        i = (i + 1) % 256
        # [2] shuffle sbox
        j = (j + sbox[i]) % 256
        sbox[i], sbox[j] = sbox[j], sbox[i]
        # [3] compute t
        t = (sbox[i] + sbox[j]) % 256
        k = sbox[t]
        car = car ^ k
        out_bytes.append(car)

    return out_bytes


def test():
    from time import time
    clear = bytearray(b'1234567890' * 100000)
    t0 = time()
    cipher = arcfour(b'lzk', clear)
    print('elapsed time: %.2fs' % (time() - t0))
    result = arcfour(b'lzk', cipher)
    assert result == clear, '%r != %r' % (result, clear)
    print('elapsed time: %.2fs' % (time() - t0))
    print('OK')


if __name__ == '__main__':
    test()


elapsed time: 0.46s
elapsed time: 0.90s
OK


In [4]:
key = b'key'
kbox = bytearray(256)  # create key box
for i, car in enumerate(key):  # copy key and vector
    print(i)
    print(car)
    kbox[i] = car

print(key[1])


0
107
1
101
2
121
101


In [3]:
'''
    arcfour futures
'''
import sys
import time
from concurrent import futures
from random import randrange
from arcfour import *

JOBS = 12
SIZE = 2**18

KEY = b"'Twas brillig, and the slithy toves\nDid gyre"
STATUS = '{} workers, elapsed time: {:.2f}s'



def main(workers=None):
    if workers:
        workers = int(workers)
    t0 = time.time()

    with futures.ProcessPoolExecutor(workers) as executor:
        actual_workers = executor._max_workers
        to_do = []
        for i in range(JOBS, 0, -1):
            size = SIZE + int(SIZE / JOBS * (i - JOBS/2))
            job = executor.submit(arcfour_test, size, KEY)
            to_do.append(job)

        for future in futures.as_completed(to_do):
            res = future.result()
            print('{:.1f} KB'.format(res/2**10))

    print(STATUS.format(actual_workers, time.time() - t0))


if __name__ == '__main__':
    if len(sys.argv) == 2:
        workers = int(sys.argv[1])
    else:
        workers = None
    main()



NameError: name 'randrange' is not defined

In [2]:
'''
    sha
'''
import sys
import time
import hashlib
from concurrent import futures
from random import randrange

JOBS = 12
SIZE = 2**20
STATUS = '{} workers, elapsed time: {:.2f}s'


def sha(size):
    data = bytearray(randrange(256) for i in range(size))
    algo = hashlib.new('sha256')
    algo.update(data)
    return algo.hexdigest()


def main(workers=None):
    if workers:
        workers = int(workers)
    t0 = time.time()

    with futures.ProcessPoolExecutor(workers) as executor:
        actual_workers = executor._max_workers
        to_do = (executor.submit(sha, SIZE) for i in range(JOBS))
        for future in futures.as_completed(to_do):
            res = future.result()
            print(res)

    print(STATUS.format(actual_workers, time.time() - t0))

if __name__ == '__main__':
    if len(sys.argv) == 2:
        workers = int(sys.argv[1])
    else:
        workers = None
    main(workers)

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

关于这个问题，从网上查了下，找到的解决方案似乎不起作用：

[Multiprocessing in Python on Windows and Jupyter/Ipython — Making it work](https://medium.com/@grvsinghal/speed-up-your-python-code-using-multiprocessing-on-windows-and-jupyter-or-ipython-2714b49d6fac)

#  实验Executor.map方法

In [1]:
from time import sleep, strftime
from concurrent import futures

def display(*args):
    print(strftime('[%H:%M:%S]'), end = ' ')
    print(*args)

def loiter(n):
    msg = '{}loiter({}) : doing nothing for {}s'
    display(msg.format('\t'*n, n, n))
    sleep(n)
    msg = '{}loiter({}) : done'
    display(msg.format('\t'*n, n))
    return n * 10

def main():
    display('Script Starting.')
    executor = futures.ThreadPoolExecutor(max_workers=3)
    # 这一行表明，executor.map方法返回的结果（results）是生成器；不管有多少任务，也不管max_workers的值是多少，目前不会阻塞。
    results = executor.map(loiter, range(5)) 
    display('results:', results)
    display('Waiting for individual results:')
    for i, result in enumerate(results):
        display('result {} : {}'.format(i, result))

main()


[13:56:44] Script Starting.
[13:56:44] loiter(0) : doing nothing for 0s
[13:56:44] 	loiter(1) : doing nothing for 1s
[13:56:44] loiter(0) : done
[13:56:44] 		loiter(2) : doing nothing for 2s
[13:56:44] results: <generator object Executor.map.<locals>.result_iterator at 0x7fa6b8c6b7b0>
[13:56:44] Waiting for individual results:
[13:56:44] result 0 : 0
[13:56:44] 			loiter(3) : doing nothing for 3s
[13:56:45] 	loiter(1) : done
[13:56:45] 				loiter(4) : doing nothing for 4s
[13:56:45] result 1 : 10
[13:56:46] 		loiter(2) : done
[13:56:46] result 2 : 20
[13:56:47] 			loiter(3) : done
[13:56:47] result 3 : 30
[13:56:49] 				loiter(4) : done
[13:56:49] result 4 : 40


Executor.map函数易于使用，不过有个特性可能有用，也可能没用，具体情况取决于需求：这个函数返回结果的顺序与调用开始的顺序一致。如果第一个调用生成结果用时10秒，而其他调用只用1秒，代码会阻塞10秒，获取map方法返回的生成器产出的第一个结果。在此之后，获取后续结果时不会阻塞，因为后续的调用已经结束。如果必须等到获取所有结果后再处理，这种行为没问题；不过，通常更可取的方式是，不管提交的顺序，只要有结果就获取。

executor.submit和futures.as_completed这个组合比executor.map更灵活，因为submit方法能处理不同的可调用对象和参数，而executor.map只能处理参数不同的同一个可调用对象。

executor.submit和futures.as_completed这个组合比executor.map更灵活，因为submit方法能处理不同的可调用对象和参数，而executor.map只能处理参数不同的同一个可调用对象。此外，传给futures.as_completed函数的future集合可以来自多个Executor实例，例如一些由ThreadPoolExecutor实例创建，另一些由ProcessPoolExecutor实例创建。

# 17.5 显示下载进度并处理错误

In [27]:
"""Utilities for second set of flag examples.
"""

import os
import time
import sys
import string
import argparse
from collections import namedtuple
from enum import Enum


Result = namedtuple('Result', 'status data')

HTTPStatus = Enum('Status', 'ok not_found error')

POP20_CC = ('AA CN IN US ID BR PK NG BD RU JP NN'
            'MX PH VN ET EG DE IR TR CD FR').split()

DEFAULT_CONCUR_REQ = 1
MAX_CONCUR_REQ = 1

SERVERS = {
    'REMOTE': 'http://flupy.org/data/flags',
    'LOCAL':  'http://localhost:8001/flags',
    'DELAY':  'http://localhost:8002/flags',
    'ERROR':  'http://localhost:8003/flags',
}
DEFAULT_SERVER = 'LOCAL'

DEST_DIR = 'downloads/'
COUNTRY_CODES_FILE = 'country_codes.txt'


def save_flag(img, filename):
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)


def initial_report(cc_list, actual_req, server_label):
    if len(cc_list) <= 10:
        cc_msg = ', '.join(cc_list)
    else:
        cc_msg = 'from {} to {}'.format(cc_list[0], cc_list[-1])
    print('{} site: {}'.format(server_label, SERVERS[server_label]))
    msg = 'Searching for {} flag{}: {}'
    plural = 's' if len(cc_list) != 1 else ''
    print(msg.format(len(cc_list), plural, cc_msg))
    plural = 's' if actual_req != 1 else ''
    msg = '{} concurrent connection{} will be used.'
    print(msg.format(actual_req, plural))


def final_report(cc_list, counter, start_time):
    elapsed = time.time() - start_time
    print('-' * 20)
    msg = '{} flag{} downloaded.'
    plural = 's' if counter[HTTPStatus.ok] != 1 else ''
    print(msg.format(counter[HTTPStatus.ok], plural))
    if counter[HTTPStatus.not_found]:
        print(counter[HTTPStatus.not_found], 'not found.')
    if counter[HTTPStatus.error]:
        plural = 's' if counter[HTTPStatus.error] != 1 else ''
        print('{} error{}.'.format(counter[HTTPStatus.error], plural))
    print('Elapsed time: {:.2f}s'.format(elapsed))


def expand_cc_args(every_cc, all_cc, cc_args, limit):
    codes = set()
    A_Z = string.ascii_uppercase
    if every_cc:
        codes.update(a+b for a in A_Z for b in A_Z)
    elif all_cc:
        with open(COUNTRY_CODES_FILE) as fp:
            text = fp.read()
        codes.update(text.split())
    else:
        for cc in (c.upper() for c in cc_args):
            if len(cc) == 1 and cc in A_Z:
                codes.update(cc+c for c in A_Z)
            elif len(cc) == 2 and all(c in A_Z for c in cc):
                codes.add(cc)
            else:
                msg = 'each CC argument must be A to Z or AA to ZZ.'
                raise ValueError('*** Usage error: '+msg)
    return sorted(codes)[:limit]


def process_args(default_concur_req):
    server_options = ', '.join(sorted(SERVERS))
    parser = argparse.ArgumentParser(
                description='Download flags for country codes. '
                'Default: top 20 countries by population.')
    parser.add_argument('cc', metavar='CC', nargs='*',
                help='country code or 1st letter (eg. B for BA...BZ)')
    parser.add_argument('-a', '--all', action='store_true',
                help='get all available flags (AD to ZW)')
    parser.add_argument('-e', '--every', action='store_true',
                help='get flags for every possible code (AA...ZZ)')
    parser.add_argument('-l', '--limit', metavar='N', type=int,
                help='limit to N first codes', default=sys.maxsize)
    parser.add_argument('-m', '--max_req', metavar='CONCURRENT', type=int,
                default=default_concur_req,
                help='maximum concurrent requests (default={})'
                      .format(default_concur_req))
    parser.add_argument('-s', '--server', metavar='LABEL',
                default=DEFAULT_SERVER,
                help='Server to hit; one of {} (default={})'
                      .format(server_options, DEFAULT_SERVER))
    parser.add_argument('-v', '--verbose', action='store_true',
                help='output detailed progress info')
    args = parser.parse_args()
    if args.max_req < 1:
        print('*** Usage error: --max_req CONCURRENT must be >= 1')
        parser.print_usage()
        sys.exit(1)
    if args.limit < 1:
        print('*** Usage error: --limit N must be >= 1')
        parser.print_usage()
        sys.exit(1)
    args.server = args.server.upper()
    if args.server not in SERVERS:
        print('*** Usage error: --server LABEL must be one of',
              server_options)
        parser.print_usage()
        sys.exit(1)
    try:
        cc_list = expand_cc_args(args.every, args.all, args.cc, args.limit)
    except ValueError as exc:
        print(exc.args[0])
        parser.print_usage()
        sys.exit(1)

    if not cc_list:
        cc_list = sorted(POP20_CC)
    return args, cc_list


def main(download_many, default_concur_req, max_concur_req):
    # args = 
    cc_list = sorted(POP20_CC)
    actual_req = min(5, max_concur_req, len(cc_list))
    initial_report(cc_list, actual_req, 'REMOTE')
    base_url = SERVERS['REMOTE']
    t0 = time.time()
    counter = download_many(cc_list, base_url, False, actual_req)
    assert sum(counter.values()) == len(cc_list), \
        'some downloads are unaccounted for'
    final_report(cc_list, counter, t0)


## 实现依序下载

In [28]:
"""Download flags of countries (with error handling).

Sequential version

Sample run::

    $ python3 flags2_sequential.py -s DELAY b
    DELAY site: http://localhost:8002/flags
    Searching for 26 flags: from BA to BZ
    1 concurrent connection will be used.
    --------------------
    17 flags downloaded.
    9 not found.
    Elapsed time: 13.36s

"""

import collections

import requests
import tqdm



DEFAULT_CONCUR_REQ = 1
MAX_CONCUR_REQ = 1

# BEGIN FLAGS2_BASIC_HTTP_FUNCTIONS
def get_flag(base_url, cc):
    url = '{}/{cc}/{cc}.gif'.format(base_url, cc=cc.lower())
    resp = requests.get(url)
    if resp.status_code != 200:  # get_flag函数没有处理错误，当HTTP代码不是200(success)时，使用requests.Response.raise_for_status方法抛出异常。
        resp.raise_for_status()
    return resp.content


def download_one(cc, base_url, verbose=False):
    try:
        image = get_flag(base_url, cc)
    except requests.exceptions.HTTPError as exc:  # download_one函数捕获requests.exceptions.HTTPError异常，特别处理HTTP 404错误……
        res = exc.response
        if res.status_code == 404:
            status = HTTPStatus.not_found  #  ……方法是，把局部变量status设为HTTPStatus.not_found；HTTPStatus是从fags2_common模块（见示例A-10）中导入的Enum对象
            msg = 'not found'
        else:  # 重新抛出其他HTTPError异常；这些异常会向上冒泡，传给调用方
            raise
    else:
        save_flag(image, cc.lower() + '.gif')
        status = HTTPStatus.ok
        msg = 'OK'

    if verbose:  # 如果在命令行中设定了-v/--verbose选项，显示国家代码和状态消息；这就是详细模式中看到的进度信息。
        print(cc, msg)

    return Result(status, cc)  # download_one函数的返回值是一个namedtuple——Result，其中有个status字段，其值是HTTPStatus.not_found或HTTPStatus.ok。
# END FLAGS2_BASIC_HTTP_FUNCTIONS

# BEGIN FLAGS2_DOWNLOAD_MANY_SEQUENTIAL 实现依序下载的download_many函数
def download_many(cc_list, base_url, verbose, max_req):
    counter = collections.Counter()  # 这个Counter实例用于统计不同的下载状态：HTTPStatus.ok、HTTPStatus.not_found或HTTPStatus.error。
    cc_iter = sorted(cc_list)  # 按字母顺序传入的国家代码列表，保存在cc_iter变量中。
    if not verbose:
        cc_iter = tqdm.tqdm(cc_iter)  # 如果不是详细模式，把cc_iter传给tqdm函数，返回一个迭代器，产出cc_iter中的元素，还会显示进度条动画。

    for cc in cc_iter:  # 这个for循环迭代cc_iter……
        try:
            res = download_one(cc, base_url, verbose)  # ……不断调用download_one函数，执行下载。
        except requests.exceptions.HTTPError as exc:  # 处理get_fag函数抛出的与HTTP有关的且download_one函数没有处理的异常。
            error_msg = 'HTTP error {res.status_code} - {res.reason}'
            error_msg = error_msg.format(res=exc.response)
        except requests.exceptions.ConnectionError as exc:  # 处理其他与网络有关的异常。其他异常会中止这个脚本，因为调用download_many函数的fags2_common.main函数中没有try/except块。
            error_msg = 'Connection error'
        else:  # 如果没有异常从download_one函数中逃出，从download_one函数返回的namedtuple（HTTPStatus）中获取status。
            error_msg = ''
            status = res.status

        if error_msg:
            status = HTTPStatus.error  #  如果有错误，把局部变量status设为相应的状态。
        counter[status] += 1  #  如果有错误，把局部变量status设为相应的状态。
        if verbose and error_msg: # 如果是详细模式，而且有错误，显示带有当前国家代码的错误消息。
            print('*** Error for {}: {}'.format(cc, error_msg))

    return counter  # 返回counter，以便main函数能在最终的报告中显示数量。
# END FLAGS2_DOWNLOAD_MANY_SEQUENTIAL

if __name__ == '__main__':
    main(download_many, DEFAULT_CONCUR_REQ, MAX_CONCUR_REQ)


REMOTE site: http://flupy.org/data/flags
Searching for 21 flags: from AA to VN
1 concurrent connection will be used.


  0%|          | 0/21 [00:00<?, ?it/s]

<class 'tqdm.std.tqdm'>


100%|██████████| 21/21 [00:24<00:00,  1.15s/it]

--------------------
19 flags downloaded.
2 not found.
Elapsed time: 24.07s





##  使用futures.as_completed函数

In [30]:
"""Download flags of countries (with error handling).

ThreadPool version

Sample run::

    $ python3 flags2_threadpool.py -s ERROR -e
    ERROR site: http://localhost:8003/flags
    Searching for 676 flags: from AA to ZZ
    30 concurrent connections will be used.
    --------------------
    150 flags downloaded.
    361 not found.
    165 errors.
    Elapsed time: 7.46s

"""

# BEGIN FLAGS2_THREADPOOL
import collections
from concurrent import futures

import requests
import tqdm  # <1>

# from flags2_common import main, HTTPStatus  # <2>
# from flags2_sequential import download_one  # <3>

DEFAULT_CONCUR_REQ = 30  # 如果没有在命令行中指定-m/--max_req选项，使用这个值作为并发请求数的最大值，也就是线程池的大小；真实的数量可能会比这少，例如下载的国旗数量较少。
MAX_CONCUR_REQ = 1000  # <5>


def download_many(cc_list, base_url, verbose, concur_req):
    counter = collections.Counter()
    with futures.ThreadPoolExecutor(max_workers=concur_req) as executor:  # 把max_workers设为concur_req，创建ThreadPoolExecutor实例； main函数会把下面这三个值中最小的那个赋值给concur_req：MAX_CONCUR_REQ、cc_list的长度、-m/--max_req命令行选项的值。这样能避免创建超过所需的线程。
        to_do_map = {}  # 这个字典把各个Future实例（表示一次下载）映射到相应的国家代码上，在处理错误时使用。
        for cc in sorted(cc_list):  # 按字母顺序迭代国家代码列表。结果的顺序主要由HTTP响应的时间长短决定，不过，如果线程池的大小（由concur_req设定）比len(cc_list)小得多，可能会发现有按字母顺序批量下载的情况。
            future = executor.submit(download_one,
                            cc, base_url, verbose)  # 每次调用executor.submit方法排定一个可调用对象的执行时间，然后返回一个Future实例。第一个参数是可调用的对象，其余的参数是传给可调用对象的参数。
            to_do_map[future] = cc  # 把返回的future和国家代码存储在字典中。
        done_iter = futures.as_completed(to_do_map)  #  futures.as_completed函数返回一个迭代器，在future运行结束后产出future。
        if not verbose:
            done_iter = tqdm.tqdm(done_iter, total=len(cc_list))  # 如果不是详细模式，把as_completed函数返回的结果传给tqdm函数，显示进度条；因为done_iter没有len函数，所以我们必须通过total=参数告诉tqdm函数预期的元素数量，这样tqdm才能预计剩余的工作量。
        for future in done_iter:  # 迭代运行结束后的future。
            try:
                res = future.result()  # 在future上调用result方法，要么返回可调用对象的返回值，要么抛出可调用的对象在执行过程中捕获的异常。这个方法可能会阻塞，等待确定结果；不过，在这个示例中不会阻塞，因为as_completed函数只返回已经运行结束的future。
            except requests.exceptions.HTTPError as exc:  # 处理可能出现的异常；这个函数余下的代码与依序下载版download_many函数一样（见示例17-13），不过下一点除外。
                error_msg = 'HTTP {res.status_code} - {res.reason}'
                error_msg = error_msg.format(res=exc.response)
            except requests.exceptions.ConnectionError as exc:
                error_msg = 'Connection error'
            else:
                error_msg = ''
                status = res.status

            if error_msg:
                status = HTTPStatus.error
            counter[status] += 1
            if verbose and error_msg:
                cc = to_do_map[future]  # 为了给错误消息提供上下文，以当前的future为键，从to_do_map中获取国家代码。在依序下载版中无须这么做，因为那一版迭代的是国家代码，所以知道当前国家的代码；而这里迭代的是future。
                print('*** Error for {}: {}'.format(cc, error_msg))

    return counter


if __name__ == '__main__':
    main(download_many, DEFAULT_CONCUR_REQ, MAX_CONCUR_REQ)
# END FLAGS2_THREADPOOL


REMOTE site: http://flupy.org/data/flags
Searching for 21 flags: from AA to VN
5 concurrent connections will be used.


100%|██████████| 21/21 [00:07<00:00,  2.77it/s]

--------------------
19 flags downloaded.
2 not found.
Elapsed time: 7.62s





## 线程和多进程的替代方案

In [None]:
"""Download flags of countries (with error handling).

asyncio async/await version

"""
# BEGIN FLAGS2_ASYNCIO_TOP
import asyncio
import collections

import aiohttp
from aiohttp import web
import tqdm



# default set low to avoid errors from remote site, such as
# 503 - Service Temporarily Unavailable
DEFAULT_CONCUR_REQ = 5
MAX_CONCUR_REQ = 1000


class FetchError(Exception):  # <1>
    def __init__(self, country_code):
        self.country_code = country_code


async def get_flag(session, base_url, cc): # <2>
    url = '{}/{cc}/{cc}.gif'.format(base_url, cc=cc.lower())
    async with session.get(url) as resp:
        if resp.status == 200:
            return await resp.read()
        elif resp.status == 404:
            raise web.HTTPNotFound()
        else:
            raise aiohttp.HttpProcessingError(
                code=resp.status, message=resp.reason,
                headers=resp.headers)


async def download_one(session, cc, base_url, semaphore, verbose):  # <3>
    try:
        async with semaphore:  # <4>
            image = await get_flag(session, base_url, cc)  # <5>
    except web.HTTPNotFound:  # <6>
        status = HTTPStatus.not_found
        msg = 'not found'
    except Exception as exc:
        raise FetchError(cc) from exc  # <7>
    else:
        save_flag(image, cc.lower() + '.gif')  # <8>
        status = HTTPStatus.ok
        msg = 'OK'

    if verbose and msg:
        print(cc, msg)

    return Result(status, cc)
# END FLAGS2_ASYNCIO_TOP

# BEGIN FLAGS2_ASYNCIO_DOWNLOAD_MANY
async def downloader_coro(cc_list, base_url, verbose, concur_req):  # <1>
    counter = collections.Counter()
    semaphore = asyncio.Semaphore(concur_req)  # <2>
    async with aiohttp.ClientSession() as session:  # <8>
        to_do = [download_one(session, cc, base_url, semaphore, verbose)
                for cc in sorted(cc_list)]  # <3>

        to_do_iter = asyncio.as_completed(to_do)  # <4>
        if not verbose:
            to_do_iter = tqdm.tqdm(to_do_iter, total=len(cc_list))  # <5>
        for future in to_do_iter:  # <6>
            try:
                res = await future  # <7>
            except FetchError as exc:  # <8>
                country_code = exc.country_code  # <9>
                try:
                    error_msg = exc.__cause__.args[0]  # <10>
                except IndexError:
                    error_msg = exc.__cause__.__class__.__name__  # <11>
                if verbose and error_msg:
                    msg = '*** Error for {}: {}'
                    print(msg.format(country_code, error_msg))
                status = HTTPStatus.error
            else:
                status = res.status

            counter[status] += 1  # <12>

    return counter  # <13>


def download_many(cc_list, base_url, verbose, concur_req):
    loop = asyncio.get_event_loop()
    coro = downloader_coro(cc_list, base_url, verbose, concur_req)
    counts = loop.run_until_complete(coro)  # <14>
    loop.close()  # <15>

    return counts


if __name__ == '__main__':
    main(download_many, DEFAULT_CONCUR_REQ, MAX_CONCUR_REQ)
# END FLAGS2_ASYNCIO_DOWNLOAD_MANY


对CPU密集型工作来说，要启动多个进程，规避GIL。创建多个进程最简单的方式是，使用futures.ProcessPoolExecutor类。不过和前面一样，如果使用场景较复杂，需要更高级的工具。multiprocessing模块的API与threading模块相仿，不过作业交给多个进程处理。对简单的程序来说，可以用multiprocessing模块代替threading模块，少量改动即可。不过，multiprocessing模块还能解决协作进程遇到的最大挑战：在进程之间传递数据。