## python 创建多线程的方法
1. 准备一个函数 my_func
2. 创建一个线程
3. 启动线程
4. 等待结束

In [1]:
import threading 

def my_func(a, b):
    """Return ``a + b``; used below as the target callable of a thread."""
    total = a + b
    return total

# Create a thread that will call my_func(1, 2). The value my_func returns
# is not captured anywhere in this snippet.
t = threading.Thread(target=my_func, args=(1,2))
# Start the thread.
t.start()
# Block the calling thread until the worker has finished.
t.join()

**例子**  
爬取cnblogs数据

In [2]:
import blog_spider

def single_thread():
    """Crawl every URL in ``blog_spider.urls`` one after another on the calling thread."""
    print("single thread begin")
    for target_url in blog_spider.urls:
        blog_spider.craw(target_url)
    print("single thread end")


def multi_thread():
    """Crawl every URL in ``blog_spider.urls`` using one thread per URL.

    All threads are created first, then all are started, and finally the
    caller waits for every one of them to finish.
    """
    print("multi thread begin")
    workers = [
        threading.Thread(target=blog_spider.craw, args=(target_url,))
        for target_url in blog_spider.urls
    ]

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print("multi thread end")

In [8]:
%%capture out1
import time 

# Time the sequential crawl. perf_counter() is a monotonic, high-resolution
# clock, so the measured interval cannot be skewed by system clock changes
# (which time.time() is subject to).
bg = time.perf_counter()
single_thread()
ed = time.perf_counter()
print(f"single thread cost: {round(ed-bg,3)}s")

# Time the one-thread-per-URL crawl the same way for comparison.
bg = time.perf_counter()
multi_thread()
ed = time.perf_counter()
print(f"multi thread cost: {round(ed-bg,3)}s")

## 多组件pipeline技术架构 
生产者-消费者  
此时使用多线程数据通信的queue.Queue

In [8]:
%%capture out2

import queue
import blog_spider
import time 
import random
import threading 

def do_craw(url_queue:queue.Queue, html_queue:queue.Queue):
    """Producer loop: take URLs from ``url_queue`` and put fetched HTML on ``html_queue``.

    The loop ends when no URL becomes available within 10 seconds.
    """
    worker_name = threading.current_thread().name
    while True:
        try:
            next_url = url_queue.get(timeout=10)
        except queue.Empty:
            # Nothing arrived within the timeout: treat the queue as drained.
            return
        page = blog_spider.craw_text(next_url)
        html_queue.put(page)
        print(worker_name, "do_craw", f"url_queue.size={url_queue.qsize()}")
        # Pause 1-2 seconds between requests.
        time.sleep(random.randint(1, 2))


def do_parse(html_queue, fout):
    """Consumer loop: take HTML from ``html_queue``, parse it, and write results to ``fout``.

    Each parsed result is written as ``str(result)`` followed by a newline.
    The loop ends when no HTML becomes available within 10 seconds.
    """
    worker_name = threading.current_thread().name
    while True:
        try:
            page = html_queue.get(timeout=10)
        except queue.Empty:
            # Nothing arrived within the timeout: treat the queue as drained.
            return
        for item in blog_spider.parse(page):
            fout.write(str(item)+"\n")
        print(worker_name, "parse", f"html_queue.size={html_queue.qsize()}")
        # Pause 1-2 seconds between items.
        time.sleep(random.randint(1, 2))

# Producer/consumer pipeline driver:
#   url_queue  -> 3 crawler threads (do_craw)  -> html_queue
#   html_queue -> 2 parser threads  (do_parse) -> ./data/1.txt

# Seed the work queue with every URL to crawl.
url_queue = queue.Queue()
html_queue = queue.Queue()
for url in blog_spider.urls:
    url_queue.put(url)

threads = []

# Start the crawler (producer) threads.
for i in range(3):
    t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{i}")
    t.start()
    threads.append(t)

# Open the output file in a context manager so it is closed even if joining
# the threads is interrupted or raises; the joins stay inside the block so
# the file remains open until every parser thread has finished writing.
with open("./data/1.txt", "w") as fout:
    # Start the parser (consumer) threads, which share the same file object.
    for i in range(2):
        t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{i}")
        t.start()
        threads.append(t)

    # Wait for all crawler and parser threads to finish.
    for t in threads:
        t.join()

craw1 do_crawparse0craw0 parse do_craw url_queue.size=47
  url_queue.size=47html_queue.size=0

parse1 parse html_queue.size=0
craw2 do_craw url_queue.size=47
parse0 parse html_queue.size=0
craw1 do_craw url_queue.size=45
craw2 do_craw url_queue.size=45
parse1 parse html_queue.size=1
parse0 parse html_queue.size=0
craw0 do_craw url_queue.size=44
parse1 parse html_queue.size=0
craw1 do_craw url_queue.size=42
parse0 parse html_queue.size=0
craw2 do_craw url_queue.size=42
craw0 do_craw url_queue.size=41
parse0 parse html_queue.size=1
craw1 do_craw url_queue.size=40
parse1 parse html_queue.size=1
parse0 parse html_queue.size=0
craw2 do_craw url_queue.size=39
craw1 do_craw url_queue.size=38
craw0 do_craw url_queue.size=37
parse0 parse html_queue.size=2
craw2 do_craw url_queue.size=36
parse1 parse html_queue.size=2
craw1 do_craw url_queue.size=35
craw0parse0 parse html_queue.size=3
 do_craw url_queue.size=34
craw2 do_craw url_queue.size=33
parse1 parse html_queue.size=3
parse0 parse html_queu

## 线程安全

#### 概念
线程安全指某个函数、函数库在多线程环境中被调用时，能够正确地处理多个线程之间的共享变量，使程序功能正确完成  

由于线程的执行随时会发生切换，就造成了不可预料的结果，出现线程不安全  

#### Lock 用于解决线程安全问题
用法一：
```python
import threading

lock = threading.Lock()
lock.acquire()
try:
    do something
finally:
    lock.release()
```

用法二：
```python
lock = threading.Lock()

with lock:
    do something
```

hint: 线程安全问题一般出现在if分支内，可以将整个条件判断代码块进行加锁

In [14]:
import threading
import time 
class Account:
    """Minimal bank account holding a single mutable ``balance`` attribute."""

    def __init__(self, balance):
        """Store the opening ``balance`` on the instance."""
        self.balance = balance

# Single module-level lock shared by every call to draw().
lock = threading.Lock()

def draw(account, amount):
    """Withdraw ``amount`` from ``account`` if its balance is sufficient.

    The balance check and the deduction both run while ``lock`` is held, so
    no other draw() call can run between them. Progress is reported via
    print(), prefixed with the current thread's name.
    """
    who = threading.current_thread().name
    with lock:
        if not account.balance >= amount:
            print(who, "取钱失败，余额不足")
            return
        # Delay between the check and the update (still inside the lock).
        time.sleep(0.1)
        print(who, "取钱成功")
        account.balance -= amount
        print(who, f"余额：{account.balance}")

def test01():
    """Run two threads ("ta" and "tb") that each try to draw 800 from a 1000 account.

    Both threads are started before either is joined; the final balance is
    printed once both have finished.
    """
    account = Account(1000)
    workers = [
        threading.Thread(target=draw, args=(account, 800), name=label)
        for label in ("ta", "tb")
    ]

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print(account.balance)

test01()

ta 取钱成功
ta 余额：200
tb 取钱失败，余额不足
200


## 线程池

#### 线程池原理
线程生命周期：  
新建 --> 就绪  
　　　　　|  
　　　　　获得cpu资源    <-- sleep/io结束  
　　　　　|　　　　　　　　　　｜  
　　　　　运行 --sleep/io--> 阻塞(失去cpu资源)  
　　　　　|  
终止 <-- run方法执行完  
- 新建线程系统需要分配资源、终止线程系统需要回收资源
- 如果可以重用线程，则可以减去新建/终止的开销

#### 线程池好处
- 适用场景：适合处理突发性大量请求或需要大量线程完成任务、但实际任务处理时间较短
- 防御功能：能有效避免系统因为创建线程过多，而导致系统负荷过大、响应变慢等问题
- 代码优势：更加简洁


#### 线程池使用方式
用法一：map函数，map结果和argsLs顺序相对应
```python
from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor() as pool:
    results = pool.map(func, argsLs)

    for result in results:
        print(result)
```

用法二：future模式，更强大，如果使用as_completed模式则顺序是不定的
```python
with ThreadPoolExecutor() as pool:
    futures = [pool.submit(func, args)
               for args in argsLs]
               
    for future in futures:
        print(future.result())
    for future in as_completed(futures):
        print(future.result())
```



In [17]:
%%capture out3
from concurrent.futures import ThreadPoolExecutor, as_completed
import blog_spider

# Stage 1: fetch every page through a thread pool. pool.map yields results
# in the same order as its input, so zipping with the URLs pairs each URL
# with its own HTML.
with ThreadPoolExecutor() as pool:
    htmls = list(zip(
        blog_spider.urls,
        pool.map(blog_spider.craw_text, blog_spider.urls),
    ))
    for u, h in htmls:
        print(u, len(h))

print("craw over")

# Stage 2: submit one parse task per page, keyed by URL, then print each
# result in submission order.
with ThreadPoolExecutor() as pool:
    futures = {u: pool.submit(blog_spider.parse, h) for u, h in htmls}

    for u, future in futures.items():
        print(u, future.result())