In [32]:
import multiprocessing as mp
import time
from urllib.request import urlopen,urljoin
from bs4 import BeautifulSoup
import re
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


base_url = "https://morvanzhou.github.io/"
# Limit the crawl unless we are crawling the local development server.
# The local address must use the full "http://" scheme separator; with a
# single slash ("http:/...") the comparison could never match a real URL,
# so the restriction would stay on even for a local crawl.
restricted_crawl = base_url != "http://127.0.0.1:4000/"

### Create a crawl function that opens a single URL (it will be called in parallel later)

In [33]:
# Crawl helper
def crawl(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: The address to open; passed straight to ``urlopen``.

    Returns:
        The full response body, decoded with ``bytes.decode()``'s default
        encoding (UTF-8).

    Raises:
        Whatever ``urlopen`` raises when the request fails, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # The context manager closes the response (and its underlying connection)
    # even when read() or decode() raises, so crawling many pages does not
    # leak open sockets.
    with urlopen(url) as response:
        # Slight delay before downloading the body.
        time.sleep(0.1)
        return response.read().decode()

### Create a parse function (解析函数) that extracts the results we need from a page (it will be called in parallel later)

In [34]:
def parse(html):
    """Extract the title, the outgoing site links and the page's own URL.

    Args:
        html: HTML source of one page, as text.

    Returns:
        A tuple ``(title, page_urls, url)``:

        * ``title`` - stripped text of the first ``<h1>``, or ``''`` when the
          page has no ``<h1>``.
        * ``page_urls`` - set of absolute URLs built by joining ``base_url``
          with every ``<a href>`` that starts and ends with ``/``.
        * ``url`` - the ``content`` of the ``<meta property="og:url">`` tag,
          or ``None`` when the tag (or its ``content``) is missing.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Only site-relative links of the form "/something/" are followed.
    anchors = soup.find_all('a', {'href': re.compile('^/.+?/$')})
    page_urls = {urljoin(base_url, anchor['href']) for anchor in anchors}

    # find() returns None when the tag is absent; guard so that a single
    # page without a heading does not abort the whole crawl.
    heading = soup.find('h1')
    # strip() removes leading/trailing whitespace from the heading text.
    title = heading.get_text().strip() if heading is not None else ''

    # URL of the crawled page itself, as advertised by the page.
    og_url = soup.find('meta', {'property': "og:url"})
    url = og_url.get('content') if og_url is not None else None
    return title, page_urls, url
    

In [35]:
# Baseline: crawl and parse every page one after another in this process.
unseen = {base_url}             # URLs discovered but not yet visited
seen = set()                    # URLs already visited

count = 1
t1 = time.time()

while unseen:                   # keep going while there is something to visit
    if restricted_crawl and len(seen) > 20:
        break

    print('\nDistributed Crawling...')
    htmls = list(map(crawl, unseen))

    print('\nDistributed Parsing...')
    results = list(map(parse, htmls))

    print('\nAnalysing...')
    seen |= unseen              # everything just crawled is now seen
    unseen.clear()              # start the next round from an empty frontier

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen |= page_urls - seen      # queue only links not visited yet

elapsed = time.time() - t1
print('Total time: %.1f s' % (elapsed, ))    # 53 s


Distributed Crawling...

Distributed Parsing...

Analysing...
1 教程 https://morvanzhou.github.io/

Distributed Crawling...

Distributed Parsing...

Analysing...
2 说吧~ https://morvanzhou.github.io/discuss/
3 Pytorch 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/torch/
4 高级爬虫: 高效无忧的 Scrapy 爬虫库 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-02-scrapy/
5 机器学习实践 https://morvanzhou.github.io/tutorials/machine-learning/ML-practice/
6 Why? https://morvanzhou.github.io/tutorials/data-manipulation/scraping/1-00-why/
7 进化算法 Evolutionary Strategies 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/evolutionary-algorithm/
8 基础教程系列 https://morvanzhou.github.io/tutorials/python-basic/basic/
9 强化学习 Reinforcement Learning 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
10 multiprocessing 多进程教程系列 https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/
11 Matplotlib 画图教程系列 https://morvanzhou.github.io/tutoria

In [36]:

# Same crawl as the sequential version, but crawling and parsing are handed
# to a pool of worker processes.
unseen = set([base_url,])
seen = set()

count, t1 = 1, time.time()
# Number of worker processes. The pool is used as a context manager so the
# workers are always shut down when the cell finishes or raises; otherwise
# every run of this cell would leave four idle processes behind.
with mp.Pool(4) as pool:
    while len(unseen) != 0:                 # still get some url to visit
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]       # wait for every download

        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]     # wait for every parse

        print('\nAnalysing...')
        seen.update(unseen)         # seen the crawled
        unseen.clear()              # nothing unseen

        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)     # get new url to crawl
    # Report before leaving the `with` block so pool shutdown is not timed.
    print('Total time: %.1f s' % (time.time()-t1, ))    # 16 s !!!



Distributed Crawling...

Distributed Parsing...

Analysing...
1 教程 https://morvanzhou.github.io/

Distributed Crawling...

Distributed Parsing...

Analysing...
2 说吧~ https://morvanzhou.github.io/discuss/
3 Pytorch 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/torch/
4 高级爬虫: 高效无忧的 Scrapy 爬虫库 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-02-scrapy/
5 机器学习实践 https://morvanzhou.github.io/tutorials/machine-learning/ML-practice/
6 Why? https://morvanzhou.github.io/tutorials/data-manipulation/scraping/1-00-why/
7 进化算法 Evolutionary Strategies 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/evolutionary-algorithm/
8 基础教程系列 https://morvanzhou.github.io/tutorials/python-basic/basic/
9 强化学习 Reinforcement Learning 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
10 multiprocessing 多进程教程系列 https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/
11 Matplotlib 画图教程系列 https://morvanzhou.github.io/tutoria