# 最简单的爬取方式

Derived from [Morvan Python-爬虫](https://github.com/MorvanZhou/easy-scraping-tutorial/tree/master/notebook)

In [10]:
from urllib.request import urlopen

# The page contains Chinese text, so decode the raw bytes as UTF-8.
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
) as response:
    html = response.read().decode('utf-8')

print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="https://morvanzhou.github.io/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


#### 用正则表达式选取文本信息

In [5]:
import re

# Non-greedy capture of everything between the opening and closing <title> tags.
title_pattern = re.compile(r'<title>(.+?)</title>')
res = title_pattern.findall(html)
print("\nPage title is: ", res[0])


Page title is:  Scraping tutorial 1 | 莫烦Python


In [8]:
# DOTALL lets "." match newlines too, because the <p> element spans several lines.
paragraph_pattern = re.compile(r"<p>(.*?)</p>", flags=re.DOTALL)
res = paragraph_pattern.findall(html)
print("\nPage paragraph is: ", res[0])


Page paragraph is:  
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	


In [9]:
# Collect the value of every href="..." attribute in the page.
href_pattern = re.compile(r'href="(.*?)"')
res = href_pattern.findall(html)
print("\nAll links: ", res)


All links:  ['https://morvanzhou.github.io/static/img/description/tab_icon.png', 'https://morvanzhou.github.io/', 'https://morvanzhou.github.io/tutorials/data-manipulation/scraping/']


# BeautifulSoup

高级匹配

中文官网 https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/ 

In [13]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# The with-block closes the HTTP response as soon as the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html") as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="https://morvanzhou.github.io/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


In [14]:
soup = BeautifulSoup(html, features='lxml')  # parse the document with the lxml backend
heading = soup.find('h1')            # first <h1> tag (same as soup.h1)
first_paragraph = soup.find('p')     # first <p> tag (same as soup.p)
print(heading)
print('\n', first_paragraph)

<h1>爬虫测试1</h1>

 <p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>


In [15]:
link_tags = soup.find_all('a')  # every <a> tag in the document
print(link_tags)
all_href = [tag['href'] for tag in link_tags]  # keep only each tag's href attribute
print('\n', all_href)

[<a href="https://morvanzhou.github.io/">莫烦Python</a>, <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a>]

 ['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/tutorials/data-manipulation/scraping/']


In [20]:
# Alternative: print each link while iterating over the tags.
all_href = soup.find_all('a')
for anchor in all_href:
    print(anchor['href'])

https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/


### Beautifulsoup: find by CSS class

In [24]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/list.html") as response:
    html = response.read().decode('utf-8')
print(html[:280])

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>爬虫练习 列表 class | 莫烦 Python</title>
	<style>
	.jan {
		background-color: yellow;
	}
	.feb {
		font-size: 25px;
	}
	.month {
		color: red;
	}
	</style>
</head>

<body>

<h1>列表 爬虫练习</h1>

<p>这是一个在 <a href="https:


> 上述html中CSS代码部分为:

	<style\>
	.jan {
		background-color: yellow;
	}
	.feb {
		font-size: 25px;
	}
	.month {
		color: red;
	}
    </style\>

其中jan、feb、month为class

网页源代码

<img src="web_source_code.png">

In [30]:
soup = BeautifulSoup(html, features='lxml')

# The page also holds plain <li> items (e.g. <li>一月一号</li>), so filter on the
# "month" class to keep only the entries that are actually wanted.
month = soup.find_all('li', attrs={"class": "month"})
for m in month:
    # Each m is a tag such as <li class="month">五月</li>; get_text() keeps only the text.
    print(m.get_text())

一月
二月
三月
四月
五月


In [37]:
# Second example: locate <ul class="jan">, then search for <li> items inside it only.
jan = soup.find('ul', attrs={"class": 'jan'})
print(jan, "\n")

d_jan = jan.find_all('li')
print(d_jan, "\n")

for day in d_jan:
    print(day.get_text())

<ul class="jan">
<li>一月一号</li>
<li>一月二号</li>
<li>一月三号</li>
</ul> 

[<li>一月一号</li>, <li>一月二号</li>, <li>一月三号</li>] 

一月一号
一月二号
一月三号


### Beautifulsoup: find by regular expression（正则表达）

In [43]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# The with-block closes the HTTP response as soon as the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')
print(html[700:])

			Tensorflow 神经网络</a>
		</td><td>
			2:00
		</td><td>
			<img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg">
		</td>
	</tr>

	<tr id="course2" class="ml">
		<td>
			机器学习
		</td><td>
			<a href="https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/">
				强化学习</a>
		</td><td>
			5:00
		</td><td>
			<img src="https://morvanzhou.github.io/static/img/course_cover/rl.jpg">
		</td>
	</tr>

	<tr id="course3" class="data">
		<td>
			数据处理
		</td><td>
			<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">
				爬虫</a>
		</td><td>
			3:00
		</td><td>
			<img src="https://morvanzhou.github.io/static/img/course_cover/scraping.jpg">
		</td>
	</tr>

</table>

</body>
</html>


In [39]:
# Extract the .jpg images.
soup = BeautifulSoup(html, features='lxml')

# Raw string: "\." has to reach the regex engine as an escaped dot. In a plain
# string literal it is an invalid escape sequence that newer Python versions warn about.
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])

https://morvanzhou.github.io/static/img/course_cover/tf.jpg
https://morvanzhou.github.io/static/img/course_cover/rl.jpg
https://morvanzhou.github.io/static/img/course_cover/scraping.jpg


In [44]:
# Keep only the <a> tags whose href matches the tutorial site's URL pattern.
morvan_href = re.compile('https://morvan.*')
course_links = soup.find_all('a', attrs={'href': morvan_href})
for link in course_links:
    print(link['href'])

https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/
https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/


## Practice: scrape Baidu Baike
- https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711
<img src="page.png">

In [45]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random

# Site root; every entry in the history below is a path relative to it.
base_url = "https://baike.baidu.com"
# Crawl history used as a stack of visited paths, seeded with the starting article.
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]

In [46]:
# Visit the most recent entry of the history.
url = base_url + his[-1]
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen(url) as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
print(soup.find('h1').get_text(), '    url: ', his[-1])

网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711


<img src='p2.png'/>

#### Find all sub_urls（超链接） on the Baidu Baike item page, randomly select one sub_url and store it in "his". If no valid sub link is found, then pop the last url in "his".

In [49]:
# Pick one in-article link at random (for example the "World Wide Web" entry).
# Only links opening in a new tab whose path is /item/ followed solely by
# %XX-escaped bytes are accepted.
sub_urls = soup.find_all("a", {"target": "_blank", "href": re.compile("/item/(%.{2})+$")})
print(sub_urls)

if sub_urls:
    # random.choice returns a single element directly.
    his.append(random.choice(sub_urls)['href'])
else:
    # No valid sub link found: step back to the previous page.
    his.pop()
print(his)

[<a href="/item/%E4%B8%87%E7%BB%B4%E7%BD%91" target="_blank">万维网</a>, <a href="/item/%E7%BD%91%E7%BB%9C" target="_blank">网络</a>, <a href="/item/%E4%B8%87%E7%BB%B4%E7%BD%91" target="_blank">万维网</a>, <a href="/item/%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E" target="_blank">搜索引擎</a>, <a href="/item/%E9%80%9A%E7%94%A8%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E" target="_blank">通用搜索引擎</a>, <a href="/item/%E7%BD%91%E7%BB%9C%E6%95%B0%E6%8D%AE" target="_blank">网络数据</a>, <a href="/item/%E4%B8%87%E7%BB%B4%E7%BD%91" target="_blank">万维网</a>, <a href="/item/%E7%BD%91%E7%BB%9C%E6%8A%80%E6%9C%AF" target="_blank">网络技术</a>, <a href="/item/%E9%80%9A%E7%94%A8%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E" target="_blank">通用搜索引擎</a>, <a href="/item/%E5%85%B3%E9%94%AE%E5%AD%97" target="_blank">关键字</a>, <a href="/item/%E4%B8%87%E7%BB%B4%E7%BD%91" target="_blank">万维网</a>, <a href="/item/%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E" target="_blank">搜索引擎</a>, <a href="/item/%E6%90%9C%E7%B4%A2%E7%AD%96%E7%95%A5" target="_blank">搜索策略</a>, <a

<img src='p3.png'/>

#### Put everything together. Run randomly for 20 iterations and see what we end up with.

In [57]:
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]

for i in range(20):
    if not his:
        # Every page on the stack was a dead end; his[-1] would raise IndexError.
        break

    url = base_url + his[-1]  # follow the most recently chosen link

    # The with-block closes the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')
    print(i, soup.find('h1').get_text(), '    url: ', his[-1])

    # Valid links: open in a new tab and consist of /item/ plus %XX-escaped bytes only.
    sub_urls = soup.find_all("a", {"target": "_blank", "href": re.compile("/item/(%.{2})+$")})

    if sub_urls:
        his.append(random.choice(sub_urls)['href'])
    else:
        # No valid sub link found: step back to the previous page.
        his.pop()

0 网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711
1 搜索引擎     url:  /item/%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E
2 全球信息网格     url:  /item/%E5%85%A8%E7%90%83%E4%BF%A1%E6%81%AF%E7%BD%91%E6%A0%BC
3 服务器     url:  /item/%E6%9C%8D%E5%8A%A1%E5%99%A8
4 大脑     url:  /item/%E5%A4%A7%E8%84%91
5 中央沟     url:  /item/%E4%B8%AD%E5%A4%AE%E6%B2%9F
6 额叶     url:  /item/%E9%A2%9D%E5%8F%B6
7 运动障碍     url:  /item/%E8%BF%90%E5%8A%A8%E9%9A%9C%E7%A2%8D
8 下运动神经元     url:  /item/%E4%B8%8B%E8%BF%90%E5%8A%A8%E7%A5%9E%E7%BB%8F%E5%85%83
9 腱反射     url:  /item/%E6%B7%B1%E5%8F%8D%E5%B0%84
10 前臂     url:  /item/%E5%89%8D%E8%87%82
11 肌肉组织     url:  /item/%E8%82%8C%E8%82%89%E7%BB%84%E7%BB%87
12 安德烈·海姆     url:  /item/%E6%B5%B7%E5%A7%86
13 英国皇家学会     url:  /item/%E7%9A%87%E5%AE%B6%E5%AD%A6%E4%BC%9A
14 西蒙·唐纳森     url:  /item/%E8%A5%BF%E8%92%99%C2%B7%E5%94%90%E7%BA%B3%E6%A3%AE
15 伦敦     url:  /item/%E4%BC%A6%E6%95%A6
16 泰晤士河     url:  /item/%E6%B3%B0%E6%99%A4%E5%A3%AB%E6%B2%B3
17 地中海     url:  /item/%E5%9C%B0

## requests: an alternative to urllib

在加载网页的时候, 有几种类型, 而这几种类型就是打开网页的关键. 最重要的类型 (method) 就是 get 和 post (当然还有其他的, 比如 head, delete).

#### post
- 账号登录
- 搜索内容
- 上传图片
- 上传文件
- 往服务器传数据

#### get
- 正常打开网页
- 不往服务器传数据


In [59]:
import requests
import webbrowser
param = {"wd": "Python爬虫"}
r = requests.get('http://www.baidu.com/s', params=param)
search_url = r.url          # final URL including the encoded query string
print(search_url)
webbrowser.open(search_url) # open the result page in the browser

http://www.baidu.com/s?wd=Python%E7%88%AC%E8%99%AB


True

<img src='p4.png'/>

## post

In [65]:
# Form fields submitted in the body of the POST request.
data = dict(firstname='Mengyu', lastname='Ge')
r = requests.post('http://pythonscraping.com/pages/files/processing.php', data=data)
print(r.text)

Hello there, Mengyu Ge!


<img src="p5.png" style="width:400px;height:200px;float:right">

<img src="p6.png" style="width:400px;height:150px;float:left">

<img src="p7.png" />

### Upload image

We still use the post function to upload an image to this [page](http://pythonscraping.com/files/form2.html).

In [72]:
# Open the image inside a with-block so the file handle is closed once the
# upload request has finished (or failed).
with open('p2.png', 'rb') as image_file:
    file = {'uploadFile': image_file}
    r = requests.post('http://pythonscraping.com/files/processing2.php', files=file)
print(r.text)

uploads/
Sorry, there was an error uploading your file.


## login
Use post method to login to a [website](http://pythonscraping.com/pages/cookies/login.html)

这个网有点炸，所以有的情况下不会正常工作

In [76]:
payload = {'username': 'Mengyu', 'password': 'password'}
login_response = requests.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
login_cookies = login_response.cookies
print(login_cookies.get_dict())
# The second request authenticates by re-sending the cookies from the login response.
r = requests.get('http://pythonscraping.com/pages/cookies/profile.php', cookies=login_cookies)
# print(r.text)

{'loggedin': '1', 'username': 'Mengyu'}


## another general way to login

Use a session instead of separate requests calls. It keeps you in one session and keeps track of the cookies.

（用一系列对话来控制cookies, requests.Session()）

In [78]:
# The with-block closes the session and its pooled connections when done.
with requests.Session() as session:
    payload = {'username': 'Mengyu', 'password': 'password'}
    r = session.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
    print(r.cookies.get_dict())
    # No cookies argument here: the session carries them over from the login call.
    r = session.get("http://pythonscraping.com/pages/cookies/profile.php")
# print(r.text)

{'loggedin': '1', 'username': 'Mengyu'}


## Download something

In [79]:
# 下载图片
import os
# Create the download folder; exist_ok=True keeps a re-run from failing if it already exists.
os.makedirs('./img/', exist_ok=True)

# Image fetched by the download examples below.
IMAGE_URL = "https://morvanzhou.github.io/static/img/description/learning_step_flowchart.png"

#### 使用urlretrieve下载图片

In [80]:
from urllib.request import urlretrieve
# Fetch IMAGE_URL and store it at ./img/image1.png in a single call.
urlretrieve(IMAGE_URL, './img/image1.png')      # whole document

('./img/image1.png', <http.client.HTTPMessage at 0x265363ad828>)

#### 使用requests下载图片

In [81]:
import requests
r = requests.get(IMAGE_URL)
# Fail loudly on an HTTP error instead of saving the error page as a .png file.
r.raise_for_status()
with open('./img/image2.png', 'wb') as f:
    f.write(r.content)                      # whole document, already held in memory

上述两种方法都是先下载到内存里面，再存到本地，不适合下载大批量文件

#### download chunk by chunk

Set stream = True in get() function. 这样更高效

In [82]:
# stream=True defers downloading the body until it is iterated. The with-block
# releases the connection even if writing the file fails part-way.
with requests.get(IMAGE_URL, stream=True) as r:
    # Fail loudly on an HTTP error instead of saving the error page as a .png file.
    r.raise_for_status()
    with open('./img/image3.png', 'wb') as f:
        # 8 KiB chunks: the file content is the same as with 32-byte chunks,
        # with far fewer loop iterations and write calls.
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

## Download images from web

Download amazing pictures from [national geographic](http://www.ngchina.com.cn/animals/)

In [105]:
from bs4 import BeautifulSoup
import requests

# Listing page whose images are downloaded below.
URL = "http://www.ngchina.com.cn/animals/"

#### find list of image holder

<img src='p9.png'/>

In [106]:
import os
from urllib.parse import urljoin

html = requests.get(URL).text
soup = BeautifulSoup(html, 'lxml')
# Every <ul class="img_list"> is a container of images.
img_ul = soup.find_all('ul', {"class": "img_list"})

os.makedirs('./img2/', exist_ok=True)

for ul in img_ul:
    for img in ul.find_all('img'):
        src = img.get('src')
        if not src:
            # An <img> without a src has nothing to download (img['src'] would raise KeyError).
            continue
        # Resolve relative paths against the listing page; absolute URLs pass through unchanged.
        url = urljoin(URL, src)
        image_name = url.split('/')[-1]
        # Both the streamed response and the output file are closed by the with-block.
        with requests.get(url, stream=True) as r, open('./img2/%s' % image_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=128):
                f.write(chunk)

## Distributed scraping: multiprocessing

In [110]:
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re

base_url = 'https://morvanzhou.github.io/'

# DON'T OVER CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
# Only the local development server may be crawled without a page limit.
restricted_crawl = base_url != "http://127.0.0.1:4000/"

In [109]:
def crawl(url):
    """Download ``url`` and return its body as text.

    Args:
        url: address to open with ``urlopen``.

    Returns:
        The response body decoded with the default codec (UTF-8).
    """
    # The with-block closes the response even if reading or decoding fails.
    with urlopen(url) as response:
        time.sleep(0.1)             # slightly delay for downloading
        return response.read().decode()

In [111]:
def parse(html):
    """Extract the title, the internal links and the canonical URL of one page.

    Args:
        html: HTML source of the page as a string.

    Returns:
        A ``(title, page_urls, url)`` tuple:
        ``title`` is the stripped text of the first ``<h1>`` (``''`` when the
        page has none), ``page_urls`` is the set of absolute URLs built from
        every ``<a>`` whose href starts and ends with ``/``, and ``url`` is the
        ``content`` of the ``og:url`` meta tag (``None`` when it is absent).
    """
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})

    # A page without <h1> must not abort the whole crawl.
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''

    # Site-relative hrefs are made absolute against base_url; the set drops duplicates.
    page_urls = {urljoin(base_url, link['href']) for link in urls}

    # Same for a page that lacks the og:url meta tag or its content attribute.
    og_url = soup.find('meta', {'property': "og:url"})
    url = og_url.get('content') if og_url is not None else None
    return title, page_urls, url

#### 正常不使用multiprocessing

In [112]:
unseen = {base_url}     # URLs queued for the next round
seen = set()            # URLs that have already been crawled

count, t1 = 1, time.time()

while unseen:                           # keep going while something is left to visit
    if restricted_crawl and len(seen) > 20:
        break

    print('\nDistributed Crawling...')
    htmls = [crawl(url) for url in unseen]

    print('\nDistributed Parsing...')
    results = [parse(html) for html in htmls]

    print('\nAnalysing...')
    seen |= unseen              # everything just crawled counts as seen
    unseen.clear()              # start the next round from an empty queue

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen |= page_urls - seen          # queue only links not crawled yet
print('Total time: %.1f s' % (time.time()-t1, ))


Distributed Crawling...

Distributed Parsing...

Analysing...
1 教程 https://morvanzhou.github.io/

Distributed Crawling...

Distributed Parsing...

Analysing...
2 进化算法 Evolutionary Strategies 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/evolutionary-algorithm/
3 迁移学习 Transfer Learning https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/5-16-transfer-learning/
4 机器学习实践 https://morvanzhou.github.io/tutorials/machine-learning/ML-practice/
5 基础教程系列 https://morvanzhou.github.io/tutorials/python-basic/basic/
6 Tkinter GUI 教程系列 https://morvanzhou.github.io/tutorials/python-basic/tkinter/
7 multiprocessing 多进程教程系列 https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/
8 Numpy & Pandas 教程系列 https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/
9 Pytorch 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/torch/
10 关于莫烦 https://morvanzhou.github.io/about/
11 数据处理教程系列 https://morvanzhou.github.io/tutorials/data-manipulation/
12 其他教学系

#### multiprocessing

In [None]:
unseen = set([base_url,])
seen = set()

# The with-block shuts the four worker processes down when the crawl ends,
# including when a job raises; the original pool was never closed.
with mp.Pool(4) as pool:
    count, t1 = 1, time.time()
    while len(unseen) != 0:                 # still get some url to visit
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        # Submit every download first, then collect, so the workers run in parallel.
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]                                       # request connection

        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]                                     # parse html

        print('\nAnalysing...')
        seen.update(unseen)         # seen the crawled
        unseen.clear()              # nothing unseen

        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)     # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1, ))

### 异步加载

#### 不用异步

In [1]:
import time

def job(t):
    """Simulate a blocking task: announce the start, sleep ``t`` seconds, announce the end."""
    print('Start job ', t)
    time.sleep(t)               # blocks for "t" seconds; nothing else can run meanwhile
    print('Job ', t, ' takes ', t, ' s')
    
def main():
    """Run the 1-second job and then the 2-second job, one after the other."""
    for seconds in range(1, 3):
        job(seconds)

# Measure the wall-clock time of the sequential run.
t1 = time.time()
main()
print("NO async total time : ", time.time() - t1)

Start job  1
Job  1  takes  1  s
Start job  2
Job  2  takes  2  s
NO async total time :  3.0010645389556885


#### 使用异步
[参考](https://github.com/MorvanZhou/easy-scraping-tutorial/blob/master/notebook/4-2-asyncio.ipynb)

In [None]:
import asyncio

async def job(t):
    """Coroutine job: announce the start, await a ``t``-second sleep, announce the end."""
    print('Start job ', t)
    await asyncio.sleep(t)          # wait for "t" seconds; other jobs can run while this one awaits
    print('Job ', t, ' takes ', t, ' s')

async def main(loop):
    """Run the 1-second and the 2-second job concurrently.

    Args:
        loop: asyncio event loop on which the job tasks are created.
    """
    # Creating the tasks only schedules them; they start once this coroutine awaits.
    tasks = [loop.create_task(job(t)) for t in range(1, 3)]
    # gather waits for all tasks and re-raises a job's exception here, whereas
    # asyncio.wait would leave it unretrieved inside the task object.
    await asyncio.gather(*tasks)

# `time` is not imported in this cell; it comes from the earlier synchronous cell.
# NOTE(review): if the notebook kernel already runs an event loop,
# run_until_complete may refuse to start — confirm in the target environment.
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))

# loop.close()                          # left disabled: closing the loop gives an error in IPython notebook
print("Async total time : ", time.time() - t1)

结果：
<img src='p10.png'/>

## 高级爬虫

### Selenium tutorial
[5-1-selenium.ipynb](https://github.com/MorvanZhou/easy-scraping-tutorial/blob/master/notebook/5-1-selenium.ipynb)

In [12]:
import os
# Create the ./img3/ folder; exist_ok=True keeps a re-run from failing if it already exists.
os.makedirs('./img3/', exist_ok=True)

In [14]:
# http://selenium-python.readthedocs.io/installation.html 进入这个网址下载驱动
from selenium import webdriver

  return f(*args, **kwds)


至于下面这些代码是怎么写的，需要用一个火狐浏览器插件，[下载网址](https://addons.mozilla.org/en-US/firefox/addon/katalon-automation-record/)，这个插件需要先人为模拟点击，然后记录，生成python文件

In [20]:
# Guide for Windows users: https://www.cnblogs.com/yqpy/p/8306066.html
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    driver.get("https://morvanzhou.github.io/")
    # find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
    # which were removed in Selenium 4.
    driver.find_element(By.XPATH, u"//img[@alt='强化学习 (Reinforcement Learning)']").click()
    driver.find_element(By.LINK_TEXT, "About").click()
    driver.find_element(By.LINK_TEXT, u"赞助").click()
    driver.find_element(By.LINK_TEXT, u"教程 ▾").click()
    driver.find_element(By.LINK_TEXT, u"数据处理 ▾").click()
    driver.find_element(By.LINK_TEXT, u"网页爬虫").click()

    html = driver.page_source       # get html
    # Save into ./img3/, the folder created for this section.
    driver.get_screenshot_as_file("./img3/sreenshot1.png")
finally:
    # quit() ends the browser and the driver process even when a step above fails;
    # close() would only close the current window.
    driver.quit()
print(html[:200])

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" lang="zh-CN en"><head>
  <meta charset="UTF-8" />
  <meta name="vie


#### 如果不想让网页真正展示出来

In [21]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--headless")       # define headless

# Pass the options when creating the driver; `options=` replaces the deprecated
# `chrome_options=` keyword.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://morvanzhou.github.io/")
    # find_element(By.<strategy>, value) replaces the find_element_by_* helpers,
    # which were removed in Selenium 4.
    driver.find_element(By.XPATH, u"//img[@alt='强化学习 (Reinforcement Learning)']").click()
    driver.find_element(By.LINK_TEXT, "About").click()
    driver.find_element(By.LINK_TEXT, u"赞助").click()
    driver.find_element(By.LINK_TEXT, u"教程 ▾").click()
    driver.find_element(By.LINK_TEXT, u"数据处理 ▾").click()
    driver.find_element(By.LINK_TEXT, u"网页爬虫").click()

    html = driver.page_source           # get html
    # The screenshot confirms the headless browser reached the page.
    # Save into ./img3/, the folder created for this section.
    driver.get_screenshot_as_file("./img3/sreenshot2.png")
finally:
    # quit() ends the browser and the driver process even when a step above fails;
    # close() would only close the current window.
    driver.quit()
print(html[:200])

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml" lang="zh-CN en"><head>
  <meta charset="UTF-8" />
  <meta name="vie


## Scrapy框架

[scrapy.org](https://scrapy.org/)

In [22]:
import scrapy


class MofanSpider(scrapy.Spider):
    """Spider that starts at the site root and follows every site-relative link.

    No explicit seen/unseen sets are kept here: Scrapy deals with visited
    URLs automatically.
    """
    name = "mofan"
    start_urls = ['https://morvanzhou.github.io/']

    def parse(self, response):
        """Yield one title/url record for the page, then a request per sub-URL."""
        raw_title = response.css('h1::text').extract_first(default='Missing')
        yield {     # the scraped result for this page
            'title': raw_title.strip().replace('"', ""),
            'url': response.url,
        }

        # Links that start and end with "/" are treated as sub-pages to follow.
        for url in response.css('a::attr(href)').re(r'^/.+?/$'):
            yield response.follow(url, callback=self.parse)     # it will filter duplication automatically

# Run from a terminal with: scrapy runspider 5-2-scrapy.py -o res.json