-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider.py
40 lines (33 loc) · 1.52 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from .spider_utils import UrlManger
from .spider_utils import HTMLDownloader
from .spider_utils import HTMLParser
from .spider_utils import DataOutput
class SpiderMan(object):
    """Crawler coordinator.

    Wires together the URL manager, HTML downloader, HTML parser and data
    store, and drives a breadth-first crawl starting from ``root_url``.
    """

    def __init__(self, root_url, max_pages=100):
        """Set up collaborators for a crawl rooted at *root_url*.

        Args:
            root_url: Entry-point URL seeded into the URL manager.
            max_pages: Stop after this many pages have been visited
                (generalized from the previously hard-coded 100;
                default preserves the original behavior).
        """
        self.root_url = root_url
        self.max_pages = max_pages
        self.url_manager = UrlManger()
        self.html_downloader = HTMLDownloader()
        self.data_output = DataOutput()

    def crawl(self):
        """Crawl from root_url until max_pages pages are visited or no
        unvisited URLs remain, then write the collected data as JSON."""
        # Seed the URL manager's pending set with the root entry point.
        self.url_manager.add_new_url(self.root_url)
        # Bug fix: the original condition was `old_urls_size != 100`, which
        # never terminates if the visited count skips past the limit; `<` is
        # robust. It also spun forever when no new URL was pending — we now
        # break out of the loop instead.
        while self.url_manager.old_urls_size < self.max_pages:
            if not self.url_manager.has_new_url():
                break
            # Take one unvisited URL from the manager.
            new_url = self.url_manager.get_new_url()
            # Fetch the page; the downloader returns the HTML as text.
            html_text = self.html_downloader.download(new_url)
            # Extract the page's data plus any newly discovered links.
            data, new_urls = HTMLParser.parser(new_url, html_text)
            # Hand the extracted data to the data store.
            self.data_output.store_data(data)
            # Queue the discovered links as pending URLs.
            self.url_manager.add_new_urls(new_urls)
        # Flush everything collected so far to JSON output.
        self.data_output.output_json()
def main():
    """Script entry point: crawl the Baidu Baike article on web crawlers."""
    entry_url = "https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB"
    SpiderMan(entry_url).crawl()


if __name__ == "__main__":
    main()