Commit

Refactored the Qiushibaike spider to use a coroutine pool (high performance)
Kr1s77 committed Mar 31, 2019
1 parent 7e34b2e commit 1c7d3fd
Showing 1 changed file with 78 additions and 83 deletions.
161 changes: 78 additions & 83 deletions qsbk/qiushibaike.py
@@ -1,88 +1,83 @@
#!/usr/bin/env python3.6
# coding=utf-8
# !/usr/bin/python3
# -*- coding: utf-8 -*-

# 1. Import the coroutine (greenlet) pool module
# gevent coroutine pool
import gevent.monkey
gevent.monkey.patch_all()
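# patch_all() swaps blocking standard-library calls for cooperative ones, so
# requests and Queue operations inside a greenlet yield to the other greenlets
# instead of stalling the whole process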
from gevent.pool import Pool
from queue import Queue
import requests
from lxml import etree
import os
import sys

"""
info:
author:CriseLYJ
github:https://github.com/CriseLYJ/
update_time:2019-3-9
"""
page_init = "https://www.qiushibaike.com/text/"
joke_init = "https://www.qiushibaike.com/article/"
SAVE_PATH = os.path.join(os.getcwd(), 'jokes/')
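# each joke is written by Spider.download() to ./jokes/<joke_id>.txt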


class Spider(object):
    def __init__(self, page_num):
        self.page_num = int(page_num)
        # the first page has no /page/n/ suffix, so it is handled separately
        self.page_urls = ["https://www.qiushibaike.com/text/"]
        # page_url -> joke_url
        self.joke_urls = []
        # joke_url -> joke_name joke_content
        # dict name : content
        self.joke_content = ""
        self.joke_id = 1
        try:
            os.mkdir(SAVE_PATH)
        except Exception as e:
            print(e)

    def get_page_urls(self):
        if self.page_num > 1:
            # build the URLs for the remaining pages
            for n in range(2, self.page_num + 1):
                page_url = page_init + 'page/' + str(n) + '/'
                self.page_urls.append(page_url)

    def get_joke_urls(self):
        for page_url in self.page_urls:
            html = requests.get(page_url).content
            selector = etree.HTML(html)
            qiushi_id = selector.xpath('/html/body/div[@id="content"]/div/div[@id="content-left"]/div/@id')
            for q_id in qiushi_id:
                id = q_id.split('_')[2]
                joke_url = joke_init + id + '/'
                print(joke_url)
                self.joke_urls.append(joke_url)

    def get_joke(self):
        for joke_url in self.joke_urls:
            html = requests.get(joke_url).content
            selector = etree.HTML(html)
            one_joke = selector.xpath('//div[@class="word"]/div/text()')
            self.joke_content = ""
            for words in one_joke:
                self.joke_content += words + '\n'
            self.download()

    def download(self):
        joke_path = SAVE_PATH + str(self.joke_id) + '.txt'
        self.joke_id += 1
        # path of the joke file
        print(joke_path)
        with open(joke_path, "w", encoding="utf-8") as f:
            f.write(self.joke_content)

    def start(self):
        # build the listing page URLs
        self.get_page_urls()
        # collect the individual joke URLs
        self.get_joke_urls()
        # fetch each joke; get_joke() already calls download() for every joke
        self.get_joke()

if __name__ == '__main__':
    # ask how many pages of jokes to crawl
    page_num = input('How many pages of jokes would you like to fetch? ')
    qb = Spider(page_num)
    # start the spider
    qb.start()


class QiushiSpider():

    def __init__(self, max_page):
        self.max_page = max_page
        # 2. Create the coroutine pool and set the number of workers
        self.pool = Pool(5)

        self.base_url = "http://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }

        # queue that holds the page URLs waiting to be crawled
        self.url_queue = Queue()

    def get_url_list(self):
        '''
        Build the page URLs and put them into the url queue.
        :return:
        '''
        for page in range(1, self.max_page + 1):
            url = self.base_url.format(page)
            self.url_queue.put(url)

    # 3. The task executed by each pool worker
    def exec_task(self):
        # 1> take a URL from the queue (blocks this greenlet until one is available)
        url = self.url_queue.get()

        # 2> send the request and fetch the html
        response = requests.get(url, headers=self.headers)
        html = response.text

        # 3> parse the html and extract the data
        eroot = etree.HTML(html)

        titles = eroot.xpath('//a[@class="recmd-content"]/text()')
        for title in titles:
            item = {}
            item["title"] = title

            # 4> save the data (just printed here)
            print(item)

        # mark the URL as processed so url_queue.join() in run() can return
        self.url_queue.task_done()



    # 4. Callback that runs after a task finishes; it must accept at least one argument:
    # result is the return value of the finished task
    def exec_task_finished(self, result):
        print("result:", result)
        print("task finished")
        # submit the next task, so the pool keeps five downloads in flight
        # until the queue is empty
        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)


    def run(self):

        self.get_url_list()

        # 5. Hand the tasks to the pool and register the callback to run
        #    after each task finishes
        for i in range(5):
            self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
        # block until every queued URL has been processed
        self.url_queue.join()

if __name__ == '__main__':
    max_page = input("How many pages would you like to crawl? ")
    spider = QiushiSpider(int(max_page))
    spider.run()
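
For comparison, here is a minimal sketch (not part of this commit) of the same
gevent coroutine-pool idea driven by Pool.map instead of apply_async plus a
callback; the fetch() helper and the page range are illustrative assumptions,
and only the base URL is taken from the code above.

    # sketch: fetch listing pages concurrently with a gevent pool
    import gevent.monkey
    gevent.monkey.patch_all()

    from gevent.pool import Pool
    import requests

    BASE_URL = "http://www.qiushibaike.com/8hr/page/{}/"

    def fetch(page):
        # each call runs in its own greenlet; the blocking request yields
        # to the other greenlets while it waits on the network
        response = requests.get(BASE_URL.format(page), timeout=10)
        return page, response.status_code

    if __name__ == '__main__':
        pool = Pool(5)  # at most five requests in flight at once
        for page, status in pool.map(fetch, range(1, 6)):
            print(page, status)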
