Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 1 changed file with 78 additions and 83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,88 +1,83 @@ | ||
#!/usr/bin/env python3.6 | ||
# coding=utf-8 | ||
# !/usr/bin/python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
# 1. 导入线程池模块 | ||
# 线程池 | ||
import gevent.monkey | ||
gevent.monkey.patch_all() | ||
from gevent.pool import Pool | ||
from queue import Queue | ||
import requests | ||
from lxml import etree | ||
import os | ||
import sys | ||
|
||
""" | ||
info: | ||
author:CriseLYJ | ||
github:https://github.com/CriseLYJ/ | ||
update_time:2019-3-9 | ||
""" | ||
page_init = "https://www.qiushibaike.com/text/" | ||
joke_init = "https://www.qiushibaike.com/article/" | ||
SAVE_PATH = os.path.join(os.getcwd(), 'jokes/') | ||
|
||
|
||
class Spider(object):
    """Scrape text jokes from qiushibaike.com and save each to its own .txt file."""

    def __init__(self, page_num):
        """:param page_num: number of listing pages to crawl (coerced to int)."""
        self.page_num = int(page_num)
        # Page 1 has no /page/N/ suffix, so it is seeded specially.
        self.page_urls = ["https://www.qiushibaike.com/text/"]
        # Listing pages are resolved into per-joke article URLs.
        self.joke_urls = []
        # Text of the joke currently being written out.
        self.joke_content = ""
        # Sequential id used to name the output files (1.txt, 2.txt, ...).
        self.joke_id = 1
        # Ensure the output directory exists; "already exists" is not an error.
        os.makedirs(SAVE_PATH, exist_ok=True)

    def get_page_urls(self):
        """Append listing-page URLs for pages 2..page_num (page 1 is pre-seeded)."""
        for n in range(2, self.page_num + 1):
            self.page_urls.append(page_init + 'page/' + str(n) + '/')

    def get_joke_urls(self):
        """Visit each listing page and collect the article URL of every joke on it."""
        for page_url in self.page_urls:
            html = requests.get(page_url).content
            selector = etree.HTML(html)
            # Each joke <div> carries id="qiushi_tag_<article-id>".
            qiushi_ids = selector.xpath('/html/body/div[@id="content"]/div/div[@id="content-left"]/div/@id')
            for q_id in qiushi_ids:
                article_id = q_id.split('_')[2]
                joke_url = joke_init + article_id + '/'
                print(joke_url)
                self.joke_urls.append(joke_url)

    def get_joke(self):
        """Fetch each joke page, extract its text and write it via download()."""
        for joke_url in self.joke_urls:
            html = requests.get(joke_url).content
            selector = etree.HTML(html)
            lines = selector.xpath('//div[@class="word"]/div/text()')
            # join instead of += in a loop; one trailing '\n' per extracted line.
            self.joke_content = "".join(line + '\n' for line in lines)
            self.download()

    def download(self):
        """Write the current joke text to SAVE_PATH/<joke_id>.txt and bump the id."""
        joke_path = os.path.join(SAVE_PATH, str(self.joke_id) + '.txt')
        self.joke_id += 1
        print(joke_path)
        # Explicit utf-8 so Chinese text survives any platform default encoding.
        with open(joke_path, "w", encoding="utf-8") as f:
            f.write(self.joke_content)

    def start(self):
        """Run the full pipeline: listing pages -> joke URLs -> joke texts."""
        self.get_page_urls()
        self.get_joke_urls()
        # get_joke() already calls download() once per joke; the original's
        # extra self.download() call here wrote a spurious duplicate file
        # (re-saving the last joke's content) and has been removed.
        self.get_joke()
|
||
class QiushiSpider(): | ||
|
||
# NOTE(review): the three lines below are stranded remnants of the
# pre-refactor script's __main__ block (the diff deletes them); `page_num`
# fed the old Spider class. Dead code in the new version — candidates for removal.
if __name__ == '__main__':
# Ask how many listing pages of jokes to fetch (original comment said
# "get account" — a copy/paste slip).
page_num = input('请告诉我:你想获取多少页的糗事?')
def __init__(self, max_page):
    """Set up the crawler.

    :param max_page: how many listing pages the user asked for.
    """
    self.max_page = max_page
    # 2. Create the coroutine pool with 5 workers.
    self.pool = Pool(5)

    self.base_url = "http://www.qiushibaike.com/8hr/page/{}/"
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }

    # Dedicated container for the URLs still to be crawled.
    self.url_queue = Queue()
    # (stray trailing `pass` removed — it was dead code after real statements)
|
||
def get_url_list(self): | ||
''' | ||
获取 url 列表放入到 url 容器中 | ||
:return: | ||
''' | ||
for page in range(1,self.max_page,1): | ||
url = self.base_url.format(page) | ||
self.url_queue.put(url) | ||
|
||
# 3. 实现执行任务 | ||
def exec_task(self):
    """Consume exactly one URL from the queue: fetch, parse, print titles.

    Runs on the coroutine pool; exec_task_finished() re-submits it, so the
    pool keeps draining the queue one URL per invocation.
    """
    # 1> Take the next URL (blocks until one is available).
    url = self.url_queue.get()

    # 2> Fetch the page HTML.
    response = requests.get(url, headers=self.headers)
    html = response.text

    # 3> Parse the HTML and extract the recommended-content titles.
    eroot = etree.HTML(html)
    titles = eroot.xpath('//a[@class="recmd-content"]/text()')
    for title in titles:
        item = {}
        item["title"] = title
        # 4> "Persist" the record (just printed for now).
        print(item)

    # NOTE(review): stranded pre-refactor script lines (`qb = Spider(page_num)`;
    # `qb.start()`) sat mid-method here in the diff; they were dead/misplaced
    # and have been removed.
    self.url_queue.task_done()
|
||
|
||
|
||
# 4. Callback run after a task completes; must accept at least one argument.
# `result` is the return value of the finished task.
def exec_task_finished(self,result):
    """Log completion, then immediately re-submit exec_task so the pool
    keeps consuming queued URLs until the queue is drained."""
    print("result:",result)
    print("执行任务完成")
    self.pool.apply_async(self.exec_task,callback=self.exec_task_finished)
|
||
|
||
def run(self):
    """Start the crawl: queue all page URLs, then fan tasks out over the pool.

    Blocks until every queued URL has been processed (task_done called).
    """
    self.get_url_list()

    # 5. Submit the initial batch of pool tasks; `callback` fires when each
    #    task completes and re-schedules the next one (see exec_task_finished).
    for _ in range(5):
        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
    # Wait for the queue to be fully drained.
    self.url_queue.join()
    # (stray trailing `pass` removed; unused loop variable renamed to `_`)
|
||
if __name__ == '__main__':
    # Prompt for the page count, then hand it to the spider and start crawling.
    page_count = int(input("请输入您需要多少页内容:"))
    crawler = QiushiSpider(page_count)
    crawler.run()