/
phoenix.py
78 lines (61 loc) · 2.22 KB
/
phoenix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import lxml
import time
from pathlib import Path
import os
# Root URL of the textbook index page; book links are appended to it.
base_url = 'http://zt.ppmg.cn/textbook/'
# Local directory (relative to the working directory) that receives the downloads.
base_path = '江苏凤凰传媒版/'
# Browser-like User-Agent header passed to the HTTP requests made by this script.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.129 Safari/537.36'
    ),
}
def dd(param, breakdown=1):
    """Print ``param`` and, by default, terminate the program.

    Small debugging helper ("dump and die").

    Args:
        param: Any value; it is handed to ``print`` unchanged.
        breakdown: When truthy (the default) the interpreter exits right
            after printing; pass a falsy value to only print.

    Raises:
        SystemExit: When ``breakdown`` is truthy.
    """
    # Imported locally so the helper does not depend on file-level imports.
    import sys

    print(param)
    if breakdown:
        # sys.exit() is always available, whereas the bare exit() builtin is
        # injected by the site module and intended for interactive sessions.
        sys.exit()
def getCateList():
    """Fetch the textbook index page and collect every category with its books.

    Returns:
        list[dict]: One entry per category box that has the expected markup.
        Each entry has ``'cate_title'`` (heading text with spaces replaced
        by ``-``) and ``'book_list'``, a list of dicts with ``'href'`` (the
        link target exactly as written in the page) and ``'book_name'``
        (the link text with spaces replaced by ``-``).

    Raises:
        requests.RequestException: If the index page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    print('开始获取分类列表...')
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(base_url, headers=headers, timeout=30)
    # Fail loudly instead of parsing an error page as if it were the index.
    response.raise_for_status()
    response.encoding = 'utf8'
    soup = BeautifulSoup(response.text, 'lxml')

    cate_list = []
    for box in soup.find_all('div', class_='box'):
        head = box.find('div', class_='small-head')
        listing = box.find('div', class_='list-ul-small')
        if head is None or listing is None:
            # Box without a heading or a book list: nothing to collect.
            continue

        book_list = []
        for ul in listing.find_all('ul'):
            for li in ul.find_all('li'):
                link = li.find('a')
                if link is None or not link.get('href'):
                    # A list item without a usable link cannot be downloaded.
                    continue
                book_list.append({
                    'href': link['href'],
                    'book_name': link.text.replace(' ', '-'),
                })

        cate_list.append({
            # NOTE: spaces are replaced before stripping, so only non-space
            # whitespace is trimmed; kept as is so directory names stay stable.
            'cate_title': head.text.replace(' ', '-').strip(),
            'book_list': book_list,
        })
    return cate_list
def _download_book(url, target_path):
    """Stream ``url`` to ``target_path``, publishing the file only once complete.

    Args:
        url: Address of the file to download.
        target_path: Final location of the downloaded file.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status.
        OSError: If the file cannot be written or renamed.
    """
    # Write to a side file first so an interrupted download never leaves a
    # truncated file that a later run would mistake for a finished one.
    # A leftover side file is simply overwritten on the next attempt.
    partial_path = target_path + '.part'
    # stream=True avoids holding the whole file in memory; the with-block
    # releases the connection when done.
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        # Do not save an HTML error page under a .pdf name.
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(64 * 1024):
                f.write(chunk)
    os.replace(partial_path, target_path)


cate_list = getCateList()
print('开始下载...')
for cate in cate_list:
    # Create the category directory.
    cate_dir = base_path + cate['cate_title']
    Path(cate_dir).mkdir(parents=True, exist_ok=True)
    print('\n')
    print('下载分类:%s...' % cate['cate_title'])
    print('\n')
    for book in cate['book_list']:
        url = base_url + book['href']
        bookname = book['book_name'] + '.pdf'
        # bookname already carries the extension; appending it again would
        # produce "*.pdf.pdf" files.
        full_file_path = cate_dir + '/' + bookname
        # Skip books that are already on disk.
        if os.path.isfile(full_file_path):
            print(full_file_path, ' 文件已存在')
            continue
        try:
            _download_book(url, full_file_path)
        except requests.RequestException as err:
            # One unreachable book should not abort the remaining downloads.
            print(bookname, '下载失败:', err)
            continue
        print(bookname, '下载成功!')