-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
45 lines (41 loc) · 1.71 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from login import DouBan
import time
import random
from bs4 import BeautifulSoup
import pandas as pd
class Crawler:
    """Crawl Douban short comments for one movie and save them as CSV files.

    Constructing an instance creates a ``DouBan`` client (from the project's
    ``login`` module) and immediately runs :meth:`get_comments`, so a bare
    ``Crawler()`` performs the whole crawl and writes the output files.
    """

    # Offset step between pages; matches the ``limit=20`` in ``comment_url``.
    PAGE_SIZE = 20

    def __init__(self):
        # ``%d`` is replaced with the ``start`` offset of each requested page.
        self.comment_url = 'https://movie.douban.com/subject/26357307/comments?start=%d&limit=20&sort=new_score&status=P'
        # Exclusive upper bound on the ``start`` offsets that are requested.
        self.comment_count = 500
        self.douban = DouBan()
        self.get_comments()

    @staticmethod
    def _parse_comment(item):
        """Return ``(user, rating, short, time)`` for one ``.comment-item`` tag.

        Raises:
            IndexError: a required element is missing from the item.
            KeyError: the rating element has no ``class`` attribute.
        """
        user = item.select('.comment-info a')[0].text
        # NOTE(review): presumably the first CSS class has the form
        # "allstarN0", so the character at index 7 is the star count --
        # confirm against the live page markup.
        rating = item.select('.rating')[0]['class'][0][7:8]
        short = item.select('.short')[0].text
        posted = item.select('.comment-time')[0].text.strip()
        return user, rating, short, posted

    def get_comments(self):
        """Fetch the comment pages, parse them and write two CSV files.

        Pages are requested at offsets ``0, PAGE_SIZE, ...`` below
        ``self.comment_count``. Items missing any of the four fields (for
        example, no rating element) are skipped. The full table is written to
        ``comments.csv`` and the comment text alone to ``shorts.csv``, both in
        the current working directory.
        """
        comments = {'users': [], 'ratings': [], 'shorts': [], 'times': []}
        for start in range(0, self.comment_count, self.PAGE_SIZE):
            # Random sub-second pause between consecutive requests.
            time.sleep(random.random())
            response = self.douban.get_html(self.comment_url % start)
            print('进度', start, '条', '状态是:', response.status_code)
            # Name the parser explicitly so results do not depend on which
            # optional parser libraries happen to be installed.
            soup = BeautifulSoup(response.text, 'html.parser')
            for item in soup.select('.comment-item'):
                try:
                    user, rating, short, posted = self._parse_comment(item)
                except (IndexError, KeyError):
                    # Incomplete item: skip it rather than abort the crawl.
                    continue
                comments['users'].append(user)
                comments['ratings'].append(rating)
                comments['shorts'].append(short)
                comments['times'].append(posted)
        comments_pd = pd.DataFrame(comments)
        # Save the complete short-comment records.
        comments_pd.to_csv('comments.csv')
        # Save only the comment text, as the data source for later word
        # segmentation.
        comments_pd['shorts'].to_csv('shorts.csv', index=False)
# Script entry point: constructing the crawler runs the whole crawl.
if __name__ == "__main__":
    crawler = Crawler()