In [2]:
import requests
import pandas
import json
import re
import time
import traceback
from bs4 import BeautifulSoup
from datetime import datetime


def wordToLevel(remark_level_text):
    '''评分文本对应的评分等级'''
    level_dict = {'很差': 1, '较差': 2, '还行': 3, '推荐': 4, '力荐': 5}
    return level_dict[remark_level_text]

def getProxyIP():
    '''获取代理IP'''
    proxies = {"http": "http://27.40.137.130:61234", 
               "https": "http://113.218.216.226:8888", }
    return proxies


def getRemarkDetails(movie_url):
    '''在电影的评论页面，抓取以下信息：项目、用户、评论、评分、评论时间、赞数、踩数、回应数'''
    res = requests.get(movie_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    results = []
    remarks = soup.select('.review-item')
    for remark in remarks:
        movie_dict = {}
        movie_dict['project_name'] = soup.select('h1 span')[0].text
        movie_dict['user_name'] = remark.select('.name')[0].text
        
        source_remark_comment = remark.select('.short-content')[0].text.strip()\
        .lstrip('这篇影评可能有剧透\n\n                           ')\
        .rstrip('\n\n                        \xa0(展开)')
        try:
            remark_comment = re.search('(.*。).*...', source_remark_comment).group(1)
        except:
            remark_comment = source_remark_comment
        finally:
            movie_dict['remark'] = remark_comment
            
        movie_dict['rating'] = wordToLevel(remark.select('.main-hd span')[0]['title'])
        time_source = remark.select('.main-hd span')[1].text
        movie_dict['time'] = datetime.strptime(time_source, '%Y-%m-%d %H:%M:%S')\
        .strftime("%Y-%m-%d %H:%M:%S")
        movie_dict['good'] = remark.select('.action span')[0].text.strip()
        movie_dict['bad'] = remark.select('.action span')[1].text.strip()
        movie_dict['reply'] = remark.select('.reply')[0].text.rstrip('回应')
        results.append(movie_dict)
    return results


def getOneMovieUserInteract(movie_url):
    '''在电影的交互页面，获取用户互相评论的信息：项目、主用户、回应用户、回应时间'''
    res = requests.get(movie_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    results = []
    for each in soup.select('.comment-item'):
        if len(each.select('.reply-quote')) > 0:
            interact_dict = {}
            interact_dict['project_name'] = soup.select('.main-hd a')[1].text
            interact_dict['main_user'] = each.select('.header a')[0].text
            interact_dict['reply_user'] = each.select('.pubdate a')[0].text
            time_source = each.select('.header span')[0].text
            interact_dict['reply_time'] = datetime.strptime(time_source, '%Y-%m-%d %H:%M:%S')\
            .strftime("%Y-%m-%d %H:%M:%S")
            results.append(interact_dict)
    return results


def getUserInteracts(movie_url):
    '''在电影的评论页面，获取所有用户互相评论的信息'''
    res = requests.get(movie_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    movie_url_frame = 'https://movie.douban.com/review/{}/#comments'
    results = []
    for each in soup.select('.action'):
        remark_id = re.search('.*review/(.*)/.*', each.select('a')[:-1][-1]['href']).group(1)
        movie_url = movie_url_frame.format(remark_id)
        interact_list = getOneMovieUserInteract(movie_url)
        if len(results) + len(interact_list) > 50:
            break
        results.extend(interact_list)
    return results


def get_RemarkDetails_UserInteracts(movie_all_url_frame):
    '''在所有电影的首页，抓取电影的id构成单个电影的评论页面后，进行单页抓取评论信息和交互信息'''
    movie_url_frame = 'https://movie.douban.com/subject/{}/?tag=%E7%BB%8F%E5%85%B8&from=gaia_video'
    remark_details = []
    user_interacts = []
    
    for page_start_index in range(120, 140, 20):
        movie_all_url = movie_all_url_frame.format(page_start_index)
        res = requests.get(movie_all_url)
        try:
            jname = json.loads(res.text)
            for each in jname['subjects']:
                each_url_id = re.search('.*subject/(.*)/', each['url']).group(1)
                movie_url = movie_url_frame.format(each_url_id)
                remark_details.extend(getRemarkDetails(movie_url))
                user_interacts.extend(getUserInteracts(movie_url))
        except Exception as e:
            print(traceback.format_exc())
    return remark_details, user_interacts

time_start = time.time()

movie_all_url_frame = 'https://movie.douban.com/j/search_subjects?type=movie\
&tag=%E5%8F%AF%E6%92%AD%E6%94%BE&sort=recommend&page_limit=20&page_start={}'
(all_remarks, all_interacts) = get_RemarkDetails_UserInteracts(movie_all_url_frame)
df_remarks = pandas.DataFrame(all_remarks)
df_interacts = pandas.DataFrame(all_interacts)
df_remarks.to_excel('remarks_short_copy.xlsx')
df_interacts.to_excel('interacts_short_copy.xlsx')

time_end = time.time()
sum_time = time_end - time_start
print("总用时：" + str(sum_time) + "s")

总用时：128.2398238182068s


In [3]:
df_remarks

Unnamed: 0,bad,good,project_name,rating,remark,reply,time,user_name
0,141,944,生吃 Grave,5,如果没有最后一个镜头，我大概不会看第二遍，而看了第二遍之后，才发现先前对妹妹的想法理解彻底错...,107,2017-04-17 07:32:21,安神补脑meng
1,64,493,生吃 Grave,5,走在柯士甸道的天桥上，我还散发着与午夜场结束后的冷风所不相称的阵阵体热。环视周围一起退场的观...,28,2017-05-01 11:33:38,Lime
2,15,354,生吃 Grave,5,【剧透注意】 表面上看这是个素食家庭出来的女孩儿在兽医医学院里吃过一口兔腰之后，压抑多年的食...,66,2017-05-25 22:30:24,津轻
3,31,231,生吃 Grave,5,就一些个人的感受。虽然经历不同，但是这些内心活动和情绪我太熟悉了。这个电影虽然变态恶心，但是...,12,2017-04-23 00:57:39,Wunching
4,16,52,生吃 Grave,4,食与色的关系，不是已经有过很多论述？ “食色，性也。”说明食与色乃人的本性。 两者往往联系在...,0,2017-05-24 17:46:37,把噗
5,,18,生吃 Grave,3,情其实蛮一般的，但是整体还可以。电影是以女性视角来展示的，具有现实的意义。,0,2017-11-01 17:05:50,是巴卫啊
6,9,30,生吃 Grave,4,本文首发于【MOVIE木卫】（微信公号：movie345） 如果伊藤润二的漫画拍成了电影，恐...,6,2017-05-25 23:19:57,嘚嘚嘚嘚嘚
7,28,35,生吃 Grave,4,我为什么在大半夜看这部电影，因为舍友在农药声音太烦了打扰我睡觉，但是看完了趁我还记得就写写吧...,15,2017-06-24 02:10:06,棠梨煎雪
8,1,17,生吃 Grave,4,一个来自严格素食主义家庭的少女，在进入兽医学校求学之后，因为学校对待菜鸟新生传统被迫生吃了兔...,0,2017-07-29 11:25:49,(ｰ̀εｰ́)
9,1,14,生吃 Grave,3,电影的主线就是疾病引发的系列恐怖事件。俩姐妹很不幸遗传了妈妈的食人基因，妈妈对自己病情的控制...,0,2017-07-27 22:06:22,槑槑


In [50]:
df_interacts

Unnamed: 0,main_user,project_name,reply_time,reply_user
0,ecila,肖申克的救赎,2013-11-25 14:55:58,铅笔
1,心底住着小新,肖申克的救赎,2014-02-02 21:01:31,就为一条评价
2,印欧语系差生,肖申克的救赎,2015-05-28 02:25:09,无谓激情
3,低端ㅅㅁLone,肖申克的救赎,2015-06-27 22:12:13,小High
4,热狗扑食,肖申克的救赎,2015-07-13 18:14:11,雨啊雨
5,奶奶,肖申克的救赎,2016-01-12 11:20:56,一瓢碧水
6,奶奶,肖申克的救赎,2016-01-12 11:22:52,凉冷的夏
7,xigeling,肖申克的救赎,2013-11-12 08:53:11,Eclipse
8,Eclipse,肖申克的救赎,2013-11-12 12:02:12,xigeling
9,用户9726,肖申克的救赎,2017-02-27 01:33:37,Eclipse
