In [2]:
import csv
from time import sleep
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
# Crawl limits: at most 2500 movies, 10 reviews per movie.
REVIEW_NUM = 10
# Directory that the crawled data files are written under.
DATA_PATH = './data/'
# Movie URLs that have already been discovered.
URL_SET = set()

# HTTP headers sent with each request (browser-like User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 "
        "Safari/537.36 Edg/114.0.1823.58"
    ),
    "Connection": "keep-alive",
    "Referer": "https://www.douban.com",  # present the request as in-site navigation
}

In [3]:
# Wrapper around requests.get used for every page fetch
def request_douban(url, headers=headers, timeout=30):
    """Fetch ``url`` and keep retrying until the server answers with HTTP 200.

    Args:
        url: Address to request.
        headers: HTTP headers to send; defaults to the module-level ``headers``.
        timeout: Seconds to wait for the server before giving up on a single
            attempt. Without a timeout ``requests`` can block forever on a
            stalled connection.

    Returns:
        The ``requests.Response`` of the first attempt that returned status 200.
    """
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as exc:
            # Transient network failure: pause briefly and try again instead of
            # aborting the whole crawl.
            print('请求超时或连接失败，稍后重试:', exc)
            sleep(10)
            continue
        if response.status_code == 200:
            break
        # Any non-200 answer is treated as being blocked: back off for 10 minutes.
        print('被拦截了，休息一下')
        sleep(600)
    # Short pause after every successful request to throttle the crawl.
    sleep(1)
    return response


In [4]:
# Crawl the reviews of one movie
def crawl_reviews(url, num = REVIEW_NUM):
    """Collect up to ``num`` reviews for one movie.

    Args:
        url: URL of the movie's review listing; ``?start=<offset>`` is appended
            to it to request each listing page.
        num: Maximum number of reviews to collect.

    Returns:
        A list with one dict per review, keyed by '作者', '作者评分', '时间',
        '影评', '有用数' and '没用数'. If an error occurs part-way through, it
        is printed together with the URL being processed and the reviews
        collected so far are returned.
    """
    review_list = []
    # URL currently being fetched/parsed. It is bound before the try block so
    # the error handler can always report it, even when the very first request
    # fails.
    current_url = url
    try:
        for i in range(0, num, 20):  # listing offsets advance in steps of 20
            current_url = url + '?start=' + str(i)
            response = request_douban(current_url)
            soup = BeautifulSoup(response.text, 'lxml')

            # Links to the individual review pages on this listing page
            review_links = soup.select('div.main.review-item > div.main-bd > h2 > a')
            if len(review_links) == 0:
                return review_list
            for index, link in enumerate(review_links):
                if index == num - i:  # num reviews have been collected
                    break
                url_report = link['href']
                current_url = url_report
                review_response = request_douban(url_report)
                review_soup = BeautifulSoup(review_response.text, 'lxml')

                # Header section: author and publication time
                header = review_soup.select('div.main > header.main-hd')[0]
                author_name = header.select('a')[0].text.strip()
                publish_time = header.select('div.main-meta > span')[0].text.strip()

                # The author's own rating is optional; fall back to ''
                star_spans = review_soup.select('span.main-title-hide')
                author_star = star_spans[0].text.strip() if star_spans else ''

                # Body section: the review text itself
                main = review_soup.select('div.main-bd')[0]
                content = main.select('div > div.review-content.clearfix')[0]
                review_text = '\n'.join([r.text for r in content])

                # Footer section: "useful" / "useless" vote counts
                footer = main.select('div.main-panel-useful')[0]
                useful = footer.select('button.btn.useful_count.j.a_show_login')[0]
                useful_count = useful.text.replace('有用', '').strip()
                useless = footer.select('button.btn.useless_count.j.a_show_login')[0]
                useless_count = useless.text.replace('没用', '').strip()

                review = {
                    '作者': author_name,
                    '作者评分': author_star,
                    '时间': publish_time,
                    '影评': review_text,
                    '有用数': useful_count,
                    '没用数': useless_count
                }
                review_list.append(review)

                # Report any empty field except the optional author rating
                for key, value in review.items():
                    if value == '' and key != '作者评分':
                        print('读取影评时,', key, '为空')
                        print(url_report)
    except Exception as e:
        # Best effort: report the failure and keep what was collected so far.
        print('读取影评时出错:', e)
        print(current_url)
    return review_list


In [5]:
# Crawl one movie's detail page together with its reviews
def crawl_movie(url):
    """Scrape a movie detail page and the movie's reviews.

    Args:
        url: URL of the movie's detail page. The review listing is requested
            from ``<url>/reviews``; a trailing slash on ``url`` is optional.

    Returns:
        A tuple ``(movie_info, reviews)``: ``movie_info`` is a dict keyed by
        '电影名', '年份', '评分', '封面', '导演', '演员' and '简介';
        ``reviews`` is the list returned by ``crawl_reviews``.

    Raises:
        IndexError: If a required element (title, year, cover, rating or
            director) is missing from the page.
    """
    response = request_douban(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title block: movie name and release year
    header = soup.select('#content > h1')[0]
    movie_name = header.select('h1 > span')[0].text.strip()
    movie_year = header.select('h1 > span.year')[0].text.strip('()')

    # Cover image URL
    cover_url = soup.select('#mainpic > a > img')[0]['src'].strip()
    # Overall rating
    movie_star = soup.select('div.rating_self.clearfix > strong.ll.rating_num')[0].text.strip()
    # Director
    movie_director = soup.select('#info > span > span.attrs')[0].text.strip()

    # Cast is optional; an explicit emptiness check replaces the former bare
    # `except:`, which would also have swallowed KeyboardInterrupt.
    actor_spans = soup.select('#info > span.actor > span.attrs')
    movie_actor = actor_spans[0].text.strip('/') if actor_spans else ''

    # Synopsis: prefer the hidden full-text span, otherwise use the first span
    # of the synopsis container; '' when neither exists.
    intro_spans = (soup.select('#link-report-intra.indent > span.all.hidden')
                   or soup.select('#link-report-intra.indent > span'))
    intro = intro_spans[0].text.strip() if intro_spans else ''

    # Reviews; normalise the slash so the URL is correct with or without a
    # trailing '/' on `url`.
    review_url = url.rstrip('/') + '/reviews'
    reviews = crawl_reviews(review_url)

    movie_info = {
        '电影名': movie_name,
        '年份': movie_year,
        '评分': movie_star,
        '封面': cover_url,
        '导演': movie_director,
        '演员': movie_actor,
        '简介': intro,
    }
    # Diagnostics: flag a synopsis that mentions the site name, and any empty field
    if '豆瓣' in intro:
        print('简介中有豆瓣')
        print(url)
    for key, value in movie_info.items():
        if value == '':
            print(key, '为空')
            print(url)

    return (movie_info, reviews)
# Collect the related movies of one movie page and add them to URL_SET
def crawl_related_movie(url):
    """Queue the movies recommended on a movie detail page.

    Every recommended URL not yet in the global ``URL_SET`` is added to it and
    appended to the global ``url_queue``.

    Args:
        url: URL of the movie detail page whose recommendations are read.

    Note:
        ``url_queue`` is not defined in this cell; it must exist as a global
        list before this function is called.
    """
    response = request_douban(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    related_links = soup.select('div.recommendations-bd > dl > dt > a')
    if len(related_links) != 10:
        print('相关电影数目不为10')
        print(url)

    tracking_suffix = '?from=subject-page'
    for link in related_links:
        related_url = link['href']
        # Remove the tracking suffix as a whole. str.rstrip() would treat the
        # argument as a set of characters and could eat into the URL itself.
        if related_url.endswith(tracking_suffix):
            related_url = related_url[:-len(tracking_suffix)]
        if related_url not in URL_SET:
            url_queue.append(related_url)
            URL_SET.add(related_url)

In [6]:
'''爬取豆瓣电影top250链接'''
def top250_crawer():
    """Return the movie links found on the Top 250 listing pages.

    Requests the listing at offsets 0, 25, ..., 225 and, for every
    ``div.item`` on each page, takes the ``href`` of its second ``<a>`` tag.

    Returns:
        A list of the collected ``href`` strings, in page order.
    """
    collected = []
    for offset in range(0, 250, 25):
        # Build the URL of this listing page
        page_url = "https://movie.douban.com/top250?start={}".format(offset)
        page = request_douban(page_url)
        parsed = BeautifulSoup(page.text, 'html.parser')
        collected.extend(
            entry.find_all('a')[1]['href']
            for entry in parsed.find_all('div', class_='item')
        )
    return collected

In [121]:
# Seed the crawl with the Top 250 movie links, then expand through each
# movie's related-movie links until 2500 URLs are known or the queue runs dry.
top_urls = top250_crawer()
URL_SET = set(top_urls)
# print(URL_SET)
# NOTE: url_queue is the same list object as top_urls (no copy is made), so
# popping from the queue below also shrinks top_urls.
url_queue = top_urls
print(len(URL_SET))
print(len(url_queue))
i = 0  # number of pages processed so far
# FIFO processing: take the oldest queued URL; crawl_related_movie() appends
# newly discovered URLs to url_queue and adds them to URL_SET.
while len(URL_SET) < 2500 and len(url_queue) != 0:
    url = url_queue.pop(0)
    crawl_related_movie(url)
    i += 1
    # end='\r' rewrites the same output line as a progress indicator
    print('现在的电影数量：', len(URL_SET), '缓冲区长度：', len(url_queue), '迭代次数：', i, end='\r')

250
250
相关电影数目不为1089 缓冲区长度： 416 迭代次数： 373
相关电影数目不为10333 缓冲区长度： 1137 迭代次数： 1196
相关电影数目不为10362 缓冲区长度： 1144 迭代次数： 1218
现在的电影数量： 2503 缓冲区长度： 1216 迭代次数： 1287

In [122]:
# Persist every discovered movie URL to movie_urls.csv (one URL per row).
url_header = ['电影url']
# Mode 'w' instead of 'a': the header row is written on every run, so appending
# would leave duplicate header lines that are later read back as URLs.
with open(DATA_PATH + 'movie_urls.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=url_header)
    writer.writeheader()
    for url in URL_SET:
        # The row key must be the column name string; the url_header list
        # itself is unhashable and cannot be used as a dict key.
        writer.writerow({url_header[0]: url})

In [7]:
def write_reviews_in_xml(f, review_list):
    """Write a ``<reviews>`` element containing one ``<review>`` per entry.

    Args:
        f: Writable text file object.
        review_list: Iterable of dicts; each key becomes a child tag name and
            each value that tag's text.

    Values are XML-escaped so that '&', '<' and '>' in the review text do not
    produce a malformed document. Keys are written verbatim as tag names.
    """
    f.write('<reviews>\n')
    for review in review_list:
        f.write('<review>\n')
        for key, value in review.items():
            f.write('<' + key + '>')
            f.write(escape(str(value)))
            f.write('</' + key + '>\n')
        f.write('</review>\n')
    f.write('</reviews>\n')
def write_movie_in_xml(dic, review_list, index, doc_dir_path):
    """Write one movie and its reviews to ``<doc_dir_path><index>.xml``.

    Args:
        dic: Movie fields; each key becomes a tag name and each value its text.
        review_list: Reviews, forwarded to ``write_reviews_in_xml``.
        index: Used as the file name (``{index}.xml``).
        doc_dir_path: Directory prefix; it is concatenated directly with the
            file name, so it must end with a path separator.

    Field values are XML-escaped so that '&', '<' and '>' in a title or
    synopsis do not produce a malformed document.
    """
    with open(doc_dir_path + '{}.xml'.format(index), 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<movie>\n')
        for key, value in dic.items():
            f.write('<' + key + '>')
            f.write(escape(str(value)))
            f.write('</' + key + '>\n')
        write_reviews_in_xml(f, review_list)
        f.write('</movie>\n')

In [8]:
# Load every movie URL stored in movie_urls.csv (column '电影url')
url_list = []
with open(DATA_PATH + 'movie_urls.csv', 'r', encoding='utf-8') as f:
    url_list.extend(row['电影url'] for row in csv.DictReader(f))
print(len(url_list))

2503


In [9]:
# Crawl the movies from position `delta` onward and write each one to
# movies/<index>.xml; `delta` is the position in url_list to start from.
delta = 247
for index, url in enumerate(url_list[delta:], start=delta):
    # enumerate(start=delta) keeps `index` equal to the position in url_list
    print('crawling', index, '...', end='\r')
    dic, reviews = crawl_movie(url)
    write_movie_in_xml(dic, reviews, index, DATA_PATH + 'movies/')

crawling 248 ...

In [9]:
# One-off crawl of the movie at position 125 of url_list, written to 125.xml.
dic, reviews = crawl_movie(url_list[125])
write_movie_in_xml(dic, reviews, 125, './data/movies/')

被拦截了，休息一下


In [12]:
# NOTE(review): writes whatever `dic` / `reviews` currently hold in the kernel
# to 90.xml; which movie that is depends on the cell execution order - confirm
# this is intended.
write_movie_in_xml(dic, reviews, 90, './data/movies/')

In [11]:
# Inspect the URL stored at position 125 of url_list.
url_list[125]

'https://movie.douban.com/subject/26630167/'