In [3]:
from time import sleep
import requests
from bs4 import BeautifulSoup
import csv
# Crawl limits: at most 2500 movies, with 10 reviews fetched per movie.
REVIEW_NUM = 10
# URLs of every movie discovered so far (used for de-duplication).
URL_SET = set()

# Default HTTP headers sent with every request (browser-like user agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58"
    ),
    "Connection": "keep-alive",
    # Present the request as navigation from within the site.
    "Referer": "https://www.douban.com",
}


In [12]:
# Thin wrapper around an HTTP GET for Douban pages
def request_douban(url, headers=headers):
    """Fetch ``url`` with a GET request and return the response.

    After a successful request the function sleeps for one second so that
    consecutive calls are throttled.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers to send; defaults to the module-level ``headers``.

    Returns:
        The ``requests.Response`` object of a reply with status code 200.

    Raises:
        RuntimeError: If the server answers with any status other than 200
            (treated as "the crawler was blocked").
        requests.RequestException: Propagated from ``requests.get``, e.g. when
            the 10 second timeout expires.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        # Raise instead of calling exit(0): exiting would tear down the
        # notebook kernel (losing all collected state) and report success.
        raise RuntimeError(
            '被拦截了，休息一下 (HTTP {}: {})'.format(response.status_code, url)
        )
    sleep(1)
    return response


In [59]:
# Parse a single review detail page
def _parse_review(soup):
    """Extract the fields of one review from its parsed detail page.

    Args:
        soup: ``BeautifulSoup`` document of a review detail page.

    Returns:
        A dict with the keys '作者' (author name), '作者评分' (author rating
        text), '影评' (review text, paragraphs joined by newlines), '有用数'
        (useful count) and '没用数' (useless count). All values are strings.

    Raises:
        IndexError: If one of the expected elements is missing from the page.
    """
    # Header section: author name is the first link in the header.
    header = soup.select('div.main > header.main-hd')[0]
    author_name = header.select('a')[0].text.strip()
    # Rating given by the author.
    author_star = soup.select('span.main-title-hide')[0].text.strip()

    # Body section: the review text is split over several <p> elements.
    main = soup.select('div.main-bd')[0]
    paragraphs = main.select('div > div.review-content.clearfix > p')
    review_text = '\n'.join(p.text.strip() for p in paragraphs)

    # Footer section: the vote buttons carry a label followed by the count.
    footer = main.select('div.main-panel-useful')[0]
    useful = footer.select('button.btn.useful_count.j.a_show_login')[0]
    useful_count = useful.text.replace('有用', '').strip()
    useless = footer.select('button.btn.useless_count.j.a_show_login')[0]
    useless_count = useless.text.replace('没用', '').strip()

    return {
        '作者': author_name,
        '作者评分': author_star,
        '影评': review_text,
        '有用数': useful_count,
        '没用数': useless_count
    }


# Scrape the reviews of one movie
def crawl_reviews(url, num=REVIEW_NUM):
    """Scrape up to ``num`` reviews from a movie's review listing.

    The listing is requested page by page via the ``?start=`` parameter in
    steps of 20 (the page size noted by the original author); every review
    link found is then fetched and parsed individually.

    Args:
        url: URL of the review listing of a movie.
        num: Maximum number of reviews to collect.

    Returns:
        A list of review dicts (see ``_parse_review``). The list is empty when
        ``num`` is not positive or the listing contains no reviews.
    """
    # Accumulate across pages; initialised up front so that the function
    # also returns cleanly when the loop body never runs.
    review_list = []
    for start in range(0, num, 20):
        if len(review_list) >= num:
            break
        listing = request_douban(url + '?start=' + str(start))
        listing_soup = BeautifulSoup(listing.text, 'lxml')
        review_links = listing_soup.select('div.main.review-item > div.main-bd > h2 > a')
        if not review_links:
            # No reviews on this page: further pages cannot add anything.
            break

        # Only take as many links as are still needed to reach ``num``.
        remaining = num - len(review_list)
        for link in review_links[:remaining]:
            detail = request_douban(link['href'])
            review_list.append(_parse_review(BeautifulSoup(detail.text, 'lxml')))
    return review_list


In [120]:
# Crawl one movie detail page and return the information found on it
def crawl_movie(url):
    """Scrape a movie detail page, including its reviews.

    Args:
        url: URL of the movie detail page.

    Returns:
        A dict with the keys '电影名' (title), '年份' (year), '评分' (rating),
        '封面' (cover image URL), '导演' (director), '演员' (actors),
        '简介' (synopsis, with all spaces removed; empty string when no
        synopsis element is found) and '影评' (list returned by
        ``crawl_reviews``).

    Raises:
        IndexError: If one of the mandatory elements is missing from the page.
    """
    response = request_douban(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Page heading: title and release year.
    header = soup.select('#content > h1')[0]
    movie_name = header.select('h1 > span')[0].text.strip()
    movie_year = header.select('h1 > span.year')[0].text.strip()

    # Cover image.
    cover_url = soup.select('#mainpic > a > img')[0]['src'].strip()
    # Overall rating.
    movie_star = soup.select('div.rating_self.clearfix > strong.ll.rating_num')[0].text.strip()
    # Director.
    movie_director = soup.select('#info > span > span.attrs')[0].text.strip()
    # Actors. NOTE(review): strip('/') only trims leading/trailing slashes,
    # not whitespace; kept as in the original.
    movie_actor = soup.select('#info > span.actor > span.attrs')[0].text.strip('/')

    # Synopsis: prefer the full (initially hidden) text.
    # NOTE(review): presumably short synopses have no '.all.hidden' element;
    # fall back to the visible summary span instead of failing on None.
    intro_box = soup.find('div', class_='indent', id='link-report-intra')
    intro_node = None
    if intro_box is not None:
        intro_node = intro_box.select_one('.all.hidden')
        if intro_node is None:
            intro_node = intro_box.select_one('span[property="v:summary"]')
    intro = intro_node.text.replace(' ', '') if intro_node is not None else ''

    # Reviews: build the listing URL regardless of a trailing slash on ``url``.
    review_url = url.rstrip('/') + '/reviews'
    reviews = crawl_reviews(review_url)

    # Combined result.
    movie_info = {
        '电影名': movie_name,
        '年份': movie_year,
        '评分': movie_star,
        '封面': cover_url,
        '导演': movie_director,
        '演员': movie_actor,
        '简介': intro,
        '影评': reviews
    }

    return movie_info
# Collect the related movies of a page and add unseen ones to URL_SET
def crawl_related_movie(url):
    """Discover the movies recommended on a movie detail page.

    Every recommended URL not yet contained in the global ``URL_SET`` is
    added to ``URL_SET`` and appended to the global ``url_queue``. Both
    globals must exist before this function is called.

    Args:
        url: URL of the movie detail page to inspect.

    Returns:
        None. The function works through side effects on ``URL_SET`` and
        ``url_queue`` and prints a notice when the page does not list exactly
        10 related movies.
    """
    response = request_douban(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Related movies
    related_links = soup.select('div.recommendations-bd > dl > dt > a')
    if len(related_links) != 10:
        print('相关电影数目不为10')
    for link in related_links:
        # Drop the query string (e.g. '?from=subject-page'). str.rstrip would
        # strip a *set of characters*, not this suffix, and could eat into
        # the path itself.
        related_url = link['href'].split('?', 1)[0]
        if related_url not in URL_SET:
            url_queue.append(related_url)
            URL_SET.add(related_url)

In [117]:

def top250_crawer():
    """Collect the detail-page links of the Douban movie Top 250.

    The ranking is requested in steps of 25 via the ``start`` parameter
    (offsets 0, 25, ..., 225). From every ``div.item`` entry the ``href`` of
    its second anchor is taken.

    Returns:
        A list of movie URLs in the order they appear in the ranking.
    """
    url_list = []
    page_offsets = range(0, 250, 25)
    for offset in page_offsets:
        # Build the URL of this ranking page and fetch it.
        page = request_douban("https://movie.douban.com/top250?start={}".format(offset))
        parsed = BeautifulSoup(page.text, 'html.parser')
        # The second <a> of each entry holds the link to the movie page.
        url_list.extend(
            entry.find_all('a')[1]['href']
            for entry in parsed.find_all('div', class_='item')
        )
    return url_list

In [121]:
# Seed the crawl with the Top 250 links, then expand breadth-first through
# each movie's related movies until enough URLs have been collected.
MAX_MOVIES = 2500  # stop expanding once this many movie URLs are known

top_urls = top250_crawer()
URL_SET = set(top_urls)
# Work on a copy: popping from the queue must not also shrink top_urls,
# which other cells still read.
url_queue = list(top_urls)
print(len(URL_SET))
print(len(url_queue))
i = 0
while len(URL_SET) < MAX_MOVIES and url_queue:
    url = url_queue.pop(0)
    # Adds newly discovered URLs to both URL_SET and url_queue.
    crawl_related_movie(url)
    i += 1
    print('现在的电影数量：', len(URL_SET), '缓冲区长度：', len(url_queue), '迭代次数：', i, end='\r')

250
250
现在的电影数量： 270 缓冲区长度： 242 迭代次数： 28

In [100]:
# Persist the collected movie URLs as a one-column CSV file.
url_header = ['电影url']
# Open with 'w' rather than 'a': URL_SET already holds the complete set, so
# appending would duplicate the header and every row each time the cell runs.
with open('movie_urls', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=url_header)
    writer.writeheader()
    # Sorted so the file content does not depend on set iteration order.
    writer.writerows({url_header[0]: movie_url} for movie_url in sorted(URL_SET))

In [None]:
# Smoke test: scrape the first entry of top_urls and keep the resulting dict.
dic = crawl_movie(top_urls[0])

In [81]:
# Debug: dump the collected URL set and the length of the pending queue.
# NOTE: printing the whole set yields a very long output line.
print(URL_SET)
print(len(url_queue))

# Bare last expression, displayed as the cell's output value.
len(URL_SET)

{'https://movie.douban.com/subject/25895901/', 'https://movie.douban.com/subject/1291548/', 'https://movie.douban.com/subject/3541415/', 'https://movie.douban.com/subject/1291585/', 'https://movie.douban.com/subject/1291879/', 'https://movie.douban.com/subject/1293350/', 'https://movie.douban.com/subject/1296996/', 'https://movie.douban.com/subject/27010768/', 'https://movie.douban.com/subject/1297192/', 'https://movie.douban.com/subject/11026735/', 'https://movie.douban.com/subject/1291832/', 'https://movie.douban.com/subject/1292262/', 'https://movie.douban.com/subject/1305164/', 'https://movie.douban.com/subject/1849031/', 'https://movie.douban.com/subject/1300992/', 'https://movie.douban.com/subject/1291549/', 'https://movie.douban.com/subject/5912992/', 'https://movie.douban.com/subject/1308857/', 'https://movie.douban.com/subject/1292224/', 'https://movie.douban.com/subject/1291552/', 'https://movie.douban.com/subject/1292000/', 'https://movie.douban.com/subject/1307315/', 'https

250

In [None]:
# Column names; they match the keys of the dict returned by crawl_movie.
dict_header = ['电影名', '年份', '评分', '封面', '导演', '演员', '简介', '影评']

In [105]:
# Debug: compare the pending-queue length with the number of known URLs.
print(len(url_queue))
print(len(URL_SET))

247
250


In [119]:
# Debug: run one crawl step by hand and print the queue length before the
# pop, after the pop, and after the related movies have been processed.
print(len(url_queue))
url = url_queue.pop(0)
print(len(url_queue))
crawl_related_movie(url)
print(len(url_queue))


241
240
240影数 10
