# 5分钟使用Python爬取豆瓣TOP250电影榜

本视频的演示步骤：

1. 使用requests爬取网页
2. 使用BeautifulSoup实现数据解析
3. 借助pandas将数据写出到Excel

这三个库的详细用法，请看我的其他视频课程

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## 1、下载共10个页面的HTML

In [4]:
# 构造分页数字列表
page_indexs = range(0, 250, 25)

In [5]:
list(page_indexs)

[0, 25, 50, 75, 100, 125, 150, 175, 200, 225]

In [6]:
def download_all_htmls():
    """
    下载所有列表页面的HTML，用于后续的分析
    """
    htmls = []
    for idx in page_indexs:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        print("craw html:", url)
        r = requests.get(url,
                        headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"})
        if r.status_code != 200:
            raise Exception("error")
        htmls.append(r.text)
    return htmls

In [7]:
# 执行爬取
htmls = download_all_htmls()

craw html: https://movie.douban.com/top250?start=0&filter=
craw html: https://movie.douban.com/top250?start=25&filter=
craw html: https://movie.douban.com/top250?start=50&filter=
craw html: https://movie.douban.com/top250?start=75&filter=
craw html: https://movie.douban.com/top250?start=100&filter=
craw html: https://movie.douban.com/top250?start=125&filter=
craw html: https://movie.douban.com/top250?start=150&filter=
craw html: https://movie.douban.com/top250?start=175&filter=
craw html: https://movie.douban.com/top250?start=200&filter=
craw html: https://movie.douban.com/top250?start=225&filter=


In [8]:
htmls[0]

'<!DOCTYPE html>\n<html lang="zh-CN" class="ua-windows ua-webkit">\n<head>\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n    <meta name="renderer" content="webkit">\n    <meta name="referrer" content="always">\n    <meta name="google-site-verification" content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" />\n    <title>\n豆瓣电影 Top 250\n</title>\n    \n    <meta name="baidu-site-verification" content="cZdR4xxR7RxmM4zE" />\n    <meta http-equiv="Pragma" content="no-cache">\n    <meta http-equiv="Expires" content="Sun, 6 Mar 2005 01:00:00 GMT">\n    \n    <link rel="apple-touch-icon" href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png">\n    <link href="https://img3.doubanio.com/f/shire/3e5dfc68b0f376484c50cf08a58bbca3700911dc/css/douban.css" rel="stylesheet" type="text/css">\n    <link href="https://img3.doubanio.com/f/shire/ae3f5a3e3085968370b1fc63afcecb22d3284848/css/separation/_all.css" rel="style

## 2、解析HTML得到数据

In [9]:
def parse_single_html(html):
    """
    解析单个HTML，得到数据
    @return list({"link", "title", [label]})
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_items = (
        soup.find("div", class_="article")
            .find("ol", class_="grid_view")
            .find_all("div", class_="item")
    )
    datas = []
    for article_item in article_items:
        rank = article_item.find("div", class_="pic").find("em").get_text()
        info = article_item.find("div", class_="info")
        title = info.find("div", class_="hd").find("span", class_="title").get_text()
        stars = (
            info.find("div", class_="bd")
                .find("div", class_="star")
                .find_all("span")
        )
        rating_star = stars[0]["class"][0]
        rating_num = stars[1].get_text()
        comments = stars[3].get_text()
        
        datas.append({
            "rank":rank,
            "title":title,
            "rating_star":rating_star.replace("rating","").replace("-t",""),
            "rating_num":rating_num,
            "comments":comments.replace("人评价", "")
        })
    return datas



In [8]:
import pprint
pprint.pprint(parse_single_html(htmls[0]))

[{'comments': '1790062',
  'rank': '1',
  'rating_num': '9.7',
  'rating_star': '5',
  'title': '肖申克的救赎'},
 {'comments': '1320110',
  'rank': '2',
  'rating_num': '9.6',
  'rating_star': '5',
  'title': '霸王别姬'},
 {'comments': '1376257',
  'rank': '3',
  'rating_num': '9.5',
  'rating_star': '5',
  'title': '阿甘正传'},
 {'comments': '1570867',
  'rank': '4',
  'rating_num': '9.4',
  'rating_star': '45',
  'title': '这个杀手不太冷'},
 {'comments': '879501',
  'rank': '5',
  'rating_num': '9.5',
  'rating_star': '5',
  'title': '美丽人生'},
 {'comments': '1314428',
  'rank': '6',
  'rating_num': '9.4',
  'rating_star': '45',
  'title': '泰坦尼克号'},
 {'comments': '1407622',
  'rank': '7',
  'rating_num': '9.3',
  'rating_star': '45',
  'title': '千与千寻'},
 {'comments': '707102',
  'rank': '8',
  'rating_num': '9.5',
  'rating_star': '5',
  'title': '辛德勒的名单'},
 {'comments': '1337018',
  'rank': '9',
  'rating_num': '9.3',
  'rating_star': '45',
  'title': '盗梦空间'},
 {'comments': '909227',
  'rank': '10',
  'ra

In [10]:
# 执行所有的HTML页面的解析
all_datas = []
for html in htmls:
    all_datas.extend(parse_single_html(html))

In [11]:
all_datas

[{'rank': '1',
  'title': '肖申克的救赎',
  'rating_star': '5',
  'rating_num': '9.7',
  'comments': '1999723'},
 {'rank': '2',
  'title': '霸王别姬',
  'rating_star': '5',
  'rating_num': '9.6',
  'comments': '1480659'},
 {'rank': '3',
  'title': '阿甘正传',
  'rating_star': '5',
  'rating_num': '9.5',
  'comments': '1514680'},
 {'rank': '4',
  'title': '这个杀手不太冷',
  'rating_star': '45',
  'rating_num': '9.4',
  'comments': '1708218'},
 {'rank': '5',
  'title': '美丽人生',
  'rating_star': '5',
  'rating_num': '9.5',
  'comments': '954447'},
 {'rank': '6',
  'title': '泰坦尼克号',
  'rating_star': '45',
  'rating_num': '9.4',
  'comments': '1464216'},
 {'rank': '7',
  'title': '千与千寻',
  'rating_star': '45',
  'rating_num': '9.4',
  'comments': '1566664'},
 {'rank': '8',
  'title': '辛德勒的名单',
  'rating_star': '5',
  'rating_num': '9.5',
  'comments': '772666'},
 {'rank': '9',
  'title': '盗梦空间',
  'rating_star': '45',
  'rating_num': '9.3',
  'comments': '1450577'},
 {'rank': '10',
  'title': '忠犬八公的故事',
  'rati

In [12]:
len(all_datas)

250

## 3、将结果存入excel

In [13]:
df = pd.DataFrame(all_datas)

In [14]:
df

Unnamed: 0,comments,rank,rating_num,rating_star,title
0,1999723,1,9.7,5,肖申克的救赎
1,1480659,2,9.6,5,霸王别姬
2,1514680,3,9.5,5,阿甘正传
3,1708218,4,9.4,45,这个杀手不太冷
4,954447,5,9.5,5,美丽人生
5,1464216,6,9.4,45,泰坦尼克号
6,1566664,7,9.4,45,千与千寻
7,772666,8,9.5,5,辛德勒的名单
8,1450577,9,9.3,45,盗梦空间
9,1005507,10,9.4,45,忠犬八公的故事


In [16]:
df.to_excel("豆瓣电影TOP250.xlsx")