# 爬取豆瓣电影Top250榜单

## 1.下载十个页面的HTML内容

In [20]:
import requests

# Base URL of the Douban Top250 list; download_htmls() appends the
# '?start=<offset>&filter=' paging query string to it.
url_base = 'https://movie.douban.com/top250'


def download_htmls(timeout=10):
    """Download the HTML of every page of the Douban Top250 list.

    Pages are requested with ``start`` offsets 0, 25, ..., 225 (ten
    requests in total), each sent with a desktop-browser User-Agent header.

    Args:
        timeout: Seconds to wait on each request before giving up. Without
            a timeout, ``requests`` may block forever on a stalled
            connection, which would hang the whole notebook.

    Returns:
        list of str: The response bodies, in page order.

    Raises:
        RuntimeError: If any page answers with a status code other than 200.
            Network-level failures (including timeouts) propagate as the
            exceptions raised by ``requests`` itself.
    """
    page_index = range(0, 250, 25)
    htmls = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    for i in page_index:
        url = url_base + '?start=' + str(i) + '&filter='
        print("craw html: ", url)
        # Bound every request so a single dead connection cannot stall the loop.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            # Say which page failed and how, so the failure can be diagnosed.
            raise RuntimeError(
                "Download failed: {} returned HTTP {}".format(url, response.status_code)
            )
        htmls.append(response.text)
    return htmls

In [21]:
# Fetch all ten list pages once and keep the raw HTML for the parsing cells.
htmls = download_htmls()

craw html:  https://movie.douban.com/top250?start=0&filter=
craw html:  https://movie.douban.com/top250?start=25&filter=
craw html:  https://movie.douban.com/top250?start=50&filter=
craw html:  https://movie.douban.com/top250?start=75&filter=
craw html:  https://movie.douban.com/top250?start=100&filter=
craw html:  https://movie.douban.com/top250?start=125&filter=
craw html:  https://movie.douban.com/top250?start=150&filter=
craw html:  https://movie.douban.com/top250?start=175&filter=
craw html:  https://movie.douban.com/top250?start=200&filter=
craw html:  https://movie.douban.com/top250?start=225&filter=


## 2.解析HTML内容

In [22]:
from bs4 import BeautifulSoup as bs


def parse_html(html):
    """Extract one record per movie from the HTML of a single list page.

    The movies are the ``div.item`` elements found inside the first
    ``ol.grid_view`` of the first ``div.article`` in the document.

    Args:
        html: HTML text of one Top250 list page.

    Returns:
        list of dict: One dict per movie, in document order, with string
        values under these keys:
            "rank": text of the ``em`` inside ``div.pic``.
            "title": text of the first ``span.title`` inside ``div.hd``.
            "rating_star": first CSS class of the first span in ``div.star``
                with "rating" and "-t" removed.
            "rating_score": text of the second span in ``div.star``.
            "comments": text of the fourth span in ``div.star`` with the
                "人评价" suffix removed.

    No handling exists for missing elements: a page that lacks the expected
    structure makes the lookups fail with an exception.
    """
    document = bs(html, 'html.parser')
    listing = document.find('div', class_='article').find('ol', class_='grid_view')

    def build_record(entry):
        # Pull the raw strings first, in a fixed order, then clean them up.
        position = entry.find('div', class_='pic').find('em').get_text()
        details = entry.find('div', class_='info')
        name = details.find('div', class_='hd').find('span', class_='title').get_text()
        star_spans = details.find('div', class_='bd').find('div', class_='star').find_all('span')
        star_class = star_spans[0]['class'][0]
        score = star_spans[1].get_text()
        votes = star_spans[3].get_text()
        return {
            "rank": position,
            "title": name,
            # Strip the class-name decoration so only the digits remain.
            "rating_star": star_class.replace("rating", "").replace("-t", ""),
            "rating_score": score,
            "comments": votes.replace("人评价", ""),
        }

    return [build_record(entry) for entry in listing.find_all('div', class_='item')]

In [23]:
import pprint

# Preview the records parsed from the first page only, to check the parser.
pprint.pprint(parse_html(htmls[0]))

[{'comments': '3039510',
  'rank': '1',
  'rating_score': '9.7',
  'rating_star': '5',
  'title': '肖申克的救赎'},
 {'comments': '2246406',
  'rank': '2',
  'rating_score': '9.6',
  'rating_star': '5',
  'title': '霸王别姬'},
 {'comments': '2264643',
  'rank': '3',
  'rating_score': '9.5',
  'rating_star': '5',
  'title': '阿甘正传'},
 {'comments': '2304092',
  'rank': '4',
  'rating_score': '9.5',
  'rating_star': '5',
  'title': '泰坦尼克号'},
 {'comments': '2352968',
  'rank': '5',
  'rating_score': '9.4',
  'rating_star': '45',
  'title': '千与千寻'},
 {'comments': '2389998',
  'rank': '6',
  'rating_score': '9.4',
  'rating_star': '45',
  'title': '这个杀手不太冷'},
 {'comments': '1385643',
  'rank': '7',
  'rating_score': '9.5',
  'rating_star': '5',
  'title': '美丽人生'},
 {'comments': '1970877',
  'rank': '8',
  'rating_score': '9.4',
  'rating_star': '45',
  'title': '星际穿越'},
 {'comments': '2166794',
  'rank': '9',
  'rating_score': '9.4',
  'rating_star': '45',
  'title': '盗梦空间'},
 {'comments': '1824432',
  

In [24]:
# Parse every downloaded page and flatten the per-page record lists into one.
all_data = [record for page in htmls for record in parse_html(page)]
all_data

[{'rank': '1',
  'title': '肖申克的救赎',
  'rating_star': '5',
  'rating_score': '9.7',
  'comments': '3039510'},
 {'rank': '2',
  'title': '霸王别姬',
  'rating_star': '5',
  'rating_score': '9.6',
  'comments': '2246406'},
 {'rank': '3',
  'title': '阿甘正传',
  'rating_star': '5',
  'rating_score': '9.5',
  'comments': '2264643'},
 {'rank': '4',
  'title': '泰坦尼克号',
  'rating_star': '5',
  'rating_score': '9.5',
  'comments': '2304092'},
 {'rank': '5',
  'title': '千与千寻',
  'rating_star': '45',
  'rating_score': '9.4',
  'comments': '2352968'},
 {'rank': '6',
  'title': '这个杀手不太冷',
  'rating_star': '45',
  'rating_score': '9.4',
  'comments': '2389998'},
 {'rank': '7',
  'title': '美丽人生',
  'rating_star': '5',
  'rating_score': '9.5',
  'comments': '1385643'},
 {'rank': '8',
  'title': '星际穿越',
  'rating_star': '45',
  'rating_score': '9.4',
  'comments': '1970877'},
 {'rank': '9',
  'title': '盗梦空间',
  'rating_star': '45',
  'rating_score': '9.4',
  'comments': '2166794'},
 {'rank': '10',
  'title': 

In [25]:
# Sanity check: number of records collected across all downloaded pages.
len(all_data)

250

## 3.将结果存为表格

In [26]:
import pandas as pd

# Build a table from the list of record dicts; each dict key becomes a column.
df = pd.DataFrame(all_data)
df

Unnamed: 0,rank,title,rating_star,rating_score,comments
0,1,肖申克的救赎,5,9.7,3039510
1,2,霸王别姬,5,9.6,2246406
2,3,阿甘正传,5,9.5,2264643
3,4,泰坦尼克号,5,9.5,2304092
4,5,千与千寻,45,9.4,2352968
...,...,...,...,...,...
245,246,谍影重重,45,8.6,451176
246,247,阿飞正传,45,8.5,536763
247,248,朗读者,45,8.6,468593
248,249,隐藏人物,45,8.9,236184


In [27]:
# Save the table as CSV in the working directory, without the row index.
df.to_csv('movie_top250.csv', index=False)

In [29]:
# Save the same table as an Excel workbook, again without the row index.
df.to_excel('movie_top250.xlsx', index=False)