# 成為資料分析師 | Python 與資料科學應用

> 網頁資料擷取：隨堂練習參考解答

## 郭耀仁

In [1]:
import requests
from xml.etree import ElementTree as etree
from bs4 import BeautifulSoup
import time
import random

## 隨堂練習：2019-2020 球季 NBA 有幾支球隊？

In [2]:
def number_of_nba_teams(request_url):
    """
    Count the NBA franchises listed in a teams JSON feed.

    The document's ``league.standard`` list is scanned and every entry whose
    ``isNBAFranchise`` value is truthy is counted.

    Args:
        request_url: URL of the teams JSON document.

    Returns:
        int: number of entries flagged as NBA franchises.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.

    >>> number_of_nba_teams("http://data.nba.net/prod/v2/2019/teams.json")
    30
    """
    # Without a timeout, requests waits indefinitely on a silent server.
    response = requests.get(request_url, timeout=30)
    # Fail here on 4xx/5xx instead of with an obscure JSON/KeyError below.
    response.raise_for_status()
    teams = response.json()["league"]["standard"]
    return sum(1 for team in teams if team["isNBAFranchise"])

## 隨堂練習：divName 為 Atlantic 與 Southwest 的球隊有哪些？

In [3]:
def find_atlantic_southwest_teams(request_url):
    """
    Group team full names by division from a teams JSON feed.

    Every entry of ``league.standard`` is grouped under its ``divName``, so
    the returned dict contains all divisions present in the feed (not only
    Atlantic and Southwest); callers pick the keys they need. Names keep the
    order in which they appear in the feed.

    Args:
        request_url: URL of the teams JSON document.

    Returns:
        dict: maps each ``divName`` to a list of ``fullName`` strings.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.

    >>> atlantic_southwest_teams = find_atlantic_southwest_teams("http://data.nba.net/prod/v2/2019/teams.json")
    >>> atlantic_southwest_teams['Atlantic']
    ['Boston Celtics', 'Brooklyn Nets', 'New York Knicks', 'Philadelphia 76ers', 'Toronto Raptors']
    >>> atlantic_southwest_teams['Southwest']
    ['Dallas Mavericks', 'Houston Rockets', 'Memphis Grizzlies', 'New Orleans Pelicans', 'San Antonio Spurs']
    """
    # Without a timeout, requests waits indefinitely on a silent server.
    response = requests.get(request_url, timeout=30)
    # Fail here on 4xx/5xx instead of with an obscure JSON/KeyError below.
    response.raise_for_status()
    teams = response.json()["league"]["standard"]
    teams_by_division = {}
    for team in teams:
        # setdefault creates the division's list on first sight, then appends.
        teams_by_division.setdefault(team["divName"], []).append(team["fullName"])
    return teams_by_division

## 隨堂練習：擷取台北市所有 7-11 商店資訊

In [4]:
def get_tpe_711_stores(request_url):
    """
    Collect 7-11 store records for every Taipei City town (district).

    A ``GetTown`` command is posted first to obtain the town names, then one
    ``SearchStore`` command is posted per town. For each town the ``POIID``,
    ``POIName``, ``X``, ``Y`` and ``Address`` elements of the XML reply are
    read as parallel lists and zipped into one dict per store. ``X`` and
    ``Y`` are divided by 1,000,000 to produce the longitude and latitude.

    Args:
        request_url: URL of the store-map endpoint that accepts the form
            posts.

    Returns:
        dict: maps each town name to a list of store dicts with the keys
        ``POIID``, ``POIName``, ``Longitude``, ``Latitude`` and ``Address``.

    Raises:
        requests.HTTPError: if any reply carries an error status.
        requests.Timeout: if a reply does not arrive within 30 seconds.

    >>> tpe_711_stores = get_tpe_711_stores("https://emap.pcsc.com.tw/EMapSDK.aspx")
    >>> tpe_711_stores["松山區"][0]
    {'POIID': '170945', 'POIName': '上弘', 'Longitude': 121.548287390895, 'Latitude': 25.056390968531797, 'Address': '台北市松山區敦化北路168號B2'}
    >>> tpe_711_stores["信義區"][0]
    {'POIID': '167651', 'POIName': '一零一', 'Longitude': 121.565077, 'Latitude': 25.033373, 'Address': '台北市信義區信義路五段7號35樓'}
    >>> tpe_711_stores["大安區"][0]
    {'POIID': '153319', 'POIName': '大台', 'Longitude': 121.53261437826, 'Latitude': 25.0179598345753, 'Address': '台北市大安區羅斯福路三段283巷14弄16號1樓'}
    """
    # NOTE(review): cityid "01" is presumably the service's code for Taipei
    # City -- confirm against the endpoint.
    form_data = {
        "commandid": "GetTown",
        "cityid": "01"
    }
    response = requests.post(request_url, data=form_data, timeout=30)
    # Fail on 4xx/5xx rather than trying to parse an error page as XML.
    response.raise_for_status()
    tree = etree.fromstring(response.text)
    town_names = [t.text for t in tree.findall(".//TownName")]
    tpe_711_stores = dict()
    for index, town in enumerate(town_names):
        if index:
            # Random 1-5 second pause between consecutive town queries
            # (presumably to avoid hammering the server). Sleeping before
            # every query except the first avoids a useless wait after the
            # final town.
            time.sleep(random.randint(1, 5))
        form_data = {
            "commandid": "SearchStore",
            "city": "台北市",
            "town": town
        }
        response = requests.post(request_url, data=form_data, timeout=30)
        response.raise_for_status()
        tree = etree.fromstring(response.text)
        # The five lists are parallel: position i in each describes store i.
        poi_ids = [t.text.strip() for t in tree.findall(".//POIID")]
        poi_names = [t.text for t in tree.findall(".//POIName")]
        lons = [float(t.text)/1000000 for t in tree.findall(".//X")]
        lats = [float(t.text)/1000000 for t in tree.findall(".//Y")]
        adds = [t.text for t in tree.findall(".//Address")]
        tpe_711_stores[town] = [
            {
                "POIID": poi_id,
                "POIName": poi_name,
                "Longitude": lon,
                "Latitude": lat,
                "Address": add
            }
            for poi_id, poi_name, lon, lat, add in zip(poi_ids, poi_names, lons, lats, adds)
        ]
    return tpe_711_stores

## 隨堂練習：以 `requests` 搭配 `bs4` 擷取 [Avengers: Endgame (2019)](https://www.imdb.com/title/tt4154796) 的劇情類型

In [5]:
def find_endgame_genre(request_url):
    """
    Scrape the genre names from an IMDb title page.

    The texts of all ``.subtext a`` links are collected and the last one is
    dropped (presumably the release-date link rather than a genre -- confirm
    against the page markup). If the selector matches nothing an empty list
    is returned.

    Args:
        request_url: URL of the IMDb title page.

    Returns:
        list: genre names as strings.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.

    >>> find_endgame_genre("https://www.imdb.com/title/tt4154796")
    ['Action', 'Adventure', 'Drama']
    """
    # Ask for English content, the same header get_movie_data_from_url sends,
    # so the scraped text does not depend on the server's language guess.
    request_headers = {
        "accept-language": "en,en-US;q=0.9"
    }
    response = requests.get(request_url, headers=request_headers, timeout=30)
    # Fail on 4xx/5xx rather than scraping an error page.
    response.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed, which makes results environment-dependent and emits a warning.
    soup = BeautifulSoup(response.text, "html.parser")
    link_texts = [e.text for e in soup.select(".subtext a")]
    # Slicing (unlike pop()) does not raise when no link was matched.
    return link_texts[:-1]

## 隨堂練習：以 `requests` 搭配 `bs4` 擷取 [Avengers: Endgame (2019)](https://www.imdb.com/title/tt4154796) 的演員陣容

In [6]:
def find_endgame_cast(request_url):
    """
    Scrape the cast names from an IMDb title page.

    The whitespace-stripped texts of all links matched by
    ``.primary_photo+ td a`` are returned in page order.

    Args:
        request_url: URL of the IMDb title page.

    Returns:
        list: cast member names as strings.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.

    >>> find_endgame_cast("https://www.imdb.com/title/tt4154796")
    ['Robert Downey Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Don Cheadle', 'Paul Rudd', 'Benedict Cumberbatch', 'Chadwick Boseman', 'Brie Larson', 'Tom Holland', 'Karen Gillan', 'Zoe Saldana', 'Evangeline Lilly']
    """
    # Ask for English content, the same header get_movie_data_from_url sends,
    # so the scraped text does not depend on the server's language guess.
    request_headers = {
        "accept-language": "en,en-US;q=0.9"
    }
    response = requests.get(request_url, headers=request_headers, timeout=30)
    # Fail on 4xx/5xx rather than scraping an error page.
    response.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed, which makes results environment-dependent and emits a warning.
    soup = BeautifulSoup(response.text, "html.parser")
    return [e.text.strip() for e in soup.select(".primary_photo+ td a")]

## 隨堂練習：自訂函式 `get_movie_data_from_url(request_url)`

In [7]:
def get_movie_data_from_url(request_url):
    """
    Scrape title, rating, poster, genres and cast from an IMDb title page.

    The page is requested with an English ``accept-language`` header. The
    title comes from the first ``h1``, the rating from the first
    ``strong span`` (converted to float), the poster from the ``src`` of the
    first ``.poster img``, the genres from the ``.subtext a`` links minus the
    last one, and the cast from the ``.primary_photo+ td a`` links.

    Args:
        request_url: URL of the IMDb title page.

    Returns:
        dict: with the keys ``movieTitle`` (str), ``movieRating`` (float),
        ``moviePoster`` (str), ``movieGenre`` (list of str) and
        ``movieCast`` (list of str).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.
        IndexError: if the title, rating or poster element is not found.

    >>> movie_data = get_movie_data_from_url("https://www.imdb.com/title/tt4154796")
    >>> movie_data["moviePoster"]
    'https://m.media-amazon.com/images/M/MV5BMTc5MDE2ODcwNV5BMl5BanBnXkFtZTgwMzI2NzQ2NzM@._V1_UX182_CR0,0,182,268_AL_.jpg'
    >>> movie_data["movieGenre"]
    ['Action', 'Adventure', 'Drama']
    >>> movie_data["movieCast"]
    ['Robert Downey Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Don Cheadle', 'Paul Rudd', 'Benedict Cumberbatch', 'Chadwick Boseman', 'Brie Larson', 'Tom Holland', 'Karen Gillan', 'Zoe Saldana', 'Evangeline Lilly']
    """
    request_headers = {
        "accept-language": "en,en-US;q=0.9"
    }
    response = requests.get(request_url, headers=request_headers, timeout=30)
    # Fail on 4xx/5xx rather than scraping an error page.
    response.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed, which makes results environment-dependent and emits a warning.
    soup = BeautifulSoup(response.text, "html.parser")
    # The heading text contains non-breaking spaces; remove them.
    movie_title = soup.select("h1")[0].text.strip().replace("\xa0", "")
    movie_rating = float(soup.select("strong span")[0].text)
    movie_poster = soup.select(".poster img")[0].get("src")
    # Drop the last ".subtext a" link (presumably the release date, not a
    # genre -- confirm against the page markup); slicing is safe when empty.
    movie_genre = [e.text for e in soup.select(".subtext a")][:-1]
    movie_cast = [e.text.strip() for e in soup.select(".primary_photo+ td a")]
    return {
        'movieTitle': movie_title,
        'movieRating': movie_rating,
        'moviePoster': movie_poster,
        'movieGenre': movie_genre,
        'movieCast': movie_cast
    }

## 隨堂練習：自訂函式 `get_movie_data_from_title(movie_title)`

In [8]:
def get_movie_data_from_title(movie_title):
    """
    Look a movie up on IMDb by title and scrape its title page.

    The title is sent to the IMDb ``/find`` search. The ``href`` of the
    first ``.result_text a`` link is taken as the movie page, and that page
    is scraped by ``get_movie_data_from_url``.

    Args:
        movie_title: text to search for, e.g. "Avengers: Endgame (2019)".

    Returns:
        dict: the movie data dict produced by ``get_movie_data_from_url``
        (keys ``movieTitle``, ``movieRating``, ``moviePoster``,
        ``movieGenre``, ``movieCast``).

    Raises:
        requests.HTTPError: if a request is answered with an error status.
        requests.Timeout: if the search does not answer within 30 seconds.
        IndexError: if the search yields no result link, or the title page
            lacks an expected element.

    >>> movie_data = get_movie_data_from_title("Avengers: Endgame (2019)")
    >>> movie_data["moviePoster"]
    'https://m.media-amazon.com/images/M/MV5BMTc5MDE2ODcwNV5BMl5BanBnXkFtZTgwMzI2NzQ2NzM@._V1_UX182_CR0,0,182,268_AL_.jpg'
    >>> movie_data["movieGenre"]
    ['Action', 'Adventure', 'Drama']
    >>> movie_data["movieCast"]
    ['Robert Downey Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Don Cheadle', 'Paul Rudd', 'Benedict Cumberbatch', 'Chadwick Boseman', 'Brie Larson', 'Tom Holland', 'Karen Gillan', 'Zoe Saldana', 'Evangeline Lilly']
    """
    query_string_parameters = {
        'q': movie_title,
        's': 'tt',
        'ttype': 'ft',
        'ref_': 'fn_ft'
    }
    request_headers = {
        "accept-language": "en,en-US;q=0.9"
    }
    request_url = "https://www.imdb.com/find"
    # Send the English header with the search too, so results do not depend
    # on the server's language guess.
    response = requests.get(
        request_url,
        params=query_string_parameters,
        headers=request_headers,
        timeout=30
    )
    # Fail on 4xx/5xx rather than scraping an error page.
    response.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed, which makes results environment-dependent and emits a warning.
    soup = BeautifulSoup(response.text, "html.parser")
    search_results = [e.get("href") for e in soup.select(".result_text a")]
    if not search_results:
        # Same exception type as indexing an empty list, but with a message
        # that says which title failed.
        raise IndexError("no IMDb search result for {!r}".format(movie_title))
    # The hrefs are site-relative; the first result is taken as the best match.
    movie_url = "https://www.imdb.com" + search_results[0]
    # Delegate the page scraping instead of duplicating it here.
    return get_movie_data_from_url(movie_url)

## 隨堂練習：擷取所有華航機上電影清單

<http://www.fantasy-sky.com/ContentList.aspx?section=002>

In [9]:
def get_ca_movie_titles():
    """
    Collect the movie titles listed on the China Airlines in-flight
    entertainment site.

    Four category pages (``category=00201`` to ``00204``) are requested with
    the ``COOKIE_LANGUAGE`` cookie set to ``en``, and the text of every
    ``.movies-name`` element is gathered in page order.

    Returns:
        list: movie title strings from all four category pages.

    Raises:
        requests.HTTPError: if a page is answered with an error status.
        requests.Timeout: if a page does not answer within 30 seconds.

    >>> ca_movie_titles = get_ca_movie_titles()
    >>> isinstance(ca_movie_titles, list)
    True
    """
    ca_movie_urls = ["http://www.fantasy-sky.com/ContentList.aspx?section=002&category=0020{}".format(i) for i in range(1, 5)]
    ca_movie_titles = []
    for ca_url in ca_movie_urls:
        response = requests.get(ca_url, cookies={'COOKIE_LANGUAGE': 'en'}, timeout=30)
        # Fail on 4xx/5xx rather than scraping an error page.
        response.raise_for_status()
        # Name the parser explicitly: otherwise bs4 picks whichever parser is
        # installed, which makes results environment-dependent.
        soup = BeautifulSoup(response.text, "html.parser")
        ca_movie_titles.extend(e.text for e in soup.select(".movies-name"))
    return ca_movie_titles

## 隨堂練習：找出華航機上最高評等的電影

In [10]:
def find_highest_rated_movies():
    """
    Find the in-flight movie title(s) with the highest IMDb rating.

    Every title from ``get_ca_movie_titles`` is looked up with
    ``get_movie_data_from_title``. Titles whose lookup fails are reported on
    stdout and left out of the ranking. All titles tied for the maximum
    rating are returned, in listing order.

    Returns:
        list: titles sharing the highest rating; empty when there are no
        titles or no lookup succeeded.

    >>> find_highest_rated_movies()
    ['Inception']
    """
    ca_movie_titles = get_ca_movie_titles()
    rated_movies = []  # (title, rating) pairs for successful lookups only
    for movie_title in ca_movie_titles:
        # Progress message: "fetching the rating of <title>".
        print("正在擷取 {} 的評等".format(movie_title))
        try:
            movie_rating = get_movie_data_from_title(movie_title)["movieRating"]
        except Exception:
            # Best-effort: one failed lookup must not abort the whole scan.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit through. Message: "error while fetching <title>".
            print("在擷取 {} 的資訊時產生錯誤".format(movie_title))
            continue
        rated_movies.append((movie_title, movie_rating))
    if not rated_movies:
        # max() would raise on an empty sequence; nothing to rank.
        return []
    max_movie_rating = max(rating for _, rating in rated_movies)
    return [title for title, rating in rated_movies if rating == max_movie_rating]

In [11]:
# %load ../test_cases/test_cases_01.py
import unittest

class TestWebScraping(unittest.TestCase):
    """Checks of the scraping functions against the live web sites."""

    # Inputs and expected values shared by several tests.
    NBA_TEAMS_URL = "http://data.nba.net/prod/v2/2019/teams.json"
    STORE_MAP_URL = "https://emap.pcsc.com.tw/EMapSDK.aspx"
    ENDGAME_URL = "https://www.imdb.com/title/tt4154796"
    ENDGAME_TITLE = "Avengers: Endgame (2019)"
    ENDGAME_POSTER = 'https://m.media-amazon.com/images/M/MV5BMTc5MDE2ODcwNV5BMl5BanBnXkFtZTgwMzI2NzQ2NzM@._V1_UX182_CR0,0,182,268_AL_.jpg'
    ENDGAME_GENRE = ['Action', 'Adventure', 'Drama']
    ENDGAME_CAST = ['Robert Downey Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Don Cheadle', 'Paul Rudd', 'Benedict Cumberbatch', 'Chadwick Boseman', 'Brie Larson', 'Tom Holland', 'Karen Gillan', 'Zoe Saldana', 'Evangeline Lilly']

    def test_number_of_nba_teams(self):
        team_count = number_of_nba_teams(self.NBA_TEAMS_URL)
        self.assertEqual(team_count, 30)

    def test_find_atlantic_southwest_teams(self):
        teams_by_division = find_atlantic_southwest_teams(self.NBA_TEAMS_URL)
        expected_atlantic = ['Boston Celtics', 'Brooklyn Nets', 'New York Knicks', 'Philadelphia 76ers', 'Toronto Raptors']
        expected_southwest = ['Dallas Mavericks', 'Houston Rockets', 'Memphis Grizzlies', 'New Orleans Pelicans', 'San Antonio Spurs']
        self.assertEqual(teams_by_division['Atlantic'], expected_atlantic)
        self.assertEqual(teams_by_division['Southwest'], expected_southwest)

    def test_get_tpe_711_stores(self):
        stores_by_town = get_tpe_711_stores(self.STORE_MAP_URL)
        # Expected first store of each checked town, in the original order.
        expected_first_stores = [
            ("松山區", {'POIID': '170945', 'POIName': '上弘', 'Longitude': 121.548287390895, 'Latitude': 25.056390968531797, 'Address': '台北市松山區敦化北路168號B2'}),
            ("信義區", {'POIID': '167651', 'POIName': '一零一', 'Longitude': 121.565077, 'Latitude': 25.033373, 'Address': '台北市信義區信義路五段7號35樓'}),
            ("大安區", {'POIID': '153319', 'POIName': '大台', 'Longitude': 121.53261437826, 'Latitude': 25.0179598345753, 'Address': '台北市大安區羅斯福路三段283巷14弄16號1樓'}),
        ]
        for town, expected_store in expected_first_stores:
            self.assertEqual(stores_by_town[town][0], expected_store)

    def test_find_endgame_genre(self):
        self.assertEqual(find_endgame_genre(self.ENDGAME_URL), self.ENDGAME_GENRE)

    def test_find_endgame_cast(self):
        self.assertEqual(find_endgame_cast(self.ENDGAME_URL), self.ENDGAME_CAST)

    def test_get_movie_data_from_url(self):
        scraped = get_movie_data_from_url(self.ENDGAME_URL)
        self.assertEqual(scraped["moviePoster"], self.ENDGAME_POSTER)
        self.assertEqual(scraped["movieGenre"], self.ENDGAME_GENRE)
        self.assertEqual(scraped["movieCast"], self.ENDGAME_CAST)

    def test_get_movie_data_from_title(self):
        scraped = get_movie_data_from_title(self.ENDGAME_TITLE)
        self.assertEqual(scraped["moviePoster"], self.ENDGAME_POSTER)
        self.assertEqual(scraped["movieGenre"], self.ENDGAME_GENRE)
        self.assertEqual(scraped["movieCast"], self.ENDGAME_CAST)

    def test_get_ca_movie_titles(self):
        titles = get_ca_movie_titles()
        self.assertIsInstance(titles, list)

# Collect every test method of TestWebScraping into one suite.
suite = unittest.TestLoader().loadTestsFromTestCase(TestWebScraping)
# verbosity=2 makes the text runner report each test on its own line.
runner = unittest.TextTestRunner(verbosity=2)
# Run the suite; the result object is kept in test_results.
test_results = runner.run(suite)

test_find_atlantic_southwest_teams (__main__.TestWebScraping) ... ok
test_find_endgame_cast (__main__.TestWebScraping) ... ok
test_find_endgame_genre (__main__.TestWebScraping) ... ok
test_get_ca_movie_titles (__main__.TestWebScraping) ... ok
test_get_movie_data_from_title (__main__.TestWebScraping) ... ok
test_get_movie_data_from_url (__main__.TestWebScraping) ... ok
test_get_tpe_711_stores (__main__.TestWebScraping) ... ok
test_number_of_nba_teams (__main__.TestWebScraping) ... ok

----------------------------------------------------------------------
Ran 8 tests in 59.364s

OK
