In [28]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [37]:
def get_webpage_section(soup, tag, attr_type, val):
    """Filter a parsed page down to the element(s) matching a tag and attribute.

    Args:
        soup: Object exposing ``find`` and ``find_all`` (e.g. a BeautifulSoup
            document or tag).
        tag: Tag name to search for, e.g. ``"div"``.
        attr_type: Which attribute to match on; either ``"id"`` or ``"class"``.
        val: Value the chosen attribute must have.

    Returns:
        For ``"id"``: whatever ``soup.find`` returns (a single match or ``None``).
        For ``"class"``: whatever ``soup.find_all`` returns (a list-like of
        matches, possibly empty).

    Raises:
        ValueError: If ``attr_type`` is neither ``"id"`` nor ``"class"``.
    """
    if attr_type == "id":
        return soup.find(tag, id=val)
    if attr_type == "class":
        # `class` is a reserved word in Python, so bs4 spells the keyword `class_`.
        return soup.find_all(tag, class_=val)
    # Previously this fell through to `return result` with `result` unbound.
    raise ValueError(
        "attr_type must be 'id' or 'class', got {!r}".format(attr_type)
    )

In [95]:
def get_movie_pages(menuUrl, timeout=10):
    """Walk a paginated movie listing and collect one entry per movie.

    Args:
        menuUrl: URL of the first listing page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``[ChineseName, EnglishName, Url]`` lists, in page order.
        Names have all whitespace removed. ``EnglishName`` is ``''`` when the
        entry has only one link.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    moviePages = []
    visited = set()
    # Stop when there is no next page OR when the "next" link leads back to a
    # page we already fetched (otherwise the loop would never terminate).
    while menuUrl not in visited:
        visited.add(menuUrl)
        r = requests.get(menuUrl, timeout=timeout)
        soup = BeautifulSoup(r.text, "html.parser")
        movieMenu = get_webpage_section(soup, "div", "class", "release_movie_name")
        for movie in movieMenu:
            links = movie.select("a")  # query once instead of three times
            if not links:
                continue  # nothing to record for an entry without links
            movieChiName = re.sub(r'\s', '', links[0].text)
            movieEngName = re.sub(r'\s', '', links[1].text) if len(links) > 1 else ''
            movieUrl = links[0]['href']
            moviePages.append([movieChiName, movieEngName, movieUrl])
        # Move on to the next listing page, if any
        nextPage = get_webpage_section(soup, "li", "class", "nexttxt")
        nextLink = nextPage[0].find("a") if nextPage else None
        if nextLink is None:
            break
        # urljoin leaves absolute links untouched and resolves relative ones.
        menuUrl = urljoin(menuUrl, nextLink['href'])
    return moviePages

In [97]:
def get_movie_info(movieInfo, timeout=10):
    """Fetch a movie's detail page and append its categories, release date and story.

    Args:
        movieInfo: ``[ChineseName, EnglishName, Url]``; ``movieInfo[2]`` is the
            URL of the movie's detail page. The list is extended IN PLACE.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The same ``movieInfo`` list, now with three more items appended:
        a list of category names (whitespace removed), the raw release-date
        text, and the storyline (whitespace removed). A section that is not
        found on the page contributes ``[]`` / ``''`` so the list layout stays
        the same for every movie.
    """
    r = requests.get(movieInfo[2], timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")

    # Categories
    movieClassSec = get_webpage_section(soup, "div", "class", "level_name_box")
    categories = []
    if movieClassSec:
        categories = [re.sub(r'\s', '', movieCls.text)
                      for movieCls in movieClassSec[0].select("a")]
    movieInfo.append(categories)

    # Release date (first <span> inside the intro box)
    moviePage = get_webpage_section(soup, "div", "class", "movie_intro_info_r")
    releaseSpan = moviePage[0].find("span") if moviePage else None
    movieInfo.append(releaseSpan.text if releaseSpan is not None else '')

    # Storyline
    movieStory = get_webpage_section(soup, "span", "id", "story")
    movieInfo.append(re.sub(r'\s', '', movieStory.text) if movieStory is not None else '')

    return movieInfo

In [45]:
def pack_movie_info(movieList):
    """Fetch details for every movie and package them for JSON output.

    Args:
        movieList: Iterable of ``[ChineseName, EnglishName, Url]`` lists; each
            one is passed to ``get_movie_info``.

    Returns:
        A ``(rows, packed)`` tuple. ``rows`` holds one list per movie with the
        URL (index 2) dropped. ``packed`` maps the movie's position as a string
        (``"0"``, ``"1"``, ...) to a dict of labelled fields; the release date
        has its first five characters sliced off.
    """
    labels = (
        'Chinese Name',
        'English Name',
        'Movie Categories',
        'Release Date',
        'Storyline Intro',
    )

    rows = []
    for entry in movieList:
        detailed = get_movie_info(entry)
        # Keep everything except the URL sitting at index 2.
        rows.append(detailed[:2] + detailed[3:])

    packed = {}
    for number, fields in enumerate(rows):
        record = {}
        # zip stops at the shorter sequence, so missing or extra fields are ignored.
        for label, value in zip(labels, fields):
            record[label] = value[5:] if label == 'Release Date' else value
        packed[str(number)] = record

    return rows, packed

In [96]:
def main():
    """Scrape the Yahoo! Movies Taiwan listings and dump them to ``movie_info.txt``.

    Steps: fetch the home page, pick the menu links whose text contains one of
    three listing names, collect every movie from those listings, fetch each
    movie's details, and write the result as indented JSON.

    Raises:
        RuntimeError: If the home page has no element with id ``mainmenu``.
    """
    r = requests.get("https://movies.yahoo.com.tw/", timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    menuSection = get_webpage_section(soup, "ul", "id", "mainmenu")
    if menuSection is None:
        raise RuntimeError(
            "could not find the 'mainmenu' list on the Yahoo movies home page"
        )

    # Collect the URLs of the three listing menus
    mainmenu = dict()
    for s in menuSection.select("a"):
        for t in ['本週新片', '上映中', '即將上映']:
            if t in s.text:  # link text names one of the wanted listings
                mainmenu[t] = s['href']

    # Walk every listing and gather the movie detail-page URLs
    movieList = list()
    for url in mainmenu.values():
        movieList += get_movie_pages(url)

    # Fetch details; only the JSON-ready dict is needed here
    _, movieJson = pack_movie_info(movieList)

    # Open the output only once the data is ready, so a failed scrape does not
    # truncate an existing file, and let `with` close it even if dumping fails.
    # NOTE(review): the file uses the platform default encoding while
    # ensure_ascii=False writes raw non-ASCII text; consider an explicit
    # encoding here and wherever the file is read back.
    with open('movie_info.txt', 'w') as file:
        json.dump(movieJson, file, indent=4, ensure_ascii=False)
# Run the scraper defined above when this cell is executed.
main()

http://movies.yahoo.com.tw/movie_intheaters.html?page=2
http://movies.yahoo.com.tw/movie_intheaters.html?page=3
http://movies.yahoo.com.tw/movie_intheaters.html?page=4
http://movies.yahoo.com.tw/movie_intheaters.html?page=5
http://movies.yahoo.com.tw/movie_intheaters.html?page=6
http://movies.yahoo.com.tw/movie_intheaters.html?page=7
http://movies.yahoo.com.tw/movie_comingsoon.html?page=2
http://movies.yahoo.com.tw/movie_comingsoon.html?page=3
http://movies.yahoo.com.tw/movie_comingsoon.html?page=4
http://movies.yahoo.com.tw/movie_comingsoon.html?page=5
http://movies.yahoo.com.tw/movie_comingsoon.html?page=6


In [18]:
# Show the scraped output; `with` closes the handle even if reading or printing fails.
with open("movie_info.txt", "r") as file:
    print(file.read())




In [49]:
# Scratch check: an empty list compares equal to the literal [].
li = []
print(li == [])

True
