In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import time

In [2]:
def get_webpage_section(soup, tag, attr_type, val):
    """Pick out the part of a parsed page identified by a tag name plus an id or class.

    Args:
        soup: Parsed document (or element) exposing ``find`` and ``find_all``.
        tag: Tag name to look for, e.g. ``"div"``.
        attr_type: Which attribute to match on: ``"id"`` or ``"class"``.
        val: The id or class value to match.

    Returns:
        For ``"id"``: whatever ``soup.find`` returns (a single element, or
        ``None`` when nothing matches).
        For ``"class"``: whatever ``soup.find_all`` returns (a list-like
        collection, empty when nothing matches).

    Raises:
        ValueError: If ``attr_type`` is neither ``"id"`` nor ``"class"``.
    """
    if attr_type == "id":
        return soup.find(tag, id=val)
    if attr_type == "class":
        # ``class`` is a reserved word in Python, so bs4 spells the keyword ``class_``.
        return soup.find_all(tag, class_=val)
    # Fail loudly with the offending value instead of an UnboundLocalError on ``result``.
    raise ValueError("attr_type must be 'id' or 'class', got %r" % (attr_type,))

In [3]:
def get_movie_pages(menuUrl, timeout=10):
    """Walk a paginated movie listing and collect every movie's name and detail URL.

    Args:
        menuUrl: URL of the first listing page.
        timeout: Seconds passed to ``requests.get`` for each page so that a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        A list of ``[ChineseName, Url]`` pairs in page order. All whitespace is
        stripped from the name.
    """
    moviePages = []
    visitedUrls = set()
    # Stop as soon as pagination points back at a page that was already crawled;
    # otherwise a circular "next" link would loop forever.
    while menuUrl not in visitedUrls:
        visitedUrls.add(menuUrl)
        r = requests.get(menuUrl, timeout=timeout)
        soup = BeautifulSoup(r.text, "html.parser")

        for movie in get_webpage_section(soup, "div", "class", "release_movie_name"):
            links = movie.select("a")
            # Skip malformed entries that carry no usable link instead of crashing.
            if not links or not links[0].get("href"):
                continue
            # The first anchor holds the Chinese title; the English title is
            # scraped later from the movie's own detail page.
            movieChiName = re.sub(r'\s', '', links[0].text)
            moviePages.append([movieChiName, links[0]["href"]])

        # Follow the "next page" link if there is one; otherwise we are done.
        nextPage = get_webpage_section(soup, "li", "class", "nexttxt")
        nextLink = nextPage[0].find("a") if nextPage else None
        if nextLink is None or not nextLink.get("href"):
            break
        menuUrl = nextLink["href"]
    return moviePages

In [28]:
def get_movie_info(movieInfo, timeout=10):
    """Fetch a movie's detail page and append its scraped fields to ``movieInfo``.

    Args:
        movieInfo: ``[ChineseName, Url]``. The list is extended IN PLACE, so
            calling this twice on the same list appends the fields twice.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl.

    Returns:
        The same list object, now
        ``[ChineseName, Url, EnglishName, Categories, ReleaseText, Story]``.
        Any field that cannot be found on the page is the string ``'None'``
        (``['None']`` for the categories list).
    """
    r = requests.get(movieInfo[1], timeout=timeout)  # movieInfo[1]: detail page URL
    soup = BeautifulSoup(r.text, "html.parser")

    # The intro box supplies both the English title (<h3>) and the release
    # date (first <span>), so look it up once and reuse it.
    introSec = get_webpage_section(soup, "div", "class", "movie_intro_info_r")
    introBox = introSec[0] if introSec else None

    # English name; guarded because the box may exist without an <h3>.
    titleTag = introBox.find("h3") if introBox is not None else None
    englishName = titleTag.text if titleTag is not None else 'None'
    movieInfo.append(re.sub(r'\s', '', englishName))

    # Categories
    movieClassSec = get_webpage_section(soup, "div", "class", "level_name_box")
    if movieClassSec:
        movieInfo.append([re.sub(r'\s', '', movieCls.text) for movieCls in movieClassSec[0].select("a")])
    else:
        movieInfo.append(['None'])

    # Release date text, kept verbatim; guarded like the title above.
    dateTag = introBox.find("span") if introBox is not None else None
    movieInfo.append(dateTag.text if dateTag is not None else 'None')

    # Storyline: an id lookup yields a single element or None, never a list.
    movieStory = get_webpage_section(soup, "span", "id", "story")
    if movieStory is not None:
        movieInfo.append(re.sub(r'\s', '', movieStory.text))
    else:
        movieInfo.append('None')

    return movieInfo

In [5]:
def pack_movie_info(movieList):
    """Scrape every movie in ``movieList`` and shape the results for JSON output.

    Args:
        movieList: Iterable of ``[Name, Url]`` entries; each one is handed to
            ``get_movie_info``.

    Returns:
        A ``(movieInfoList, movieJson)`` tuple.
        ``movieInfoList`` holds one list per movie: the scraped record with its
        URL (index 1) removed.
        ``movieJson`` maps the movie's position, as a string, to a dict keyed by
        the labels below. The release date value has its first five characters
        cut off (presumably a fixed label prefix on the page; confirm).
    """
    fieldLabels = (
        'Chinese Name',
        'English Name',
        'Movie Categories',
        'Release Date',
        'Storyline Intro',
    )

    # First pass: fetch details for every movie, dropping the URL column.
    movieInfoList = []
    for movie in movieList:
        details = get_movie_info(movie)
        movieInfoList.append(details[:1] + details[2:])

    # Second pass: pair each field with its label, position by position.
    movieJson = {}
    for position, record in enumerate(movieInfoList):
        entry = {}
        for label, value in zip(fieldLabels, record):
            entry[label] = value[5:] if label == 'Release Date' else value
        movieJson[str(position)] = entry

    return movieInfoList, movieJson

In [6]:
def get_actors(movieList, ActorSet, timeout=10, maxActors=2500):
    """Collect the not-yet-seen actor page URLs linked from a list of movies.

    Args:
        movieList: ``[[ChineseName, Url], ...]``; each movie page is fetched.
        ActorSet: Set of actor URLs already collected. It is updated IN PLACE
            so that later calls skip actors found by earlier ones.
        timeout: Seconds passed to ``requests.get`` for each movie page.
        maxActors: Stop visiting further movies once ``ActorSet`` holds at
            least this many entries. The cap is checked once per movie, so the
            last movie visited may push the set past it.

    Returns:
        A list of actor page URLs newly added by this call, in discovery order.
    """
    ActorsUrlList = []
    for movie in movieList:  # movie: [Name, Url]
        if len(ActorSet) >= maxActors:
            break
        r = requests.get(movie[1], timeout=timeout)
        soup = BeautifulSoup(r.text, "html.parser")
        movieActors = get_webpage_section(soup, "ul", "class", "starlist")
        if not movieActors:
            continue
        for actor in movieActors[0].find_all("a"):
            # De-duplicate on the URL itself (the value we actually return)
            # rather than on the anchor element, and skip anchors with no href.
            actorUrl = actor.get("href")
            if not actorUrl or actorUrl in ActorSet:
                continue
            ActorSet.add(actorUrl)
            ActorsUrlList.append(actorUrl)
    return ActorsUrlList

In [7]:
def get_movie_url_from_actors(actorsUrl, movieSet, timeout=10):
    """List the movies linked from one actor page that have not been seen yet.

    Args:
        actorsUrl: URL of the actor's page.
        movieSet: Set of ``(Name, Url)`` tuples already known. It is updated
            IN PLACE with every movie this call returns.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl.

    Returns:
        A list of ``[Name, Url]`` pairs for the newly discovered movies, in
        page order. All whitespace is stripped from the name.
    """
    moviePages = []
    r = requests.get(actorsUrl, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")
    for movie in get_webpage_section(soup, "ul", "class", "trailer_list"):
        for m in movie.select("a"):
            movieUrl = m.get("href")
            # An anchor without an href cannot be crawled later; skip it
            # instead of raising KeyError.
            if not movieUrl:
                continue
            newMovie = (re.sub(r'\s', '', m.text), movieUrl)
            if newMovie not in movieSet:
                movieSet.add(newMovie)
                moviePages.append(list(newMovie))
    return moviePages

In [31]:
def _collect_movies_via_actors(seedMovies, ActorSet, movieSet):
    """Run one expansion layer: seed movies -> their actors -> those actors' other movies."""
    actorUrls = get_actors(seedMovies, ActorSet)
    discovered = []
    print("Start getting movies from Actors...")
    for url in actorUrls:
        # Only movies not already in movieSet are returned (and then recorded there).
        discovered += get_movie_url_from_actors(url, movieSet)
        print(str(len(discovered)) + "...")
    return discovered


def main():
    """Crawl Yahoo Movies and write the scraped movie details to ``movie_info.txt``.

    Steps:
        1. Read the site's main menu and keep the links for the three listing
           categories (new this week / now showing / coming soon).
        2. Collect every movie on those listings.
        3. Expand twice through actors: movies -> actors -> their other movies.
        4. Scrape details for all collected movies and dump them as JSON.

    Raises:
        RuntimeError: If the main menu element cannot be found on the home page.
    """
    start = time.time()  # for debugging: total run time

    r = requests.get("https://movies.yahoo.com.tw/", timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    menuSec = get_webpage_section(soup, "ul", "id", "mainmenu")
    if menuSec is None:
        # An id lookup yields None when nothing matches; say so explicitly
        # rather than failing on ``None.select``.
        raise RuntimeError("could not find the 'mainmenu' element on the home page")

    # Keep the URL of each of the three listing menus.
    mainmenu = dict()
    for s in menuSec.select("a"):
        for t in ['本週新片', '上映中', '即將上映']:
            if t in s.text:
                mainmenu[t] = s['href']

    # Walk each listing and gather the movies' detail page URLs.
    movieList = list()
    for url in mainmenu.values():
        movieList += get_movie_pages(url)
    movieSet = set(tuple(movie) for movie in movieList)

    # Two layers of expansion through actors' other works; each layer is
    # seeded with the movies discovered by the previous one.
    ActorSet = set()
    allMovies = list(movieList)
    frontier = movieList
    for _ in range(2):
        frontier = _collect_movies_via_actors(frontier, ActorSet, movieSet)
        allMovies += frontier

    movieInfoList, movieJson = pack_movie_info(allMovies)

    # ensure_ascii=False writes the Chinese text as-is, so pin the encoding
    # instead of relying on the platform default; ``with`` closes the file
    # even if serialisation fails.
    with open('movie_info.txt', 'w', encoding='utf-8') as file:
        json.dump(movieJson, file, indent=4, ensure_ascii=False)

    end = time.time()  # for debugging: total run time
    print("Time: %f 秒" % (end - start))


# Still runs when the cell is executed (a notebook's __name__ is "__main__"),
# but not if this code is imported from elsewhere.
if __name__ == "__main__":
    main()

Start getting movies from Actors...
21...
45...
54...
74...
81...
82...
105...
108...
120...
137...
139...
150...
150...
154...
154...
160...
164...
185...
204...
206...
207...
219...
221...
233...
237...
250...
260...
273...
282...
304...
315...
316...
329...
332...
335...
337...
337...
337...
337...
337...
337...
343...
345...
348...
353...
369...
398...
402...
410...
414...
420...
440...
441...
448...
449...
449...
449...
450...
469...
485...
490...
497...
506...
509...
512...
523...
535...
553...
554...
555...
556...
572...
581...
597...
598...
620...
621...
638...
656...
665...
672...
679...
701...
722...
722...
722...
728...
742...
742...
742...
743...
743...
760...
765...
782...
793...
798...
799...
804...
809...
813...
838...
844...
849...
852...
853...
854...
865...
873...
874...
879...
888...
890...
902...
924...
936...
941...
950...
953...
953...
955...
957...
958...
981...
991...
993...
998...
1003...
1009...
1012...
1014...
1016...
1019...
1029...
1052...
1061...
1064...
1

KeyboardInterrupt: 