In [1]:
#pip install networkx

In [2]:
import requests
from bs4 import BeautifulSoup
import networkx as nx
import json

# specify the base URL and the range of pages you want to scrape
base_url = "https://movies.yahoo.com.tw/movieinfo_main/"
start_page = 1
end_page = 100

# define a Graph object
G = nx.DiGraph()

# loop through each page and scrape the movie information
for page in range(start_page, end_page+1):
    url = base_url + str(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    if not soup.find("div", class_="movie_intro_info"):
        continue
    # add the URL to the Graph
    G.add_node(url)
    # find the movie information on the page
    doc_id = page
    cname = soup.find("div", class_="movie_intro_info").find("h1").text.strip()
    ename = soup.find("div", class_="movie_intro_info_r").find("h3").text.strip()
    label = soup.find("div", class_="level_name").text.strip()
    intro = soup.find("span", id="story").text.strip().replace('\n\n', '')
    released_date = soup.find("div", class_="movie_intro_info_r").find_all("span")[0].text.strip().replace('上映日期：', '')

    # get all the links on the page and add them to the Graph
    links = []
    for link in soup.find_all("a"):
        href = link.get("href")
        text = link.text.strip()
        if href and text:
            links.append(href)
            G.add_edge(url, href)

    # calculate the PageRank for the page
    pagerank = nx.pagerank(G)

    # create a dictionary for the movie information
    movie_info = {
        "doc_id": doc_id,
        "cname": cname,
        "ename": ename,
        "pagerank": pagerank[url], # use the URL as the key to get the PageRank
        "label": label,
        "intro": intro,
        "released_date": released_date,
        "links": links # store only the hrefs of the links
    }

    # append the movie information to the existing JSON file
    with open("movies_data.json", "a", encoding="utf-8") as f:
        json.dump(movie_info, f, ensure_ascii=False)
        f.write("\n") # add a new line at the end of each JSON object to separate them


In [3]:
#pip install jieba

In [4]:
### 中文分詞後，建立 Inverted Index
import json
import jieba
from collections import defaultdict

# load the movie data from the JSON file
with open('movies_data.json', 'r', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f]

# define a tokenizer function using jieba
def tokenize(text):
    return list(jieba.cut(text))

# build the inverted index
inverted_index = defaultdict(list)
for movie in movies:
    # tokenize the movie intro, cname, ename, and label fields
    tokens = tokenize(movie['intro'] + movie['cname'] + movie['ename'] + movie['label'])
    # add each token to the inverted index along with the movie ID
    for token in tokens:
        inverted_index[token].append(movie['doc_id'])

# save the inverted index to a JSON file
with open('inverted.json', 'w', encoding='utf-8') as f:
    json.dump(inverted_index, f, ensure_ascii=False)
    f.write("\n") # add a new line at the end of each JSON object to separate them

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\layla\AppData\Local\Temp\jieba.cache
Loading model cost 1.020 seconds.
Prefix dict has been built successfully.


In [5]:
### 利用 PageRank 演算法來排序
import json

# read in the movie information from the JSON file
movies = []
with open("movies_data.json", "r", encoding="utf-8") as f:
    for line in f:
        movie_info = json.loads(line)
        movies.append(movie_info)

# sort the movies based on their PageRank scores
movies_sorted = sorted(movies, key=lambda x: x["pagerank"], reverse=True)

# print out the top 10 movies
for i, movie in enumerate(movies_sorted[:]):
    print(f"{i+1}. {movie['cname']} ({movie['released_date']}) - PageRank score: {movie['pagerank']:.4f}")

     

1. 一世狂野 (2001-10-12) - PageRank score: 0.0128
2. 玩命關頭 (2001-10-13) - PageRank score: 0.0104
3. 戰雲密佈 (2001-10-13) - PageRank score: 0.0098
4. 騎士風雲錄 (2001-10-19) - PageRank score: 0.0087
5. 金法尤物 (2001-10-19) - PageRank score: 0.0077
6. 瘋狂世界 (2001-11-02) - PageRank score: 0.0070
7. 震撼教育 (2001-10-26) - PageRank score: 0.0063
8. 神鬼第六感 (2001-10-24) - PageRank score: 0.0059
9. 觸不到的戀人 (2001-11-03) - PageRank score: 0.0055
10. 北京樂與路 (2001-11-10) - PageRank score: 0.0052
11. 晚孃 (2001-11-03) - PageRank score: 0.0051
12. 人間有情天 (2001-11-17) - PageRank score: 0.0049
13. 美國派2 (2001-11-10) - PageRank score: 0.0047
14. 我和吸血鬼有份合約 (2001-11-24) - PageRank score: 0.0045
15. 千禧曼波 (2001-11-17) - PageRank score: 0.0043
16. 救世主 (2001-12-07) - PageRank score: 0.0041
17. 鬼計神偷 (2001-10-06) - PageRank score: 0.0039
18. 少林足球 (2001-08-24) - PageRank score: 0.0038
19. 史瑞克 (2001-06-30) - PageRank score: 0.0035
20. 枕邊陷阱 (2001-09-14) - PageRank score: 0.0034
21. 心花怒放 (2001-05-19) - PageRank score: 0.0033
22. 幽靈人間 (2001-

In [6]:
### 輸出搜尋結果呈現
import json
import re

def search_movies(query):
    # load the movie info from the JSON file
    with open("movies_data.json", "r", encoding="utf-8") as f:
        movies = [json.loads(line) for line in f]

    # create a dictionary of {doc_id: PageRank value} pairs
    pageranks = {movie["doc_id"]: movie["pagerank"] for movie in movies}

    # create a list of movies that match the search query
    matches = [movie for movie in movies if any([query.lower() in str(value).lower() for key, value in movie.items()])]

    # sort the matches by their PageRank values
    sorted_matches = sorted(matches, key=lambda x: pageranks.get(x["doc_id"], 0), reverse=True)

    # calculate precision and recall
    relevant_docs = set([movie['doc_id'] for movie in movies if query.lower() in str(movie).lower()])
    retrieved_docs = set([movie['doc_id'] for movie in matches])
    true_positives = relevant_docs.intersection(retrieved_docs)
    precision = len(true_positives) / len(matches) if len(matches) > 0 else 0
    recall = len(true_positives) / len(relevant_docs) if len(relevant_docs) > 0 else 0

    # print the search results and evaluation metrics
    print("您的搜尋結果 (Sorting by PageRank Value)：")
    print(f"共 {len(sorted_matches)} 筆，符合'{query}'  - - - 共 indexing {len(movies)} 筆電影資料")
    for movie in sorted_matches:
        intro = movie.get("intro", "")
        print(f"{movie['doc_id']} ({pageranks.get(movie['doc_id'], 0)}): {movie['cname']} ({movie['ename']}) - {intro}")

    print(f"\nPrecision: {precision:.0%}")
    print(f"Recall: {recall:.0%}")


In [7]:
### 輸入搜尋關鍵字，任何字都可以
search_movies("黑暗")

您的搜尋結果 (Sorting by PageRank Value)：
共 1 筆，符合'黑暗'  - - - 共 indexing 44 筆電影資料
48 (0.0018431517214199573): 魔戒首部曲 (The Lord of the Rings:The Fellowship of the Ring) - 本片是英國鬼才導演彼得傑克森結合最新電影特效的魔幻新作，根據托爾金暢銷全球的經典小說「魔戒三部曲」改編，並以三部曲的方式拍攝，【首部曲─魔戒現身】描述史前世界中，一位名叫佛羅多巴金的年輕人，無意中得到了一只魔戒。這只戒指擁有無窮的神秘力量，戒指原來是黑暗君王索倫所有的，卻意外地到了佛羅多手裏。佛羅多決定將戒指摧毀，以免索倫奪回去鞏固自己的勢力。索倫為了阻止佛羅多，於是派出了手下的怪獸加以追殺，一場正邪大戰眼看著一觸即發…本片背景是在神秘的史前時代，由一場正邪戰役所引發的長篇故事，這個拯救人類的危險任務落在年輕的哈比族人─佛羅多巴金身上，他從表哥巴伯那裏得到了一指無邪的魔幻戒指。佛羅多發現這只戒指的製造者是黑暗魔君索倫，而索倫正急著要把戒指找回去。因為這只戒指是代表偉大邪惡勢力的魔戒，將使索倫統治下的人民得到解放，而他統治的這片土地就是俗稱的中土世界（Middle Earth）。匆忙之下，佛羅多結合了一些救援力量包括了術士、小精靈、侏儒及人類，一起協助他前往中土世界，將戒指丟入魔宮之洞的末日山脈中加以摧毀。然而，這群義勇之士卻遭到索倫手下怪獸群的攻擊，這些恐怖邪惡的獵殺軍隊殘忍地追殺佛羅多和他的朋友。佛羅多等一群人還必須對抗戒指中的邪惡力量，這股力量會讓人產生難以抵擋的欲望，考驗著每一個接觸戒指者的意志力。

Precision: 100%
Recall: 100%
