<a href="https://colab.research.google.com/github/GummyBear-w/aop113b/blob/main/HW02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HW02 Web Crawler

完整的程式碼、csv檔及更多介紹可以在我的[Github](https://github.com/GummyBear-w/Ebook_Price_Scraper)查看（搭配服用效果更佳）

## 題目

**Kobo電子書價格追蹤**

## 摘要

透過網路爬蟲技術每日自動蒐集Kobo電子書價格資料，並進行視覺化分析，建立一套價格波動追蹤系統。


## 引言

我本身有在Kobo平台上購買電子書的習慣，希望能夠透過爬蟲技術幫助自己更方便地追蹤感興趣的書籍價格，收集資料來了解價格波動趨勢、歷史低價、折扣週期，以便找到最優惠的購買價格。

## 方法

- 目標網站描述
  - 目標網站：[Kobo.com台灣商店](https://www.kobo.com/tw)
  
  - 頁面結構：包含書籍名稱、圖片、價格等資訊。

- 工具與技術
  - 語言與函式庫：Python（Selenium、Plotly、Pandas、CSV）

  - 自動化排程與部署：GitHub Actions（以 cron 語法設定每日排程）、Token 權限設定

  - 資料儲存：CSV

  - 視覺化頁面生成：Bootstrap + HTML + Plotly 圖表 iframe 內嵌

In [None]:
# 爬取資料程式碼
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import os
import random
from datetime import datetime

def random_sleep(a=1.5, b=3.0):
    """Block for a random duration drawn uniformly between *a* and *b* seconds."""
    delay_seconds = random.uniform(a, b)
    time.sleep(delay_seconds)

def get_all_book_links(driver, wait, base_url, max_pages=10):
    """Collect the unique book detail links from a paginated search result.

    Starting at *base_url*, the links on each result page are gathered and the
    "next page" button is clicked until one of the stop conditions is met:
    the button is disabled, the button is missing, the URL does not change
    after the click, or *max_pages* pages have been processed.

    Args:
        driver: Selenium WebDriver used to load and navigate the pages.
        wait: Accepted for signature parity with the other helpers; it is not
            used inside this function.
        base_url: URL of the first search-result page.
        max_pages: Upper bound on the number of pages processed, so that a
            scheduled run that misbehaves cannot keep requesting the site.

    Returns:
        A list of hrefs in first-seen order, without duplicates.
    """
    driver.get(base_url)
    book_links = []
    seen_links = set()  # O(1) duplicate check; book_links keeps the order
    page_num = 1

    current_url = driver.current_url
    while page_num <= max_pages:
        print(f"正在處理第 {page_num} 頁...")
        random_sleep(2, 4)

        books = driver.find_elements(By.CSS_SELECTOR, "div.book-card h2 a.cdk-link")
        for book in books:
            try:
                href = book.get_attribute("href")
            except StaleElementReferenceException:
                # The element was replaced in the DOM while iterating; skip it.
                continue
            if href and href not in seen_links:
                seen_links.add(href)
                book_links.append(href)

        try:
            next_button = driver.find_element(
                By.CSS_SELECTOR, 'button.control-button[aria-label="next page"]'
            )
        except NoSuchElementException:
            print("❌ 找不到下一頁按鈕，結束")
            break

        # get_attribute returns None for a missing attribute, so fall back to
        # "" before the substring test to avoid a TypeError.
        is_disabled = (
            next_button.get_attribute("aria-disabled") == "true"
            or "disabled" in (next_button.get_attribute("class") or "")
        )
        if is_disabled:
            print("✅ 已到最後一頁")
            break

        # Click through JavaScript after scrolling the button into view.
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        random_sleep(0.8, 1.5)
        driver.execute_script("arguments[0].click();", next_button)
        random_sleep(2, 4)

        # An unchanged URL after the click is treated as the last page.
        new_url = driver.current_url
        if new_url == current_url:
            print("✅ 網址沒有變化，應是最後一頁")
            break
        current_url = new_url

        page_num += 1

    return book_links

# 抓取內頁詳細資訊，包括書名、價位、圖片、isbn碼
def extract_book_info(driver, wait, url):
    """Open a book detail page and scrape its title, price, book ID and cover.

    Every field is scraped independently and on a best-effort basis: when a
    field cannot be read, a placeholder string is stored instead and the
    remaining fields are still attempted.

    Args:
        driver: Selenium WebDriver used to load the page.
        wait: WebDriverWait used to wait for each element to be present.
        url: URL of the book detail page.

    Returns:
        A dict with the keys "書名", "價格", "ISBN", "封面照片" and "連結".
    """
    driver.get(url)

    # `except Exception` keeps the best-effort behaviour for any scraping
    # failure while letting KeyboardInterrupt / SystemExit propagate, which a
    # bare `except:` would have swallowed.
    try:
        title = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.item-info h1.title"))
        ).text
    except Exception:
        title = "無法取得書名"

    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pricing-figures")))
        random_sleep(1, 2)
        price_elem = driver.find_element(By.CSS_SELECTOR, "div.pricing-figures span.price")
        # Read textContent through JavaScript instead of `.text`.
        # NOTE(review): presumably because the price node may not be visibly
        # rendered — confirm against the live page.
        price_text = driver.execute_script("return arguments[0].textContent;", price_elem).strip()
        # Strip the currency prefix and thousands separators.
        price = price_text.replace("NT$", "").replace(",", "").strip()
    except Exception:
        price = "無法取得價格"

    try:
        isbn = "無法取得"
        lis = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div.bookitem-secondary-metadata li")))
        # The value is taken from the first metadata entry labelled "書籍ID：".
        for li in lis:
            if "書籍ID：" in li.text:
                isbn = li.text.replace("書籍ID：", "").strip()
                break
    except Exception:
        isbn = "無法取得 ISBN"

    try:
        image_url = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.item-image img.cover-image"))).get_attribute("src")
    except Exception:
        image_url = "無法取得圖片"

    return {
        "書名": title,
        "價格": price,
        "ISBN": isbn,
        "封面照片": image_url,
        "連結": url
    }

# 主流程
# Configure a headless Chrome session with a fixed window size and a
# desktop user-agent string.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--window-size=1920,1080")
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
driver = webdriver.Chrome(options=options)

# Everything after the browser is started runs inside try/finally so the
# Chrome process is always shut down, even when scraping raises.
try:
    wait = WebDriverWait(driver, 10)

    # Author name -> Kobo search URL for that author.
    authors = {
        "卡繆": "https://www.kobo.com/tw/zh/search?query=%E5%8D%A1%E7%B9%86&ac=1&acp=%E5%8D%A1%E7%B9%86&ac.author=%E5%8D%A1%E7%B9%86&acpos=a2&uir=true&fclanguages=zh&fcsearchfield=author",
        "簡媜": "https://www.kobo.com/tw/zh/search?query=%E7%B0%A1%E5%AA%9C&ac=1&ac.morein=true&ac.author=%E7%B0%A1%E5%AA%9C&fcsearchfield=author&fclanguages=zh",
        "赫曼．赫塞": "https://www.kobo.com/tw/zh/search?query=%E8%B5%AB%E6%9B%BC%EF%BC%8E%E8%B5%AB%E5%A1%9E+&ac=1&ac.morein=true&ac.author=%E8%B5%AB%E6%9B%BC%EF%BC%8E%E8%B5%AB%E5%A1%9E+&fcsearchfield=author&fclanguages=zh"
    }

    csv_file = "book_prices.csv"
    csv_headers = ["日期", "作者", "書名", "價格", "ISBN", "封面照片", "連結"]

    # Write the header row only when the CSV does not exist yet; later runs
    # append to the same file.
    if not os.path.exists(csv_file):
        with open(csv_file, mode='w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=csv_headers)
            writer.writeheader()

    today = datetime.now().strftime("%Y-%m-%d")

    for author, url in authors.items():
        print(f"\n=== 處理作者：{author} ===")
        book_links = get_all_book_links(driver, wait, url)
        print(f"共取得 {len(book_links)} 筆連結")

        for idx, link in enumerate(book_links, 1):
            print(f"→ 第 {idx} 本書：{link}")
            info = extract_book_info(driver, wait, link)
            info["日期"] = today
            info["作者"] = author

            # Append one row per book so rows already scraped are on disk
            # if a later page fails.
            with open(csv_file, mode='a', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=csv_headers)
                writer.writerow(info)

    print("\n✅ 資料已儲存至 book_prices.csv")
finally:
    driver.quit()

## 結果


- 每日自動爬取資料
  - 每日透過 GitHub Actions 自動執行爬蟲程式，長期累積以利後續追蹤變動趨勢
- 資料存儲方式
  - 將資料存儲為 CSV 格式，方便後續分析
- 簡易的視覺化呈現
  - 利用 Plotly 每日更新各書的價格折線圖（以ISBN區分）
  - 利用 GitHub Pages 自動部署成[公開網頁](https://gummybear-w.github.io/Ebook_Price_Scraper/)

In [None]:
# 展示資料程式碼
import html
import os

import pandas as pd
import plotly.express as px

# Load the scraped history, parse dates, coerce prices to numbers
# (unparseable prices become NaN) and drop rows lacking a price or an ISBN.
csv_file = "book_prices.csv"
df = (
    pd.read_csv(csv_file)
    .assign(
        **{
            "日期": lambda frame: pd.to_datetime(frame["日期"]),
            "價格": lambda frame: pd.to_numeric(frame["價格"], errors="coerce"),
        }
    )
    .dropna(subset=["價格", "ISBN"])
)

# Make sure the output folder exists before any chart is written.
os.makedirs("docs", exist_ok=True)

# One price line chart per ISBN; books with fewer than two rows get no chart.
isbn_to_plot_path = {}
for book_id, history in df.groupby("ISBN"):
    if len(history) < 2:
        continue

    chart = px.line(history, x="日期", y="價格")
    # The top margin keeps the chart from being squeezed.
    chart.update_layout(margin=dict(t=40), xaxis_tickformat="%Y-%m-%d")
    chart.update_traces(
        mode="lines+markers",
        hovertemplate="日期：%{x}<br>價格：NT$%{y}<extra></extra>",
    )

    file_name = f"plot_{book_id}.html"
    chart.write_html(f"docs/{file_name}", include_plotlyjs="cdn", full_html=False)
    isbn_to_plot_path[book_id] = file_name

# Rows from the most recent date supply each book's latest price.
latest_date = df["日期"].max()
latest_df = df.loc[df["日期"].eq(latest_date)]

# Lowest recorded price per ISBN.
min_price = df.groupby("ISBN")["價格"].min()

# Dropdown entries: the "all authors" item followed by the sorted author names.
authors = ["全部作者", *sorted(df["作者"].unique())]

# Generate docs/index.html: an author dropdown, a title search box and one
# card per book in latest_df. Every scraped value is passed through
# html.escape before it is interpolated, so characters such as & < > " in a
# title, author or URL cannot break the markup.
with open("docs/index.html", "w", encoding="utf-8") as f:
    f.write("""
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8">
    <title>電子書價格追蹤</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        a.card-title-link {
            text-decoration: none;
            color: black;
        }
        a.card-title-link:hover {
            color: #555;
        }
    </style>

</head>
<body class="bg-light">
<div class="container py-4">
    <h1 class="mb-4">電子書價格追蹤</h1>
    <div class="mb-4">
        <label for="authorSelect" class="form-label">下拉式選單 選擇作者：</label>
        <select class="form-select" id="authorSelect" onchange="filterBooks()">
""")
    for author in authors:
        # The "all authors" entry uses the sentinel value "all", which the
        # filter script treats as "no author restriction".
        value = author if author != "全部作者" else "all"
        f.write(f'<option value="{html.escape(str(value))}">{html.escape(str(author))}</option>\n')
    f.write("""
        </select>
    </div>

    <div class="mb-4">
        <label for="searchInput" class="form-label">搜尋書名：</label>
        <input type="text" class="form-control" id="searchInput" oninput="filterBooks()" placeholder="輸入書名關鍵字">
    </div>

    <div id="bookCards">
""")
    for _, row in latest_df.iterrows():
        isbn = row["ISBN"]
        price = row["價格"]
        # str() first: a missing CSV cell is read back as a float NaN.
        image = html.escape(str(row["封面照片"]))
        title = html.escape(str(row["書名"]))
        link = html.escape(str(row["連結"]))
        author = html.escape(str(row["作者"]))
        min_p = min_price.get(isbn, price)
        # Embed the price chart when one was generated for this ISBN.
        if isbn in isbn_to_plot_path:
            chart_src = html.escape(str(isbn_to_plot_path[isbn]))
            chart_html = f'<iframe src="{chart_src}" width="100%" height="300"></iframe>'
        else:
            chart_html = '<p class="text-muted">目前無歷史價格資料</p>'
        f.write(f"""
<div class="card mb-4" data-author="{author}">
  <div class="row g-0">
    <div class="col-md-3 d-flex align-items-center justify-content-center">
      <img src="{image}" class="img-fluid rounded-start" alt="封面" style="height: 200px; object-fit: contain;">
    </div>
    <div class="col-md-9">
      <div class="card-body">
        <h5 class="card-title"><a href="{link}" target="_blank" class="card-title-link">{title}</a></h5>
        <p class="card-text">本日價格：NT${price}　歷史低價：NT${min_p}</p>
        {chart_html}
      </div>
    </div>
  </div>
</div>
""")
    # Both controls call filterBooks() through their inline handlers. The
    # select previously referenced an undefined filterByAuthor(), which threw
    # a ReferenceError on every change.
    f.write("""
    </div>
</div>
<script>
function filterBooks() {
    const selectedAuthor = document.getElementById("authorSelect").value;
    const keyword = document.getElementById("searchInput").value.toLowerCase();

    document.querySelectorAll("#bookCards .card").forEach(card => {
        const author = card.dataset.author;
        const title = card.querySelector(".card-title").innerText.toLowerCase();
        const matchAuthor = (selectedAuthor === "all" || author === selectedAuthor);
        const matchTitle = title.includes(keyword);
        card.style.display = (matchAuthor && matchTitle) ? "" : "none";
    });
}
</script>

</body>
</html>
""")

#### 網頁呈現的效果如圖，利用***折線圖***可以輕鬆看出價格波動，也有簡單的篩選和搜尋功能。

<img src="https://github.com/GummyBear-w/Ebook_Price_Scraper/raw/main/demo.gif" width="85%">



## 問題與挑戰

- 技術挑戰  

    **動態內容載入：**

    Kobo 的書籍搜尋與詳細頁面內容多透過 JavaScript 動態產生，像是價格與封面圖片都在頁面載入後才被插入 DOM，因此無法使用 **requests** 等靜態爬蟲工具直接抓取。解決方式是改用 **Selenium** 模擬真實使用者行為，以正確載入並操作網頁內容，並透過等待元素出現、模擬分頁點擊與延遲操作等方式，繞過限制並穩定取得所需資料。
- 資料限制  
    目前僅追蹤三位作者的作品，爬取的書本數量較少。

## 結論


- 成功獲取感興趣的書籍價格資料，並自動化長期追蹤及視覺化呈現。
- 未來可擴展至其他電子書平台，進行跨平台比較分析。

## 參考文獻

* [Python 進階爬蟲](https://hackmd.io/@AndyChiang/DynamicCrawler)
* [動態網頁爬蟲-使用Selenium](https://hackmd.io/@aaronlife/python-topic-selenium)
* [【資料分析】Python爬蟲入門實作（下）](https://medium.com/@kaojia/%E8%B3%87%E6%96%99%E5%88%86%E6%9E%90-python%E7%88%AC%E8%9F%B2%E5%85%A5%E9%96%80%E5%AF%A6%E4%BD%9C-%E4%B8%8B-%E5%8B%95%E6%85%8B%E7%B6%B2%E9%A0%81%E7%88%AC%E8%9F%B2-%E5%8F%8D%E5%8F%8D%E7%88%AC%E8%9F%B2-json-%E6%A0%BC%E5%BC%8F-2170c88b0ec8)
* [樂天 Kobo Taiwan](https://kobo.com/tw)