In [13]:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime
import re
from tqdm import tqdm
import sqlite3
import os

DB_NAME = 'ptt_baseball.db'


def setup_database():
    """初始化資料庫和資料表"""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()

    cursor.execute('''
        CREATE TABLE IF NOT EXISTS articles (
            article_id TEXT PRIMARY KEY,
            title TEXT,
            author TEXT,
            post_time TIMESTAMP,
            board TEXT,
            url TEXT UNIQUE
        )
    ''')

    cursor.execute('''
        CREATE TABLE IF NOT EXISTS comments (
            comment_id INTEGER PRIMARY KEY AUTOINCREMENT,
            article_id TEXT,
            push_tag TEXT,
            user_id TEXT,
            content TEXT,
            push_time TIMESTAMP,
            FOREIGN KEY(article_id) REFERENCES articles(article_id)
        )
    ''')

    cursor.execute('''
        CREATE TABLE IF NOT EXISTS metadata (
            key TEXT PRIMARY KEY,
            value TEXT
        )
    ''')

    conn.commit()
    conn.close()
    print(f"資料庫 '{DB_NAME}' 初始化完成。")


def save_progress(cursor, key, value):
    """儲存進度到 metadata 資料表"""
    cursor.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)", (key, value))


def get_progress(cursor):
    """從資料庫獲取上次的進度"""
    cursor.execute("SELECT value FROM metadata WHERE key = 'last_page_url'")
    last_page_result = cursor.fetchone()
    last_page_url = last_page_result[0] if last_page_result else None

    cursor.execute("SELECT url FROM articles ORDER BY post_time DESC LIMIT 1")
    latest_article_result = cursor.fetchone()
    latest_article_url = latest_article_result[0] if latest_article_result else None

    return last_page_url, latest_article_url


def is_article_exists(cursor, article_url):
    """快速檢查文章是否已存在資料庫"""
    cursor.execute("SELECT 1 FROM articles WHERE url = ? LIMIT 1", (article_url,))
    return cursor.fetchone() is not None


def parse_push_time(datetime_text, article_year):
    """解析留言時間"""
    if not datetime_text:
        return None
    match = re.search(r'(\d{2}/\d{2})\s+(\d{2}:\d{2})', datetime_text)
    if not match:
        return None
    date_part, time_part = match.group(1), match.group(2)
    try:
        return datetime.strptime(f"{article_year}/{date_part} {time_part}", "%Y/%m/%d %H:%M")
    except ValueError:
        return None


def should_skip_article(title):
    """判斷是否要跳過文章（LIVE 或公告）"""
    title_lower = title.lower()

    # 跳過 LIVE 文
    if '[live]' in title_lower:
        return True, "LIVE"

    # 跳過公告（常見格式）
    if any(keyword in title_lower for keyword in ['[公告]', '[協尋]', '[ 公告 ]']):
        return True, "公告"

    # 跳過置底文（通常是公告）
    if title.startswith('□'):
        return True, "置底"

    return False, None


def get_article_data(session, article_url):
    """獲取單篇文章的完整資訊"""
    try:
        response = session.get(article_url, timeout=15)
        response.raise_for_status()
        if '404 - Not Found.' in response.text:
            return None, []
    except requests.exceptions.RequestException:
        return None, []

    soup = BeautifulSoup(response.text, 'html.parser')
    metaline_divs = soup.find_all('div', class_='article-metaline')
    meta = {div.find('span', class_='article-meta-tag').text.strip():
                div.find('span', class_='article-meta-value').text.strip()
            for div in metaline_divs}

    try:
        post_time = datetime.strptime(meta.get('時間', ''), '%a %b %d %H:%M:%S %Y')
        author = meta.get('作者', 'N/A').split(' ')[0]
        title = meta.get('標題', 'N/A')
        board = meta.get('看板', 'N/A')
    except (ValueError, TypeError):
        return None, []

    match = re.search(r'/(M\.\d+\.A\.[A-Z0-9_]+)\.html', article_url)
    article_id = match.group(1) if match else "未知ID"

    article_info = {
        'article_id': article_id,
        'title': title,
        'author': author,
        'post_time': post_time,
        'board': board,
        'url': article_url
    }

    comments = []
    for push in soup.find_all('div', class_='push'):
        user_id_span = push.find('span', 'push-userid')
        content_span = push.find('span', 'push-content')
        if not all([user_id_span, content_span]):
            continue

        push_tag_span = push.find('span', class_='push-tag')
        push_tag = push_tag_span.text.strip() if push_tag_span else '→'

        push_time = None
        ipdatetime_span = push.find('span', 'push-ipdatetime')
        if ipdatetime_span:
            push_time = parse_push_time(ipdatetime_span.text.strip(), post_time.year)

        comments.append({
            'article_id': article_id,
            'push_tag': push_tag,
            'user_id': user_id_span.text.strip(),
            'content': content_span.text.strip(': ').strip(),
            'push_time': push_time
        })

    return article_info, comments


if __name__ == "__main__":
    BASE_URL = 'https://www.ptt.cc'
    BOARD = 'Baseball'
    START_DATE = datetime(2020, 1, 1)

    if not os.path.exists(DB_NAME):
        setup_database()

    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()

    session = requests.Session()
    session.post(f'{BASE_URL}/ask/over18', data={'yes': 'yes'})

    last_page_url, latest_crawled_url = get_progress(cursor)

    start_url = f'{BASE_URL}/bbs/{BOARD}/index.html'
    if last_page_url:
        print(f"從上次進度繼續：{last_page_url}")
        start_url = last_page_url
    elif latest_crawled_url:
        print(f"偵測到已有資料，將快速跳過已爬取文章")

    current_url = start_url
    stop_scraping = False
    articles_processed = 0
    articles_skipped = 0
    articles_filtered = 0

    print(f"開始爬取 PTT 棒球版，目標日期：{START_DATE.strftime('%Y-%m-%d')} 至今")

    try:
        while not stop_scraping and current_url:
            print(f"\n處理列表頁: {current_url}")
            try:
                response = session.get(current_url, timeout=15)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"無法讀取列表頁，跳過。錯誤: {e}")
                time.sleep(5)
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            save_progress(cursor, 'last_page_url', current_url)
            conn.commit()

            articles_on_page = soup.find_all('div', class_='r-ent')
            new_articles_on_page = 0

            for article_div in tqdm(articles_on_page, desc="掃描文章", leave=False):
                title_link = article_div.find('a')
                if not title_link:
                    continue

                article_title = title_link.text
                article_url = f"{BASE_URL}{title_link['href']}"

                # 快速檢查：文章是否已存在（避免重複爬取）
                if is_article_exists(cursor, article_url):
                    articles_skipped += 1
                    continue

                # 過濾 LIVE 和公告
                should_skip, skip_reason = should_skip_article(article_title)
                if should_skip:
                    articles_filtered += 1
                    tqdm.write(f"跳過 [{skip_reason}]: {article_title[:30]}...")
                    continue

                # 爬取文章
                article_info, comments = get_article_data(session, article_url)
                time.sleep(0.05)

                if article_info is None:
                    continue

                if article_info['post_time'] < START_DATE:
                    stop_scraping = True
                    print(f"\n文章時間 ({article_info['post_time']}) 早於目標日期，停止爬取")
                    break

                # 寫入資料庫
                try:
                    cursor.execute(
                        "INSERT OR IGNORE INTO articles (article_id, title, author, post_time, board, url) VALUES (?, ?, ?, ?, ?, ?)",
                        (article_info['article_id'], article_info['title'], article_info['author'],
                         article_info['post_time'], article_info['board'], article_info['url'])
                    )
                    if comments:
                        cursor.executemany(
                            "INSERT INTO comments (article_id, push_tag, user_id, content, push_time) VALUES (?, ?, ?, ?, ?)",
                            [(c['article_id'], c['push_tag'], c['user_id'], c['content'], c['push_time'])
                             for c in comments]
                        )
                    articles_processed += 1
                    new_articles_on_page += 1
                except sqlite3.Error as e:
                    print(f"資料庫寫入錯誤: {e}")

                # 每 50 篇提交一次
                if articles_processed % 50 == 0:
                    conn.commit()
                    save_progress(cursor, 'last_page_url', current_url)
                    conn.commit()
                    tqdm.write(f"已處理 {articles_processed} 篇新文章 (跳過 {articles_skipped} 篇已爬, 過濾 {articles_filtered} 篇)")

            # 如果整頁都是已爬過的文章，加速翻頁
            if new_articles_on_page == 0 and articles_skipped > 0:
                print(f"本頁全是已爬文章，快速跳過")

            # 翻到上一頁
            prev_page_link = soup.find('a', class_='btn wide', string='‹ 上頁')
            current_url = f"{BASE_URL}{prev_page_link['href']}" if prev_page_link else None

    finally:
        if current_url:
            save_progress(cursor, 'last_page_url', current_url)
        conn.commit()
        conn.close()
        print(f"\n爬取結束")
        print(f"  新增文章：{articles_processed} 篇")
        print(f"  跳過已爬：{articles_skipped} 篇")
        print(f"  過濾文章：{articles_filtered} 篇 (LIVE/公告)")

從上次進度繼續：https://www.ptt.cc/bbs/Baseball/index3198.html
開始爬取 PTT 棒球版，目標日期：2020-01-01 至今

處理列表頁: https://www.ptt.cc/bbs/Baseball/index3198.html


                                                                                                                               


文章時間 (2019-12-31 20:08:38) 早於目標日期，停止爬取

爬取結束
  新增文章：0 篇
  跳過已爬：0 篇
  過濾文章：0 篇 (LIVE/公告)




In [2]:
!nvidia-smi

Thu Oct  2 12:15:01 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090         Off| 00000000:01:00.0 Off |                  Off |
|  0%   36C    P8               14W / 450W|      2MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:          125Gi        24Gi       8.9Gi       1.1Gi        92Gi        99Gi
Swap:         8.0Gi       173Mi       7.8Gi
