In [9]:

from MySqlHelper import MySqlHelper
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import logging
from apscheduler.schedulers.blocking import BlockingScheduler
# Logging configuration: records at INFO level and above go to crawler.log,
# each formatted as "<time> <level>: <message>".
logging.basicConfig(
    filename='crawler.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s'
)

def init_db(helper: "MySqlHelper") -> None:
    """Create the ``hot_search`` table if it does not exist yet.

    Args:
        helper: Database helper whose ``execute`` method is called with the
            DDL statement (presumably already connected -- ``job`` calls
            ``connect()`` before passing it in).

    Note:
        ``CREATE TABLE IF NOT EXISTS`` leaves an existing table untouched, so
        a table created with the older, narrower ``url`` column has to be
        altered manually.
    """
    # url is VARCHAR(1024): the stored links carry the percent-encoded title,
    # which costs 9 characters per CJK character, so 200 is easily exceeded.
    sql = """
    CREATE TABLE IF NOT EXISTS hot_search (
    id INT AUTO_INCREMENT PRIMARY KEY,
    ranking INT NOT NULL,
    title VARCHAR(100) NOT NULL,
    url VARCHAR(1024) NOT NULL,
    ts DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """
    helper.execute(sql)

def fetch_hot_search():
    """Scrape the top 10 entries of the Baidu realtime hot-search board.

    Returns:
        list[tuple]: ``(rank, title, link, ts)`` tuples in page order.
        ``rank`` starts at 1, ``link`` is the entry URL (``''`` when the
        element has no ``href``), and ``ts`` is the local crawl time formatted
        as ``'%Y-%m-%d %H:%M:%S'`` -- identical for every tuple of one call.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://top.baidu.com/board?tab=realtime'
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    items = soup.select('.c-single-text-ellipsis')[:10]

    # Take the timestamp once so all rows of this crawl share the same value;
    # otherwise a batch could straddle a second boundary and be split apart
    # when rows are later grouped/ordered by ts.
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    data = []
    for rank, tag in enumerate(items, start=1):
        title = tag.get_text(strip=True)
        parent = tag.parent
        # A missing href becomes '' so the NOT NULL url column still accepts
        # the row instead of the whole batch failing on one bad entry.
        link = (parent.get('href') if parent is not None else None) or ''
        if link.startswith('//'):
            # Protocol-relative link: only the scheme is missing
            link = 'https:' + link
        elif link.startswith('/'):
            # Site-relative path: prepend the board's origin
            link = 'https://top.baidu.com' + link
        data.append((rank, title, link, ts))
    return data

def job():
    """Run one complete crawl-and-store cycle.

    Connects to MySQL, makes sure the ``hot_search`` table exists, scrapes the
    current hot-search list and inserts one row per entry.  The function is
    best-effort: any failure is logged (with traceback) rather than raised,
    and the connection is closed in every case.
    """
    # NOTE(review): the database credentials are hard-coded in source; they
    # should be moved to environment variables or a config file.
    helper = MySqlHelper(
        host='localhost',
        user='root',
        password='@Kevin021117',
        database='school'
    )
    try:
        helper.connect()
        init_db(helper)
        records = fetch_hot_search()
        for rec in records:
            helper.execute(
                "INSERT INTO hot_search(ranking, title, url, ts) VALUES (%s, %s, %s, %s)",
                rec
            )
        logging.info("成功插入 %d 条热搜数据", len(records))
    except Exception as e:
        # logging.exception keeps the traceback, which a plain error() drops
        logging.exception("爬虫任务出错：%s", e)
    finally:
        # close() may itself fail (e.g. when connect() never succeeded);
        # log that instead of letting it escape this best-effort job.
        try:
            helper.close()
        except Exception:
            logging.exception("关闭数据库连接失败")



def get_all_hot_search():
    """Read every stored hot-search row, newest crawl first, then by rank.

    Returns:
        Whatever ``MySqlHelper.query`` returns for the SELECT (``print_data``
        treats each row as a mapping with ``ranking``, ``title``, ``url`` and
        ``ts`` keys -- TODO confirm against MySqlHelper).

    Raises:
        Any exception raised by ``connect()`` or ``query()`` propagates to the
        caller; the connection is still closed when ``query()`` fails.
    """
    # NOTE(review): the database credentials are hard-coded in source; they
    # should be moved to environment variables or a config file.
    helper = MySqlHelper(
        host='localhost',
        user='root',
        password='@Kevin021117',
        database='school'
    )
    helper.connect()
    sql = """
    SELECT ranking, title, url, ts
    FROM hot_search
    ORDER BY ts DESC, ranking ASC;
    """
    try:
        return helper.query(sql)
    finally:
        # Release the connection even if the query raises
        helper.close()

def print_data(rows):
    """Print hot-search rows to stdout as a fixed-width table.

    Args:
        rows: Sequence of mappings with ``ts``, ``ranking`` and ``title``
            keys and an optional ``url`` key.  ``ts`` may be a ``datetime``
            (formatted as ``'%Y-%m-%d %H:%M:%S'``) or an already formatted
            value.  An empty or ``None`` ``rows`` prints a short notice.
    """
    if not rows:
        print("no data found.")
        return

    separator = "-" * 80
    print(f"\nFetched {len(rows)} records:")
    print(separator)
    print(f"{'Time':<20}{'Rank':<6}{'Title':<40}")
    print(separator)

    for row in rows:
        stamp = row['ts']
        if isinstance(stamp, datetime):
            stamp = stamp.strftime('%Y-%m-%d %H:%M:%S')
        print(f"{stamp:<20}{row['ranking']:<6}{row['title']:<40}")

        # The URL goes on its own indented line, and only when present
        link = row.get('url')
        if link:
            print(f" URL: {link}")

    print(separator)

if __name__ == '__main__':
    # 1) Run the crawler once and store the results in the database
    job()

    # 2) Then read everything back from the database and print it
    data = get_all_hot_search()
    print_data(data)



Fetched 70 records:
--------------------------------------------------------------------------------
Time                Rank  Title                                   
--------------------------------------------------------------------------------
2025-06-11 23:01:41 1     总书记的“家园”之喻                              
 URL: https://www.baidu.com/s?wd=%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%84%E2%80%9C%E5%AE%B6%E5%9B%AD%E2%80%9D%E4%B9%8B%E5%96%BB&sa=fyb_news&rsv_dl=fyb_news
2025-06-11 23:01:41 2     洛杉矶已面目全非                                
 URL: https://www.baidu.com/s?wd=%E6%B4%9B%E6%9D%89%E7%9F%B6%E5%B7%B2%E9%9D%A2%E7%9B%AE%E5%85%A8%E9%9D%9E&sa=fyb_news&rsv_dl=fyb_news
2025-06-11 23:01:41 3     高考结束挑行李回家的女生发声                          
 URL: https://www.baidu.com/s?wd=%E9%AB%98%E8%80%83%E7%BB%93%E6%9D%9F%E6%8C%91%E8%A1%8C%E6%9D%8E%E5%9B%9E%E5%AE%B6%E7%9A%84%E5%A5%B3%E7%94%9F%E5%8F%91%E5%A3%B0&sa=fyb_news&rsv_dl=fyb_news
2025-06-11 23:01:41 4     听听青年的网络文明“关键词”                          
 URL: ht