In [1]:
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


In [2]:

# --- Crawl configuration -----------------------------------------------------
BASE_URL = "https://ysjt.ustc.edu.cn"
START_PAGE = 1
MAX_PAGE = 393  # current highest listing page (hard-coded here; could be auto-detected)
OUTPUT_FILE = "vegetable_price_links.txt"

# HTTP headers sent with each listing-page request.
headers = {"User-Agent": "Mozilla/5.0"}

def fetch_page(page_num):
    """Download one listing page and return its HTML text.

    Page 1 is served from ``/wjxx/list.htm``; every other page from
    ``/wjxx/list{page_num}.htm``.

    Args:
        page_num: 1-based index of the listing page.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response status.
    """
    page_suffix = "" if page_num == 1 else str(page_num)
    url = f"{BASE_URL}/wjxx/list{page_suffix}.htm"
    print(f"Fetching: {url}")
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx so an error page is never parsed as a listing.
    res.raise_for_status()
    # Decode with the charset detected from the body rather than the declared one.
    res.encoding = res.apparent_encoding
    return res.text

def parse_page(html):
    """Extract vegetable-price announcements from one listing page.

    Looks at every ``<li class="cle">`` element that contains both a link and
    a ``<div class="time fr">`` element, and keeps the entries whose link text
    contains "蔬菜价格信息".

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of ``(title, date, full_link)`` tuples in document order.
        Entries whose anchor has no ``href`` are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for li in soup.find_all("li", class_="cle"):
        title_tag = li.find("a")
        date_tag = li.find("div", class_="time fr")
        if not (title_tag and date_tag):
            continue
        title = title_tag.get_text(strip=True)
        if "蔬菜价格信息" not in title:
            continue
        href = title_tag.get("href")
        if not href:
            # An anchor without a target cannot be crawled later; skip it
            # instead of raising KeyError and losing the rest of the page.
            continue
        # urljoin resolves root-relative, document-relative and absolute hrefs
        # correctly; plain concatenation only works for root-relative ones.
        full_link = urljoin(BASE_URL, href)
        date = date_tag.get_text(strip=True)
        items.append((title, date, full_link))
    return items

def crawl_all_pages():
    """Crawl listing pages START_PAGE..MAX_PAGE and collect matching items.

    Each page is fetched with ``fetch_page`` and parsed with ``parse_page``.
    A page that fails is reported on stdout and skipped; the crawl continues
    with the next page.

    Returns:
        A list of ``(title, date, full_link)`` tuples from all pages that
        were fetched and parsed successfully, in crawl order.
    """
    all_items = []
    for page_num in range(START_PAGE, MAX_PAGE + 1):
        try:
            html = fetch_page(page_num)
            all_items.extend(parse_page(html))
        except Exception as e:
            # Best-effort crawl: one bad page must not abort the whole run.
            print(f"Error on page {page_num}: {e}")
        # Polite delay between requests, applied after failures as well so a
        # struggling server is not hit again immediately.
        time.sleep(0.5)
    return all_items

def save_to_file(items, filename):
    """Write items to a UTF-8 text file, one ``title | date | link`` line each.

    Args:
        items: Sized iterable of ``(title, date, link)`` triples.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            f"{title} | {date} | {link}\n" for title, date, link in items
        )
    print(f"Saved {len(items)} items to {filename}")



In [3]:
# Crawl the listing pages, then write the collected (title, date, link)
# items to OUTPUT_FILE.
all_items = crawl_all_pages()
save_to_file(all_items, OUTPUT_FILE)

Fetching: https://ysjt.ustc.edu.cn/wjxx/list.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list2.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list3.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list4.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list5.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list6.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list7.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list8.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list9.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list10.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list11.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list12.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list13.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list14.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list15.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list16.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list17.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list18.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list19.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/l

In [31]:
import requests
import re
import json
from bs4 import BeautifulSoup
from tqdm import tqdm

def load_links(txt_path, max_count=None):
    """Read ``title | date | url`` lines and return issue links, newest first.

    Only lines with three ``|``-separated fields whose title contains an issue
    number of the form ``第<digits>期`` are kept.

    Args:
        txt_path: Path to the UTF-8 text file of links.
        max_count: Maximum number of links to return; ``None`` returns all.
            ``0`` returns an empty list.

    Returns:
        A list of ``(period, url)`` tuples, where ``period`` is the issue
        number as a string, sorted by numeric issue number in descending order.
    """
    links = []
    with open(txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split from the right: the date and URL are the last two fields,
            # so a '|' inside the title no longer causes the line to be dropped.
            parts = line.strip().rsplit('|', 2)
            if len(parts) != 3:
                continue
            title = parts[0].strip()
            url = parts[2].strip()
            match = re.search(r'第(\d+)期', title)
            if match:
                links.append((match.group(1), url))
    # Sort by issue number, descending, so the newest issues are crawled first.
    links.sort(key=lambda x: int(x[0]), reverse=True)
    # Compare against None explicitly so that max_count=0 means "no links"
    # rather than silently meaning "all links".
    return links if max_count is None else links[:max_count]

def extract_date_and_prices_from_page(html):
    """Pull the publication date and a name -> price mapping out of a page.

    The whole page is flattened to text, known boilerplate (the issue title
    followed by the date, the price-unit note, and the column headers) is
    removed, and the remainder is scanned for "<name><number>" pairs.

    Args:
        html: HTML text of one price-bulletin page.

    Returns:
        A ``(date_str, price_dict)`` tuple. ``date_str`` is the first
        ``YYYY年M月D日`` date found in the page text, or ``None`` if there is
        none. ``price_dict`` maps each extracted name (at most 7 characters)
        to its price as a float; when a name occurs more than once the last
        occurrence wins.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Work on the text of the entire page, with no separator between nodes.
    content = soup.get_text(separator='', strip=True)

    # First date of the form XXXX年X月X日.
    date_match = re.search(r'(\d{4}年\d{1,2}月\d{1,2}日)', content)
    date_str = date_match.group(1) if date_match else None

    # Strip boilerplate before looking for prices. The date is escaped so it
    # is always matched literally inside the pattern.
    content_cleaned = re.sub(
        r'蔬菜价格信息（第\d+期）\s*' + re.escape(date_str or ''), '', content
    )
    content_cleaned = re.sub(r'价格单位[：:]\s*元/斤', '', content_cleaned)
    content_cleaned = re.sub(r'(名称)?单价', '', content_cleaned)

    # Lazily match a short run of CJK/Latin/parenthesis characters followed
    # by an integer or decimal number.
    pairs = re.findall(
        r'([\u4e00-\u9fa5A-Za-z（）()]{1,10}?)(\d+(?:\.\d+)?)', content_cleaned
    )

    price_dict = {}
    for name, price in pairs:
        name = name.strip()
        if name and len(name) <= 7:
            try:
                price_dict[name] = float(price)
            except ValueError:
                # Only a failed number conversion is skipped; anything else
                # (e.g. KeyboardInterrupt) must propagate.
                continue

    return date_str, price_dict


def crawl_from_file(txt_path, max_count=None):
    """Fetch every linked bulletin page and extract its date and prices.

    Args:
        txt_path: Path to the links file read by ``load_links``.
        max_count: Passed through to ``load_links`` to limit how many links
            are crawled; ``None`` crawls all of them.

    Returns:
        A dict mapping each issue number (string) to the
        ``(date_str, price_dict)`` tuple returned by
        ``extract_date_and_prices_from_page``. Pages that fail to download
        or parse are reported on stdout and omitted. If several links share
        an issue number, the one processed last overwrites the earlier ones.
    """
    links = load_links(txt_path, max_count)
    all_data = {}
    for period, url in tqdm(links, desc="📦 正在爬取"):
        try:
            resp = requests.get(url, timeout=10)
            # Reject 4xx/5xx responses so an error page is not parsed and
            # stored as if it were price data.
            resp.raise_for_status()
            resp.encoding = 'utf-8'
            all_data[period] = extract_date_and_prices_from_page(resp.text)
        except Exception as e:
            # Best-effort crawl: report the failed issue and keep going.
            print(f"⚠️ 错误: 第{period}期 - {e}")
    return all_data

# Example run: crawl the issues listed in the links file and dump the result as JSON.
if __name__ == "__main__":
    max_issues = None  # change this to limit the number of issues, e.g. 10, 50, or None (all)
    result = crawl_from_file("vegetable_price_links.txt", max_count=max_issues)
    with open("vegetable_prices.json", "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"✅ 共保存 {len(result)} 期数据至 vegetable_prices.json")


📦 正在爬取:   0%|          | 0/4725 [00:00<?, ?it/s]

📦 正在爬取:  16%|█▌        | 744/4725 [01:46<41:07,  1.61it/s]  

⚠️ 错误: 第6326期 - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Max retries exceeded with url: /2023/0222/c18038a592842/page.htm (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))


📦 正在爬取:  20%|██        | 961/4725 [02:31<1:21:05,  1.29s/it]

⚠️ 错误: 第6086期 - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Read timed out. (read timeout=10)


📦 正在爬取:  35%|███▍      | 1643/4725 [03:46<31:21,  1.64it/s] 

⚠️ 错误: 第5296期 - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Max retries exceeded with url: /2020/0110/c18038a411560/page.htm (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))


📦 正在爬取:  45%|████▌     | 2146/4725 [05:01<04:36,  9.34it/s]

⚠️ 错误: 第4666期 - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Max retries exceeded with url: /2018/0531/c18038a383510/page.htm (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))


📦 正在爬取: 100%|██████████| 4725/4725 [10:20<00:00,  7.62it/s]


✅ 共保存 4629 期数据至 vegetable_prices.json


In [32]:
import json
import pandas as pd

# Load the scraped data: each issue number maps to a [date, {name: price}] pair.
with open('vegetable_prices.json', 'r', encoding='utf-8') as file:
    vegetable_data = json.load(file)

# Flatten into one record per (issue, vegetable); the date is repeated on
# every row of its issue.
data = [
    {
        'period': period,
        'vegetable': vegetable,
        'price': price,
        'date': details[0],
    }
    for period, details in vegetable_data.items()
    for vegetable, price in details[1].items()
]

# Build the long-format table from the flattened records.
df = pd.DataFrame(data)

# Show the first rows as a sanity check.
print(df.head())

# Persist the table as CSV without the index column.
df.to_csv('vegetable_prices.csv', index=False)


  period vegetable  price        date
0   7149        韭菜    1.8  2025年3月31日
1   7149       四月蔓    1.0  2025年3月31日
2   7149        瓠子    2.3  2025年3月31日
3   7149      铁棍山药    3.5  2025年3月31日
4   7149       水芹菜    3.5  2025年3月31日
