In [1]:
import requests
from bs4 import BeautifulSoup
import time


In [2]:

# Crawl configuration shared by the listing-page functions below.
BASE_URL = "https://ysjt.ustc.edu.cn"
START_PAGE = 1
MAX_PAGE = 393  # Current maximum page count (could be auto-detected; hard-coded here)
OUTPUT_FILE = "vegetable_price_links.txt"

# HTTP headers sent with each listing-page request.
headers = {
    "User-Agent": "Mozilla/5.0"
}

def fetch_page(page_num):
    """Download one listing page and return its HTML as text.

    Page 1 is served from ``/wjxx/list.htm``; every other page from
    ``/wjxx/list{page_num}.htm``.

    Args:
        page_num: Index of the listing page to fetch.

    Returns:
        The response body decoded with the encoding detected from its content.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    if page_num == 1:
        url = f"{BASE_URL}/wjxx/list.htm"
    else:
        url = f"{BASE_URL}/wjxx/list{page_num}.htm"
    print(f"Fetching: {url}")
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of handing an error page to the parser.
    res.raise_for_status()
    # Decode with the charset detected from the body rather than the declared one.
    res.encoding = res.apparent_encoding
    return res.text

def parse_page(html):
    """Extract the matching announcements from one listing page.

    Scans every ``<li class="cle">`` element, keeps those whose first anchor
    text contains the target title keyword and that also carry a
    ``<div class="time fr">`` date element.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of ``(title, date, full_link)`` tuples in document order.
    """
    # Local import so this definition does not depend on another cell's imports.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for li in soup.find_all("li", class_="cle"):
        title_tag = li.find("a")
        date_tag = li.find("div", class_="time fr")
        if not (title_tag and date_tag):
            continue
        title = title_tag.get_text(strip=True)
        if "Ëî¨Ëèú‰ª∑Ê†º‰ø°ÊÅØ" not in title:
            continue
        href = title_tag.get('href')
        if not href:
            # An anchor without a target cannot be crawled later; skip it.
            continue
        # urljoin handles root-relative, relative and already-absolute hrefs alike.
        full_link = urljoin(BASE_URL, href)
        date = date_tag.get_text(strip=True)
        items.append((title, date, full_link))
    return items

def crawl_all_pages():
    """Crawl listing pages START_PAGE..MAX_PAGE and collect matching items.

    A failure on one page is reported and the crawl continues with the next
    page, so the result may be incomplete.

    Returns:
        A list of ``(title, date, full_link)`` tuples gathered from all pages
        that were fetched and parsed successfully.
    """
    all_items = []
    for page_num in range(START_PAGE, MAX_PAGE + 1):
        try:
            all_items.extend(parse_page(fetch_page(page_num)))
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Error on page {page_num}: {e}")
        finally:
            # Polite delay between requests, applied after failures as well.
            time.sleep(0.5)
    return all_items

def save_to_file(items, filename):
    """Write items to a UTF-8 text file, one ``title | date | link`` row per line.

    Args:
        items: Sized iterable of ``(title, date, link)`` triples.
        filename: Path of the file to (over)write.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            f"{title} | {date} | {link}\n" for title, date, link in items
        )
    count = len(items)
    print(f"Saved {count} items to {filename}")



In [3]:
# Crawl the listing pages, then write the collected (title, date, link) rows to OUTPUT_FILE.
all_items = crawl_all_pages()
save_to_file(all_items, OUTPUT_FILE)

Fetching: https://ysjt.ustc.edu.cn/wjxx/list.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list2.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list3.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list4.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list5.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list6.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list7.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list8.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list9.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list10.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list11.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list12.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list13.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list14.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list15.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list16.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list17.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list18.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/list19.htm
Fetching: https://ysjt.ustc.edu.cn/wjxx/l

In [31]:
import requests
import re
import json
from bs4 import BeautifulSoup
from tqdm import tqdm

def load_links(txt_path, max_count=None):
    """Read ``title | date | link`` rows and return ``(period, url)`` pairs.

    Only rows whose title contains an issue number (matched by the regex
    below) are kept. The result is sorted by issue number, newest first.

    Args:
        txt_path: Path to the UTF-8 text file of pipe-separated rows.
        max_count: Maximum number of pairs to return; ``None`` returns all.

    Returns:
        A list of ``(period, url)`` tuples where ``period`` is the issue
        number as a string of digits.
    """
    links = []
    with open(txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split from the right so a '|' inside the title does not drop the row.
            parts = line.strip().rsplit('|', 2)
            if len(parts) != 3:
                continue
            title = parts[0].strip()
            url = parts[2].strip()
            match = re.search(r'Á¨¨(\d+)Êúü', title)
            if match:
                links.append((match.group(1), url))
    # Sort by issue number, descending, so crawling starts from the newest issue.
    links.sort(key=lambda x: int(x[0]), reverse=True)
    # Compare against None explicitly so max_count=0 yields an empty list.
    return links if max_count is None else links[:max_count]

def extract_date_and_prices_from_page(html):
    """Pull the publication date and a name -> price mapping out of one page.

    The page is flattened to plain text, known header fragments are removed,
    and every run of up to 10 name characters immediately followed by a
    number is treated as a ``(name, price)`` pair. Names longer than 7
    characters are discarded. If a name occurs more than once, the last
    price wins.

    Args:
        html: HTML text of a single issue page.

    Returns:
        ``(date_str, price_dict)`` where ``date_str`` is the first date found
        in the text (or ``None``) and ``price_dict`` maps names to floats.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Flatten the whole page into one string with no separators.
    content = soup.get_text(separator='', strip=True)

    # First date in the text: 4-digit year, then month and day, each followed by its unit marker.
    date_match = re.search(r'(\d{4}Âπ¥\d{1,2}Êúà\d{1,2}Êó•)', content)
    date_str = date_match.group(1) if date_match else None

    # Strip the title/date header and the unit/column labels before matching prices.
    # The date is escaped so it is always matched literally inside the pattern.
    header_pattern = r'Ëî¨Ëèú‰ª∑Ê†º‰ø°ÊÅØÔºàÁ¨¨\d+ÊúüÔºâ\s*' + re.escape(date_str or '')
    content_cleaned = re.sub(header_pattern, '', content)
    content_cleaned = re.sub(r'‰ª∑Ê†ºÂçï‰Ωç[Ôºö:]\s*ÂÖÉ/Êñ§', '', content_cleaned)
    content_cleaned = re.sub(r'(ÂêçÁß∞)?Âçï‰ª∑', '', content_cleaned)

    pattern = re.findall(r'([\u4e00-\u9fa5A-Za-zÔºàÔºâ()]{1,10}?)(\d+(?:\.\d+)?)', content_cleaned)

    price_dict = {}
    for name, price in pattern:
        name = name.strip()
        if name and len(name) <= 7:
            try:
                price_dict[name] = float(price)
            except ValueError:
                # Only a malformed number is skipped; other errors propagate.
                continue

    return date_str, price_dict


def crawl_from_file(txt_path, max_count=None):
    """Fetch every linked issue page and extract its date and prices.

    Links are loaded with ``load_links``. A failure on one issue is printed
    and that issue is left out of the result. If two links share the same
    period, the later one overwrites the earlier.

    Args:
        txt_path: Path to the link file read by ``load_links``.
        max_count: Forwarded to ``load_links`` to cap the number of issues.

    Returns:
        A dict mapping period to the ``(date_str, price_dict)`` tuple
        returned by ``extract_date_and_prices_from_page``.
    """
    links = load_links(txt_path, max_count)
    all_data = {}
    # One session for the whole crawl so connections to the host are reused.
    with requests.Session() as session:
        for period, url in tqdm(links, desc="üì¶ Ê≠£Âú®Áà¨Âèñ"):
            try:
                resp = session.get(url, timeout=10)
                # Treat 4xx/5xx as a failure instead of parsing an error page as data.
                resp.raise_for_status()
                resp.encoding = 'utf-8'
                all_data[period] = extract_date_and_prices_from_page(resp.text)
            except Exception as e:
                print(f"‚ö†Ô∏è ÈîôËØØ: Á¨¨{period}Êúü - {e}")
    return all_data

# Example run
# Driver: crawl the issue pages listed in the link file and dump the result as JSON.
if __name__ == "__main__":
    max_issues = None  # Change this to cap the number of issues, e.g. 10 or 50; None means all
    result = crawl_from_file("vegetable_price_links.txt", max_count=max_issues)
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    with open("vegetable_prices.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"‚úÖ ÂÖ±‰øùÂ≠ò {len(result)} ÊúüÊï∞ÊçÆËá≥ vegetable_prices.json")


üì¶ Ê≠£Âú®Áà¨Âèñ:   0%|          | 0/4725 [00:00<?, ?it/s]

üì¶ Ê≠£Âú®Áà¨Âèñ:  16%|‚ñà‚ñå        | 744/4725 [01:46<41:07,  1.61it/s]  

‚ö†Ô∏è ÈîôËØØ: Á¨¨6326Êúü - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Max retries exceeded with url: /2023/0222/c18038a592842/page.htm (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))


üì¶ Ê≠£Âú®Áà¨Âèñ:  20%|‚ñà‚ñà        | 961/4725 [02:31<1:21:05,  1.29s/it]

‚ö†Ô∏è ÈîôËØØ: Á¨¨6086Êúü - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Read timed out. (read timeout=10)


üì¶ Ê≠£Âú®Áà¨Âèñ:  35%|‚ñà‚ñà‚ñà‚ñç      | 1643/4725 [03:46<31:21,  1.64it/s] 

‚ö†Ô∏è ÈîôËØØ: Á¨¨5296Êúü - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Max retries exceeded with url: /2020/0110/c18038a411560/page.htm (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))


üì¶ Ê≠£Âú®Áà¨Âèñ:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 2146/4725 [05:01<04:36,  9.34it/s]

‚ö†Ô∏è ÈîôËØØ: Á¨¨4666Êúü - HTTPSConnectionPool(host='ysjt.ustc.edu.cn', port=443): Max retries exceeded with url: /2018/0531/c18038a383510/page.htm (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))


üì¶ Ê≠£Âú®Áà¨Âèñ: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4725/4725 [10:20<00:00,  7.62it/s]


‚úÖ ÂÖ±‰øùÂ≠ò 4629 ÊúüÊï∞ÊçÆËá≥ vegetable_prices.json


In [32]:
import json
import pandas as pd

# Load the scraped data; each value is a two-element list: [date, {vegetable: price}].
with open('vegetable_prices.json', 'r', encoding='utf-8') as file:
    vegetable_data = json.load(file)

# Flatten into one record per (issue, vegetable) pair, keeping the issue date on every row.
data = [
    {
        'period': period,
        'vegetable': vegetable,
        'price': price,
        'date': details[0],
    }
    for period, details in vegetable_data.items()
    for vegetable, price in details[1].items()
]

# Build the long-format table from the records.
df = pd.DataFrame(data)

# Show the first rows as a sanity check.
print(df.head())

# Persist the long-format table as CSV.
df.to_csv('vegetable_prices.csv', index=False)


  period vegetable  price        date
0   7149        Èü≠Ëèú    1.8  2025Âπ¥3Êúà31Êó•
1   7149       ÂõõÊúàËîì    1.0  2025Âπ¥3Êúà31Êó•
2   7149        Áì†Â≠ê    2.3  2025Âπ¥3Êúà31Êó•
3   7149      ÈìÅÊ£çÂ±±ËçØ    3.5  2025Âπ¥3Êúà31Êó•
4   7149       Ê∞¥ËäπËèú    3.5  2025Âπ¥3Êúà31Êó•
