In [1]:
import requests 
from lxml import etree
import pandas as pd

In [2]:
def conn_web(url):
    """Download ``url`` and return the response body as UTF-8 decoded text.

    The request is sent with a desktop-browser ``user-agent`` header and a
    10 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw response bytes decoded as UTF-8.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for an HTTP error status.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    browser_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
    resp = requests.get(url, headers={"user-agent": browser_ua}, timeout=10)
    # Fail loudly on HTTP error statuses rather than parsing an error page.
    resp.raise_for_status()
    return resp.content.decode('UTF-8')

In [3]:
def xpath_anchoring(content, data_list):
    """Extract the post entries and the paging link from one index page.

    Every ``div.r-ent`` directly under the list container contributes one dict
    with the keys ``title``, ``page_link``, ``author`` and ``date``. A field is
    ``None`` when its XPath query matches nothing.

    Args:
        content: HTML text of the index page.
        data_list: List that receives the post dicts; it is appended to in place.

    Returns:
        A ``(data_list, url)`` tuple. ``url`` is the absolute URL built from the
        ``href`` of the second anchor in the paging button group (presumably the
        "previous page" button -- confirm against the site markup), or ``None``
        when that anchor has no ``href``.
    """
    def first_or_none(matches):
        # XPath queries return lists; an empty list means "no match".
        return matches[0] if matches else None

    tree = etree.HTML(content)

    entries = tree.xpath("//div[@class='r-list-container action-bar-margin bbs-screen']/div[@class='r-ent']")
    paging_href = tree.xpath("//div[@class = 'btn-group btn-group-paging']/a[2]/@href")
    if paging_href:
        paging_url = f"https://www.ptt.cc{paging_href[0]}"
    else:
        paging_url = None

    for entry in entries:
        heading = first_or_none(entry.xpath("./div[@class='title']/a/text()"))
        href = entry.xpath("./div[@class='title']/a/@href")
        writer = first_or_none(entry.xpath("./div[@class='meta']/div[@class='author']/text()"))
        posted = first_or_none(entry.xpath("./div[@class='meta']/div[@class='date']/text()"))

        data_list.append({
            "title": heading,
            # hrefs are site-relative, so prefix the host to get a full URL.
            "page_link": f"https://www.ptt.cc{href[0]}" if href else None,
            "author": writer,
            "date": posted,
        })

    return data_list, paging_url

In [4]:
def public_static_pages_scraping(next_url, page_num):
    """Scrape up to ``page_num`` index pages, starting at ``next_url``.

    Each page is downloaded with ``conn_web`` and parsed with
    ``xpath_anchoring``, which also supplies the URL to visit next.

    Args:
        next_url: URL of the first index page to fetch.
        page_num: Maximum number of fetch attempts; failed attempts count too.

    Returns:
        The list of post dicts collected. Pages that could not be fetched
        contribute nothing, and posts gathered before a failure are kept.
    """
    data_list = []

    for i in range(page_num):
        try:
            content = conn_web(next_url)
        except requests.RequestException as exc:
            # conn_web raises on timeouts and HTTP error statuses rather than
            # returning None; without this handler a single bad page would
            # abort the run and discard everything collected so far.
            # next_url is left unchanged, so the next iteration retries it.
            print(f"Skipping page {i} due to connection error: {exc}")
            continue
        if content is None:
            # Kept for fetchers that signal failure by returning None.
            print(f"Skipping page {i} due to connection error.")
            continue
        data_list, next_url = xpath_anchoring(content, data_list)
        if not next_url:
            # No further paging link: nothing left to follow.
            break
    return data_list

In [5]:
url = "https://www.ptt.cc/bbs/Stock/index.html"
pageNum = 10
posts = public_static_pages_scraping(url, pageNum)

# Name the columns explicitly: an empty scrape would otherwise produce a frame
# with no columns at all, leaving no page_link column for the step below.
data_list = pd.DataFrame(posts, columns=["title", "page_link", "author", "date"])

# Transform page_link into an Excel HYPERLINK formula (missing links become "").
# Only this one column is needed, so map over it instead of applying row-wise.
data_list["page_link"] = data_list["page_link"].map(
    lambda link: f'=HYPERLINK("{link}", "{link}")' if pd.notna(link) else ""
)

data_list.to_excel("output.xlsx", index=False)
print("Excel file saved successfully!")

Excel file saved successfully!


In [10]:
# Display data_list (the DataFrame assigned in the scrape/export cell) as this cell's output.
data_list

Unnamed: 0,title,page_link,author,date
0,[請益] 突然都沒收到股利通知信,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",peggylovesk,4/27
1,Re: [請益] 股版閒聊文是不是很多起鬨的樂子人?,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",DustToDust,4/27
2,Re: [請益] 股版閒聊文是不是很多起鬨的樂子人?,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",kensmile,4/27
3,Re: [請益] 股版閒聊文是不是很多起鬨的樂子人?,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",zpeople0116,4/27
4,Re: [新聞] 台灣維他命價格比美國高三倍！出國帶貨,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",supermanrs,4/27
...,...,...,...,...
193,[情報] 2940歐都納 舉辦興櫃前法說會,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",addy7533967,4/25
194,[新聞] Trump says China's Xi has called him,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",jojoway,4/25
195,[情報] 3709 鑫聯大投控 3月自結 0.43,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",q1a1,4/25
196,Re: [新聞] Trump says China's Xi has called him,"=HYPERLINK(""https://www.ptt.cc/bbs/Stock/M.174...",ntpcgov,4/25
