In [3]:
from bs4 import BeautifulSoup
import httpx
from typing import Callable
from functools import reduce
from rich import print
PTT_OVER_18_HEADER = {"cookie": "over18=1"}

In [4]:

def page_to_simple_dict(html_str: str, func_list: list[Callable] = None) -> dict:
    """
    format like
    '作者': 'ReiKuromiya (ReiKuromiya)',
    '標題': '[正妹] 周子瑜',
    '時間': 'Sun Jan  1 00:26:06 2023',
    'Year': '2023',
    'Month': 'Jan',
    'Day': '',
    'Week': 'Sun',
    'Time': '00:26:06',
    # 'Body': [(..., ...),  ...]}
    """

    def to_detail_date(date_str: str) -> dict:
        detail_date = date_str.split(" ")

        if "" in detail_date:
            detail_date.remove("")
        return {
            "Year": detail_date[-1],
            "Month": detail_date[1],
            "Day": detail_date[2],
            "Week": detail_date[0],
            "Time": detail_date[-2],
        }

    # header = {"cookie": "over18=1"}
    # result = httpx.get(url=url, headers=header)

    soup = BeautifulSoup(html_str, "html.parser")

    # get main data
    body_data = soup.find("div", class_="bbs-screen bbs-content", id="main-content")

    # get header data
    header_data = body_data.find_all("div", class_="article-metaline")

    tab_list = [
        str(line.find("span", class_="article-meta-tag").string)
        for line in header_data
    ]

    value_list = [
        str(line.find("span", class_="article-meta-value").string)
        for line in header_data
    ]

    header_dict = dict(zip(tab_list, value_list))

    if "時間" not in header_dict:
        # get text body #https://www.ptt.cc/bbs/Beauty/M.1690589266.A.166.html
        process_text = body_data.contents[2]
        process_text = process_text.split("\n")[:2]
        process_text = [item.split(":") for item in process_text]
        process_result_dict = {item[0]: item[1] for item in process_text}
        header_dict |= process_result_dict

    

    page_data = (
        header_dict
        | to_detail_date(header_dict["時間"])
        
    )

    if func_list is not None:
        addition_dict_list = reduce(
            lambda a, b: a | b, [func(soup) for func in func_list]
        )
        page_data |= addition_dict_list

    return page_data

In [5]:
def get_images_from_page(soup:BeautifulSoup)->dict:
    body_data = soup.find("div", class_="bbs-screen bbs-content", id="main-content")
    # image src
    images_catch_link = body_data.find_all("div", class_="richcontent")

    images_catch_link = [
        item.get("src")
        for image in images_catch_link
        if (item := image.find("img"))
    ]

    # image link lists
    image_link = body_data.find_all("a")
    image_link = [
        link_str
        for link in image_link
        if any(
            substring in (link_str := str(link.string))
            for substring in [".png", ".jpg", "jpeg", ".gif"]
        )
    ]
    return  {
            "image_catch_list": images_catch_link,
            "image_link": image_link,
        }

In [6]:
# test_response = httpx.get("https://www.ptt.cc/bbs/Beauty/M.1692517466.A.D58.html" , headers=PTT_OVER_18_HEADER)
# test_response = httpx.get("https://www.ptt.cc/bbs/Beauty/M.1672503968.A.5B5.html" , headers=PTT_OVER_18_HEADER)
# test_response = httpx.get("https://www.ptt.cc/bbs/Beauty/M.1678508772.A.163.html" , headers=PTT_OVER_18_HEADER)
test_response = httpx.get("https://www.ptt.cc/bbs/Beauty/M.1672510614.A.C61.html" , headers=PTT_OVER_18_HEADER)
test_response.text

'<!DOCTYPE html>\n<html>\n\t<head>\n\t\t<meta charset="utf-8">\n\t\t\n\n<meta name="viewport" content="width=device-width, initial-scale=1">\n\n<title>[正妹] 六兔興旺 - 看板 Beauty - 批踢踢實業坊</title>\n<meta name="robots" content="all">\n<meta name="keywords" content="Ptt BBS 批踢踢">\n<meta name="description" content="https://i.imgur.com/XaKrUlM.jpg 歳末新春SUPERキャンペーン\nhttps://i.imgur.com/uXh36LW.png miru\nhttps://i.imgur.com/IdWjvsc.png\nhttps://i.imgur.com/0gAyDJP.png\nhttps://i.imgur.com/YOO1vKw.png\n">\n<meta property="og:site_name" content="Ptt 批踢踢實業坊">\n<meta property="og:title" content="[正妹] 六兔興旺">\n<meta property="og:description" content="https://i.imgur.com/XaKrUlM.jpg 歳末新春SUPERキャンペーン\nhttps://i.imgur.com/uXh36LW.png miru\nhttps://i.imgur.com/IdWjvsc.png\nhttps://i.imgur.com/0gAyDJP.png\nhttps://i.imgur.com/YOO1vKw.png\n">\n<link rel="canonical" href="https://www.ptt.cc/bbs/Beauty/M.1672510614.A.C61.html">\n\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.cs

In [46]:
def get_body_content(soup:BeautifulSoup)->dict:
    body_data = soup.find("div", class_="bbs-screen bbs-content", id="main-content")
    
    body_text = body_data.text.strip()
    
    index_end = body_text.find("※ 發信站")
    # must have ※ 發信站
    body_text = body_text[:index_end]
    print(body_text)
    
    return {"is_can_use" : bool(index_end != -1) ,  "body_content":body_text }

In [47]:
page_dict = page_to_simple_dict(test_response.text,func_list=[get_body_content , get_images_from_page] ) # , func_list=[get_images_from_page]
print(page_dict)
# print(f"image_catch_list size : {len(page_dict['image_catch_list'])}")
# print(f"image_link size : {len(page_dict['image_link'])}")