In [60]:
from bs4 import BeautifulSoup
from rich import print
import httpx

In [75]:
def page_to_simple_dict(html_str: str) -> dict:
    """
    Parse the HTML of a single PTT article page into one flat dict.

    The result merges three groups of keys:

    * one entry per ``article-metaline`` header row, keyed by the row's tag
      text exactly as it appears on the page, e.g.
          '作者': 'ReiKuromiya (ReiKuromiya)',
          '標題': '[正妹] 周子瑜',
          '時間': 'Sun Jan  1 00:26:06 2023',
    * the '時間' value broken into its parts:
          'Year': '2023', 'Month': 'Jan', 'Date': '1',
          'Week': 'Sun', 'Time': '00:26:06',
    * 'image_catch_list': the ``src`` of the first <img> found in each
      ``richcontent`` div,
      'image_link': the text of every <a> whose text contains an image
      file extension.

    Args:
        html_str: Full HTML source of the article page.

    Returns:
        dict with the keys described above; the two image entries are lists.

    Raises:
        AttributeError: the page has no ``main-content`` div, or a header row
            lacks its tag/value span.
        KeyError: the header has no '時間' row.
        IndexError: the '時間' value has too few whitespace-separated fields.
    """

    # Extensions that mark a link's text as pointing at an image.
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif")

    def to_detail_date(date_str: str) -> dict:
        """Split a 'Sun Jan  1 00:26:06 2023' style timestamp into named parts."""
        # split() with no separator collapses runs of whitespace, so the
        # space-padded single-digit day does not produce an empty field that
        # would push the real day out of position 2.
        detail_date = date_str.split()
        return {
            "Year": detail_date[-1],
            "Month": detail_date[1],
            "Date": detail_date[2],
            "Week": detail_date[0],
            "Time": detail_date[-2],
        }

    soup = BeautifulSoup(html_str, "html.parser")

    # Everything of interest lives inside the main content container.
    body_data = soup.find("div", class_="bbs-screen bbs-content", id="main-content")

    # Header rows: each one carries a tag span (the key) and a value span.
    header_dict = {
        str(line.find("span", class_="article-meta-tag").string): str(
            line.find("span", class_="article-meta-value").string
        )
        for line in body_data.find_all("div", class_="article-metaline")
    }

    # Image sources from the richcontent divs; divs without an <img> are skipped.
    images_list = [
        img.get("src")
        for img in (
            preview.find("img")
            for preview in body_data.find_all("div", class_="richcontent")
        )
        if img is not None
    ]

    # Link texts that look like image URLs.
    link_texts = (str(link.string) for link in body_data.find_all("a"))
    link_image = [
        text
        for text in link_texts
        if any(suffix in text for suffix in image_suffixes)
    ]

    return (
        header_dict
        | to_detail_date(header_dict["時間"])
        | {"image_catch_list": images_list, "image_link": link_image}
    )




In [68]:
# Fetch one article page from the Beauty board.
# NOTE(review): the over18 cookie presumably skips PTT's age-confirmation page — confirm.
result = httpx.get(
    url="https://www.ptt.cc/bbs/Beauty/M.1672554775.A.108.html",
    headers={"cookie": "over18=1"},
)



In [69]:
# Show the raw HTML body of the response as this cell's output.
result.text

'<!DOCTYPE html>\n<html>\n\t<head>\n\t\t<meta charset="utf-8">\n\t\t\n\n<meta name="viewport" content="width=device-width, initial-scale=1">\n\n<title>[正妹] 中國 體育記者 巢怡雯 - 看板 Beauty - 批踢踢實業坊</title>\n<meta name="robots" content="all">\n<meta name="keywords" content="Ptt BBS 批踢踢">\n<meta name="description" content="https://imgur.com/5Pp3KmR.jpg\nhttps://imgur.com/u3XSwO9.jpg\nhttps://imgur.com/LbKBaau.jpg\nhttps://imgur.com/GMCfINQ.jpg\nhttps://imgur.com/gQRvw43.jpg\n">\n<meta property="og:site_name" content="Ptt 批踢踢實業坊">\n<meta property="og:title" content="[正妹] 中國 體育記者 巢怡雯">\n<meta property="og:description" content="https://imgur.com/5Pp3KmR.jpg\nhttps://imgur.com/u3XSwO9.jpg\nhttps://imgur.com/LbKBaau.jpg\nhttps://imgur.com/GMCfINQ.jpg\nhttps://imgur.com/gQRvw43.jpg\n">\n<link rel="canonical" href="https://www.ptt.cc/bbs/Beauty/M.1672554775.A.108.html">\n\n<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.css">\n<link rel="stylesheet" type="text/css" href

In [76]:
# Parse the fetched HTML with page_to_simple_dict.
test = page_to_simple_dict(result.text)

# print is rich's print (imported in the first cell), not the builtin.
print(test)

In [79]:
# Scratch check of the timestamp parsing. The day is space-padded, so splitting
# on a single " " leaves an empty field. split() with no separator collapses
# whitespace runs, and unlike split(" ") + remove("") it also works for
# two-digit days, where there is no empty field and remove("") would raise
# ValueError.
string = "Sun Jan  1 20:04:22 2023"
detail_date = string.split()
detail_date

['Sun', 'Jan', '1', '20:04:22', '2023']

In [80]:
# Name the individual timestamp fields; detail_date is the split list built in
# the cell before this one.
dict(
    Year=detail_date[-1],
    Month=detail_date[1],
    Date=detail_date[2],
    Week=detail_date[0],
    Time=detail_date[-2],
)

{'Year': '2023',
 'Month': 'Jan',
 'Date': '1',
 'Week': 'Sun',
 'Time': '20:04:22'}