In [141]:
from bs4 import BeautifulSoup
import httpx
from typing import Callable
from functools import reduce
import pandas as pd 
import asyncio
# Cookie header merged into every request header by get_header().
# NOTE(review): presumably marks the site's over-18 confirmation as accepted — confirm.
PTT_OVER_18_HEADER = {"cookie": "over18=1"}

In [82]:
def recommend_page_to_simple_dict(html_str: str) -> dict:
    """Parse a board index page (e.g. ``index.html``) into a plain dict.

    Args:
        html_str: HTML of the index page. The value is handed straight to
            ``BeautifulSoup``, so ``bytes`` work as well as ``str``.

    Returns:
        A dict with the paging links under ``"Old"``, ``"Prev"``, ``"Next"``
        and ``"New"`` (``""`` when a button is missing or has no ``href``)
        plus ``"Body"``: a list of dicts with the keys ``"Title"``,
        ``"Author"``, ``"HotNumber"``, ``"Date"`` and ``"URL"``, one per
        listed article. Rows whose title contains ``[公告]`` and rows
        without a title or link are dropped.
    """

    def item_change_to_dict(div_obj) -> dict:
        """Flatten one ``r-ent`` row (a bs4 tag) into a dict."""
        hot_tag = div_obj.find("span")
        hot_text = hot_tag.get_text(strip=True) if hot_tag is not None else ""
        if not hot_text:
            # No counter span, or an empty one: treat as zero.
            hot_number = 0
        elif hot_text.isdecimal():
            hot_number = int(hot_text)
        else:
            # Non-numeric counter text is kept as-is.
            hot_number = hot_text

        # Look for the link inside the title cell when there is one, so an
        # anchor elsewhere in the row is never mistaken for the article link.
        title_box = div_obj.find("div", class_="title")
        title = (title_box if title_box is not None else div_obj).find("a")

        if title is not None:
            # get_text() always returns a str; .string is None when the tag
            # has nested markup, which used to break the filter below.
            title_str = title.get_text().strip()
            title_url = title.get("href") or ""
        else:
            title_str, title_url = "", ""

        author = div_obj.find("div", class_="author")
        date = div_obj.find("div", class_="date")

        return {
            "Title": title_str,
            "Author": author.get_text() if author is not None else "",
            "HotNumber": hot_number,
            "Date": date.get_text() if date is not None else "",
            "URL": title_url,
        }

    def filter_function(item_dict: dict) -> bool:
        """Keep rows that have a title and a link and are not announcements."""
        title = item_dict["Title"]
        # "[公告]" also matches forwarded announcements ("Fw:[公告]").
        return bool(title) and bool(item_dict["URL"]) and "[公告]" not in title

    soup = BeautifulSoup(html_str, "html.parser")

    # action button -> button dict
    location = ["Old", "Prev", "Next", "New"]
    button_dict = dict.fromkeys(location, "")
    # Matching on a single class keeps working if the class order changes.
    paging_group = soup.find("div", class_="btn-group-paging")
    if paging_group is not None:
        hrefs = [anchor.get("href", "") for anchor in paging_group.find_all("a")]
        button_dict.update(zip(location, hrefs))

    # get recommend list -> detail dict
    recommend_list = soup.find("div", class_="r-list-container")
    rows = recommend_list.find_all("div", class_="r-ent") if recommend_list is not None else []
    recommend_result = [item_change_to_dict(div_obj=row) for row in rows]
    recommend_after_filter_result = [item for item in recommend_result if filter_function(item)]

    return button_dict | {"Body": recommend_after_filter_result}

In [120]:
def page_to_simple_dict(html_str: str, func_list: list[Callable] = None) -> dict:
    """Parse one article page into a flat dict.

    The result merges the header fields found on the page, the split-up
    date parts and a parsed timestamp, for example::

        '作者': 'ReiKuromiya (ReiKuromiya)',
        '標題': '[正妹] 周子瑜',
        '時間': 'Sun Jan  1 00:26:06 2023',
        'Year': '2023',
        'Month': 'Jan',
        'Day': '1',
        'Week': 'Sun',
        'Time': '00:26:06',
        'pd_time': Timestamp('2023-01-01 00:26:06')

    Args:
        html_str: HTML of the article page. It is passed straight to
            ``BeautifulSoup``, so ``bytes`` work as well as ``str``.
        func_list: Optional callables; each receives the parsed soup and
            returns a dict that is merged into the result (later entries
            win on key clashes). ``None`` or an empty list adds nothing.

    Returns:
        The merged dict described above.

    Raises:
        ValueError: If the main content container is missing or the date
            string has fewer than five whitespace-separated parts.
        KeyError: If no ``時間`` header could be found.
    """

    def to_detail_date(date_str: str) -> dict:
        """Split ``'Sun Jan  1 00:26:06 2023'`` into its named parts."""
        # split() without an argument collapses runs of spaces, so a padded
        # day ("Jan  1") or a leading space cannot shift the positions.
        parts = date_str.split()
        if len(parts) < 5:
            raise ValueError(f"unexpected date format: {date_str!r}")
        return {
            "Year": parts[-1],
            "Month": parts[1],
            "Day": parts[2],
            "Week": parts[0],
            "Time": parts[-2],
        }

    def to_pd_time(detail_date: dict) -> dict:
        """Build ``{"pd_time": Timestamp}`` from the split date parts."""
        date_str = f"{detail_date.get('Year' , '')}-{detail_date.get('Month', '')}-{detail_date.get('Day', '')} {detail_date.get('Time', '')}"
        return {"pd_time": pd.to_datetime(date_str)}

    soup = BeautifulSoup(html_str, "html.parser")

    # get main data
    body_data = soup.find("div", class_="bbs-screen bbs-content", id="main-content")
    if body_data is None:
        raise ValueError("article page has no main-content container")

    # get header data
    header_dict = {}
    for line in body_data.find_all("div", class_="article-metaline"):
        tag = line.find("span", class_="article-meta-tag")
        value = line.find("span", class_="article-meta-value")
        if tag is None or value is None:
            continue  # incomplete header row: nothing usable to record
        header_dict[tag.get_text()] = value.get_text()

    if "時間" not in header_dict:
        # Fallback for pages whose header is plain text inside the body,
        # e.g. https://www.ptt.cc/bbs/Beauty/M.1690589266.A.166.html
        contents = body_data.contents
        raw_text = contents[2] if len(contents) > 2 else ""
        if isinstance(raw_text, str):  # only text nodes are str; skip nested tags
            for text_line in raw_text.split("\n")[:2]:
                # Split on the first colon only: the time value itself
                # contains colons ("00:26:06").
                key, sep, value = text_line.partition(":")
                if sep:
                    header_dict[key.strip()] = value.strip()

    detail_date = to_detail_date(header_dict["時間"])
    pd_time = to_pd_time(detail_date)

    page_data = header_dict | detail_date | pd_time

    if func_list:
        # Run every extra extractor first, then merge in order.
        for extra in [func(soup) for func in func_list]:
            page_data |= extra

    return page_data

In [121]:
def get_header():
    """Return a fresh request-header dict: the over-18 cookie plus a browser User-Agent."""
    browser_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60"
    # Unpacking builds a new dict each call, so callers cannot mutate the shared constant.
    return {**PTT_OVER_18_HEADER, "User-Agent": browser_agent}

In [122]:
# Fetch one board index page and parse it into paging links plus the article list.
result = httpx.get("https://www.ptt.cc/bbs/Beauty/index3999.html" , headers=get_header())
result_dict = recommend_page_to_simple_dict(result.text)
result_dict

{'Old': '/bbs/Beauty/index1.html',
 'Prev': '/bbs/Beauty/index3998.html',
 'Next': '/bbs/Beauty/index4000.html',
 'New': '/bbs/Beauty/index.html',
 'Body': [{'Title': '[正妹] 對岸車模小玥玥',
   'Author': 'fujiohuang',
   'HotNumber': 2,
   'Date': ' 3/02',
   'URL': '/bbs/Beauty/M.1709377543.A.F27.html'},
  {'Title': '[正妹] Cosplay 659 中國 下乳',
   'Author': 'Gentlemon',
   'HotNumber': 0,
   'Date': ' 3/02',
   'URL': '/bbs/Beauty/M.1709378641.A.532.html'},
  {'Title': '[正妹] 每日一尻',
   'Author': 'jerryyuan',
   'HotNumber': 1,
   'Date': ' 3/02',
   'URL': '/bbs/Beauty/M.1709382080.A.897.html'},
  {'Title': '[正妹] 泡',
   'Author': 'YuiLover',
   'HotNumber': 3,
   'Date': ' 3/02',
   'URL': '/bbs/Beauty/M.1709383585.A.06A.html'},
  {'Title': '[正妹] IVE 金秋天',
   'Author': 'YuiLover',
   'HotNumber': 29,
   'Date': ' 3/02',
   'URL': '/bbs/Beauty/M.1709386024.A.B9E.html'},
  {'Title': '[正妹] 兩寶媽 Irene 圓圓',
   'Author': 'asxc530530',
   'HotNumber': 13,
   'Date': ' 3/02',
   'URL': '/bbs/Beauty/M.1709

In [123]:
# Take the first listed article as the sample for the following cells.
test_page_dict = result_dict["Body"][0]
test_page_dict

{'Title': '[正妹] 對岸車模小玥玥',
 'Author': 'fujiohuang',
 'HotNumber': 2,
 'Date': ' 3/02',
 'URL': '/bbs/Beauty/M.1709377543.A.F27.html'}

In [124]:
def full_link(url):
    """Turn a site-relative href (e.g. ``/bbs/Beauty/index.html``) into an absolute URL.

    Hrefs without a leading slash are resolved against the site root, and an
    href that is already absolute is returned unchanged.
    """
    from urllib.parse import urljoin

    return urljoin("https://www.ptt.cc", url)

In [125]:
# Absolute URL of the sample article.
url = full_link(test_page_dict["URL"])
url

'https://www.ptt.cc/bbs/Beauty/M.1709377543.A.F27.html'

In [126]:
# Download the sample article page; the displayed value is the response status.
test_page = httpx.get(url, headers=get_header())
test_page

<Response [200 OK]>

In [127]:
# Raw response body as bytes.
# NOTE(review): this displays the whole body; consider slicing it for a shorter preview.
test_page.content

b'<!DOCTYPE html>\n<html>\n\t<head>\n\t\t<meta charset="utf-8">\n\t\t\n\n<meta name="viewport" content="width=device-width, initial-scale=1">\n\n<title>[\xe6\xad\xa3\xe5\xa6\xb9] \xe5\xb0\x8d\xe5\xb2\xb8\xe8\xbb\x8a\xe6\xa8\xa1\xe5\xb0\x8f\xe7\x8e\xa5\xe7\x8e\xa5 - \xe7\x9c\x8b\xe6\x9d\xbf Beauty - \xe6\x89\xb9\xe8\xb8\xa2\xe8\xb8\xa2\xe5\xaf\xa6\xe6\xa5\xad\xe5\x9d\x8a</title>\n<meta name="robots" content="all">\n<meta name="keywords" content="Ptt BBS \xe6\x89\xb9\xe8\xb8\xa2\xe8\xb8\xa2">\n<meta name="description" content="https://i.imgur.com/DgEpPJ3.jpg\nhttps://i.imgur.com/bse2pqp.jpg\nhttps://i.imgur.com/CqaErqC.jpg\nhttps://i.imgur.com/C7XH1kQ.jpg\nhttps://i.imgur.com/QR4Saj6.jpg\n">\n<meta property="og:site_name" content="Ptt \xe6\x89\xb9\xe8\xb8\xa2\xe8\xb8\xa2\xe5\xaf\xa6\xe6\xa5\xad\xe5\x9d\x8a">\n<meta property="og:title" content="[\xe6\xad\xa3\xe5\xa6\xb9] \xe5\xb0\x8d\xe5\xb2\xb8\xe8\xbb\x8a\xe6\xa8\xa1\xe5\xb0\x8f\xe7\x8e\xa5\xe7\x8e\xa5">\n<meta property="og:description"

In [128]:
# Parse the article page (passed as bytes) into header fields, date parts and a timestamp.
page_dict = page_to_simple_dict(test_page.content)
page_dict

{'作者': 'fujiohuang (啪噠碰咚鏘)',
 '標題': '[正妹] 對岸車模小玥玥',
 '時間': 'Sat Mar  2 19:05:41 2024',
 'Year': '2024',
 'Month': 'Mar',
 'Day': '2',
 'Week': 'Sat',
 'Time': '19:05:41',
 'pd_time': Timestamp('2024-03-02 19:05:41')}

In [131]:
# Reference timestamp used in the range comparison below.
check_date = pd.to_datetime("2023-01-01-00:26:06")
check_date

Timestamp('2023-01-01 00:26:06')

In [133]:
# Timestamp parsed from the sample article.
test_date = page_dict['pd_time']
test_date

Timestamp('2024-03-02 19:05:41')

In [137]:
# Show test_date again; the value is the same as in the cell above.
test_date

Timestamp('2024-03-02 19:05:41')

In [136]:
# A timestamp one day before check_date, used as the lower bound below.
test_date_2 = pd.to_datetime("2022-12-31-00:26:06")

In [138]:
# Chained comparison: True when check_date lies strictly between the two other timestamps.
test_date_2 < check_date < test_date

True

In [149]:
async def page_time_range(url:str , client:httpx.AsyncClient)->list[pd.Timestamp]:
    """Return the post times of the first and last article listed on an index page.

    Args:
        url: Absolute URL of a board index page.
        client: Open ``httpx.AsyncClient`` used for all requests.

    Returns:
        ``[first_article_time, last_article_time]`` as ``pd.Timestamp``
        values, taken from the ``pd_time`` field of each article page.

    Raises:
        httpx.HTTPStatusError: If any of the requests returns an error status.
        ValueError: If the index page lists no usable article.
    """
    headers = get_header()

    index_response = await client.get(url, headers=headers)
    # Fail here with a clear HTTP error instead of later while parsing an error page.
    index_response.raise_for_status()

    articles = recommend_page_to_simple_dict(index_response.content)["Body"]
    if not articles:
        raise ValueError(f"no articles listed on {url}")

    site_root = "https://www.ptt.cc"
    first_url = f"{site_root}{articles[0]['URL']}"
    last_url = f"{site_root}{articles[-1]['URL']}"

    if first_url == last_url:
        # Only one article on the page: one download serves both ends.
        first_response = last_response = await client.get(first_url, headers=headers)
    else:
        first_response, last_response = await asyncio.gather(
            client.get(first_url, headers=headers),
            client.get(last_url, headers=headers),
        )

    first_response.raise_for_status()
    last_response.raise_for_status()

    first_page = page_to_simple_dict(first_response.content)
    last_page = page_to_simple_dict(last_response.content)

    return [first_page["pd_time"], last_page["pd_time"]]

async def find_page_by_time(
    page_time: str,
    client: httpx.AsyncClient,
    board: str = "Beauty",
    first_index: int = 1,
    last_index: int = 3999,
) -> str:
    """Binary-search a board's numbered index pages for the one covering a time.

    The search assumes article times grow with the index number.

    Args:
        page_time: Target time, in any form ``pd.to_datetime`` accepts.
        client: Open ``httpx.AsyncClient`` used for all requests.
        board: Board name used in the index URL.
        first_index: Lowest index number to consider.
        last_index: Highest index number to consider.

    Returns:
        URL of the index page whose first and last article times enclose
        ``page_time``, or ``""`` when no page in the range does (for
        example when the target falls between two pages).

    Raises:
        ValueError: If ``page_time`` does not parse to a valid timestamp.
    """
    target_page_time = pd.to_datetime(page_time)
    if pd.isna(target_page_time):
        # Comparisons against a missing time are always False, so the
        # search below could never decide which half to keep.
        raise ValueError(f"invalid page_time: {page_time!r}")

    left, right = first_index, last_index

    while left <= right:
        mid_number = (left + right) // 2
        ptt_index_url = f"https://www.ptt.cc/bbs/{board}/index{mid_number}.html"

        # "\r" keeps the progress on a single, overwritten line.
        print(f"Now html: {ptt_index_url}" , end="\r")

        time_range = await page_time_range(ptt_index_url, client)
        page_start, page_end = time_range[0], time_range[-1]

        if page_start <= target_page_time <= page_end:
            return ptt_index_url

        if target_page_time < page_start:
            right = mid_number - 1
        else:
            # Plain else (not a second comparison) so the window always
            # shrinks, even if a page time is missing.
            left = mid_number + 1

    return ""

In [150]:
# Top-level `async with` / `await`: only valid where top-level await is supported (e.g. IPython/Jupyter).
# One shared client is used for every request made during the search.
async with httpx.AsyncClient() as client:
    test_result = await find_page_by_time("2023-01-01-00:26:06" , client)


Now html: https://www.ptt.cc/bbs/Beauty/index3656.html

In [151]:
# URL of the index page found by the search ("" would mean no page matched).
test_result

'https://www.ptt.cc/bbs/Beauty/index3656.html'