In [65]:
from bs4 import BeautifulSoup
import httpx
from typing import Callable
from functools import reduce
import pandas as pd

PTT_OVER_18_HEADER = {"cookie": "over18=1"}

In [66]:
 # static function
 
 # add some new function 
    
def page_to_simple_dict(html_str: str , func_list:list[Callable] = None) -> dict:
    """
    format like
    '作者': 'ReiKuromiya (ReiKuromiya)',
    '標題': '[正妹] 周子瑜',
    '時間': 'Sun Jan  1 00:26:06 2023',
    'Year': '2023',
    'Month': 'Jan',
    'Day': '',
    'Week': 'Sun',
    'Time': '00:26:06',
    # 'Body': [(..., ...),  ...]}
    """

    def to_detail_date(date_str: str) -> dict:
        detail_date = date_str.split(" ")

        if "" in detail_date:
            detail_date.remove("")
        return {
            "Year": detail_date[-1],
            "Month": detail_date[1],
            "Day": detail_date[2],
            "Week": detail_date[0],
            "Time": detail_date[-2],
        }

    # header = {"cookie": "over18=1"}
    # result = httpx.get(url=url, headers=header)

    soup = BeautifulSoup(html_str, "html.parser")

    # get main data
    body_data = soup.find("div", class_="bbs-screen bbs-content", id="main-content")

    # get header data
    header_data = body_data.find_all("div", class_="article-metaline")

    tab_list = [
        str(line.find("span", class_="article-meta-tag").string)
        for line in header_data
    ]

    value_list = [
        str(line.find("span", class_="article-meta-value").string)
        for line in header_data
    ]

    header_dict = dict(zip(tab_list, value_list))

    if "時間" not in header_dict:
        # get text body #https://www.ptt.cc/bbs/Beauty/M.1690589266.A.166.html
        process_text = body_data.contents[2]
        process_text = process_text.split("\n")[:2]
        process_text = [item.split(":") for item in process_text]
        process_result_dict = {item[0]: item[1] for item in process_text}
        header_dict |= process_result_dict

    # image src
    images_catch_link = body_data.find_all("div", class_="richcontent")

    images_catch_link = [
        item.get("src")
        for image in images_catch_link
        if (item := image.find("img"))
    ]

    # image link lists
    image_link = body_data.find_all("a")
    image_link = [
        link_str
        for link in image_link
        if any(
            substring in (link_str := str(link.string))
            for substring in [".png", ".jpg", "jpeg", ".gif"]
        )
    ]

    page_data = (
        header_dict
        | to_detail_date(header_dict["時間"])
        | {
            "image_catch_list": images_catch_link,
            "image_link": image_link,
        }
    )
    
    if func_list is not None:
        addition_dict_list = reduce(lambda a , b : a | b , [func(soup) for func in func_list] )
        page_data |= addition_dict_list
        

    return page_data

In [67]:
html_page = httpx.get("https://www.ptt.cc/bbs/Beauty/M.1672503968.A.5B5.html" , headers=PTT_OVER_18_HEADER)
# html_page = httpx.get("https://www.ptt.cc/bbs/Beauty/M.1672503968.A.5B5.html" , headers=PTT_OVER_18_HEADER)

In [70]:
def group_data_by_user_name(df:pd.DataFrame)->pd.DataFrame:
    return df.groupby('user_id').agg({
            'like_boo_type': 'first',
            'body': lambda x: list(x),
            'cnt': 'sum'
    }).reset_index()

def get_like_boo_count_dict(soup: BeautifulSoup) -> dict:
    
    def push_list_to_dict(item:BeautifulSoup) -> dict:
        
        
        span_item = [thing.string for thing in  item.find_all("span")]
        span_item = [str(thing).strip() for thing in  span_item]
        
        ip ,date_str , time_str = span_item[3].split(" ")
        
        return {
            "like_boo_type": span_item[0],
            "user_id": span_item[1],
            "body" : [{"comment" : span_item[2].replace(":" ,"").strip(),
                        "ip" : ip,
                        "date" : date_str,
                        "time" : time_str,
                    }],
            "cnt" : 1,
            
        }
    
    push_list = soup.find_all("div" , class_="push")
    
    
    all_comment  = [push_list_to_dict(item=item) for item in push_list]
   
    like_list = list(filter(lambda x : x["like_boo_type"] == "推" , all_comment))
    boo_list = list(filter(lambda x : x["like_boo_type"] == "噓" , all_comment))
    mid_list = list(filter(lambda x : x["like_boo_type"] == "→" , all_comment))
    
    
    like_table = pd.DataFrame(like_list)
    boo_table = pd.DataFrame(boo_list)
    mid_table = pd.DataFrame(mid_list)
    
    like_table = group_data_by_user_name(like_table)
    boo_table = group_data_by_user_name(boo_table)
    mid_table = group_data_by_user_name(mid_table)
    
    return {
        "like_table" : like_table,
        "boo_table": boo_table , 
        "mid_table": mid_table,
    }

In [71]:
page_dict = page_to_simple_dict(html_page.text , [get_like_boo_count_dict])
page_dict 

{'作者': 'ReiKuromiya (ReiKuromiya)',
 '標題': '[正妹] 周子瑜',
 '時間': 'Sun Jan  1 00:26:06 2023',
 'Year': '2023',
 'Month': 'Jan',
 'Day': '1',
 'Week': 'Sun',
 'Time': '00:26:06',
 'image_catch_list': ['https://cache.ptt.cc/c/https/i.imgur.com/BdmZ7Psl.jpg?e=1709288787&s=Oi9afBeydG3u17FZ886nqQ',
  'https://cache.ptt.cc/c/https/i.imgur.com/bBiw4ISl.jpg?e=1709257568&s=fTMSE-0hbwRoJSgozpe5ZA',
  'https://cache.ptt.cc/c/https/i.imgur.com/KDmRgegl.jpg?e=1709261149&s=1PMEkT1tGxY4FVgaofjdzg',
  'https://cache.ptt.cc/c/https/i.imgur.com/hRHpy8Xl.jpg?e=1709256477&s=Ym5hezDGtSl3g1_qwmZasw',
  'https://cache.ptt.cc/c/https/i.imgur.com/P5rI8UMl.jpg?e=1709253540&s=DcFO2D6xOnxhVa-xHtKkGg',
  'https://cache.ptt.cc/c/https/i.imgur.com/Uq0oG4El.jpg?e=1709271739&s=VsgZTRQdGxr6PZekOkbfEA',
  'https://cache.ptt.cc/c/https/i.imgur.com/pPXBHuZl.jpg?e=1709303665&s=8VM2J0Pu6VIvSxIb-VY-jw',
  'https://cache.ptt.cc/c/https/i.imgur.com/dFiDDhl.jpg?e=1709302537&s=3eI8qWuJJIkEaGFPnzl83w',
  'https://cache.ptt.cc/c/https

In [75]:
table = page_dict["mid_table"]
table

Unnamed: 0,user_id,like_boo_type,body,cnt
0,JyouItsu,→,"[[{'comment': '已經成路人了', 'ip': '182.233.34.159'...",1
1,Zzzip,→,"[[{'comment': '東南亞的感覺', 'ip': '42.72.255.1', '...",1
2,angel801109,→,"[[{'comment': '推', 'ip': '211.22.178.8', 'date...",1
3,chill247,→,"[[{'comment': '啊 長大啦', 'ip': '106.64.122.51', ...",1
4,d06,→,"[[{'comment': '依舊讚', 'ip': '58.26.140.38', 'da...",1
5,glion,→,"[[{'comment': '真的就是…各種長大了', 'ip': '223.140.115...",1
6,jetcheng,→,"[[{'comment': '子瑜過譽了', 'ip': '123.194.157.122'...",1
7,lalacos123,→,"[[{'comment': '小又I字 擠的啊', 'ip': '111.253.160.2...",1
8,logic886,→,"[[{'comment': '以前比較好', 'ip': '1.200.123.108', ...",1
9,orange7986,→,"[[{'comment': '長大了', 'ip': '42.77.194.240', 'd...",1


In [76]:
table[table["cnt"] > 1]

Unnamed: 0,user_id,like_boo_type,body,cnt
14,theclgy2001,→,"[[{'comment': '時間是把殺豬刀…. 難怪二馬拼了命都', 'ip': '220...",2


In [74]:
len(table[table["like_boo_type"] == "推"])

79

In [None]:
import pandas as pd

# Example data for table A
data_a = {
    'user_name': ['Alice', 'Bob', 'Alice'],
    'like_boo_type': ['like', 'boo', 'like'],
    'body': ['comment1', 'comment2', 'comment3'],
    # Add other columns here...
}

# Example data for table B
data_b = {
    'user_name': ['Charlie', 'Alice', 'Bob'],
    'like_boo_type': ['like', 'boo', 'like'],
    'body': ['comment4', 'comment5', 'comment6'],
    # Add other columns here...
}

# Convert data to pandas DataFrame
df_a = pd.DataFrame(data_a)
df_b = pd.DataFrame(data_b)

# Concatenate the two tables together
concatenated_df = pd.concat([df_a, df_b], ignore_index=True)

# Group by "user_name"
grouped_df = concatenated_df.groupby('user_name')

# Iterate over groups
for user_name, group_df in grouped_df:
    print("User:", user_name)
    print(group_df)
    print("\n")


User: Alice
  user_name like_boo_type      body
0     Alice          like  comment1
2     Alice          like  comment3
4     Alice           boo  comment5


User: Bob
  user_name like_boo_type      body
1       Bob           boo  comment2
5       Bob          like  comment6


User: Charlie
  user_name like_boo_type      body
3   Charlie          like  comment4




In [91]:
import pandas as pd

# 创建示例 DataFrame
data = {
    'date': ['0305', '0406', '0507', '0608', '0709', '0801', '0902', '1003', '1010', '1101', '1201']
}
# data["date"] = [f"2023-{item}" for item in data['date']]

df = pd.DataFrame(data)
df

Unnamed: 0,date
0,305
1,406
2,507
3,608
4,709
5,801
6,902
7,1003
8,1010
9,1101


In [92]:
df["date"] = [f"2023-{item}" for item in df['date']]

In [93]:
df["date"]

0     2023-0305
1     2023-0406
2     2023-0507
3     2023-0608
4     2023-0709
5     2023-0801
6     2023-0902
7     2023-1003
8     2023-1010
9     2023-1101
10    2023-1201
Name: date, dtype: object

In [95]:

# 将 "date" 列转换为 datetime 类型
df['date'] = pd.to_datetime(df['date'], format='%Y-%m%d')
df

Unnamed: 0,date
0,2023-03-05
1,2023-04-06
2,2023-05-07
3,2023-06-08
4,2023-07-09
5,2023-08-01
6,2023-09-02
7,2023-10-03
8,2023-10-10
9,2023-11-01


In [96]:

# 使用布尔索引选择日期范围为 "0305" 到 "1010" 的数据
filtered_df = df[(df['date'] >= '2023-03-05') & (df['date'] <= '2023-10-10')]

print(filtered_df)


        date
0 2023-03-05
1 2023-04-06
2 2023-05-07
3 2023-06-08
4 2023-07-09
5 2023-08-01
6 2023-09-02
7 2023-10-03
8 2023-10-10


In [97]:
import json

class CustomEncoder(json.JSONEncoder):
    def encode(self, o):
        if isinstance(o, dict) and len(o) <= 2:
            return "{" + ", ".join(f'"{k}": {json.dumps(v)}' for k, v in o.items()) + "}"
        else:
            return super().encode(o)

# 测试示例
dict1 = {'a': 1, 'b': 2}
dict2 = {'a': 1, 'b': 2, 'c': 3, 'd': 4}

json_str1 = json.dumps(dict1, cls=CustomEncoder, indent=4)
json_str2 = json.dumps(dict2, cls=CustomEncoder, indent=4)

print(json_str1)  # 单行显示
print(json_str2)  # 多行显示


{"a": 1, "b": 2}
{
    "a": 1,
    "b": 2,
    "c": 3,
    "d": 4
}


In [98]:
import json

class CustomEncoder(json.JSONEncoder):
    def encode(self, o):
        if isinstance(o, dict):
            new_dict = {k: v if not isinstance(v, list) else self.process_list(v) for k, v in o.items()}
            return super().encode(new_dict)
        else:
            return super().encode(o)
        
    def process_list(self, lst):
        if len(lst) == 2 and all(isinstance(item, dict) and len(item) == 2 for item in lst):
            return "[" + ", ".join(json.dumps(item) for item in lst) + "]"
        else:
            return lst

# 测试示例
dict1 = {'a': 1, 'b': 2, 'c': [{'x': 1, 'y': 2}, {'u': 3, 'v': 4}]}
dict2 = {'a': 1, 'b': 2, 'c': [{'x': 1, 'y': 2}, {'u': 3, 'v': 4}, {'m': 5, 'n': 6}]}

json_str1 = json.dumps(dict1, cls=CustomEncoder, indent=4)
json_str2 = json.dumps(dict2, cls=CustomEncoder, indent=4)

print(json_str1)
print(json_str2)


{
    "a": 1,
    "b": 2,
    "c": "[{\"x\": 1, \"y\": 2}, {\"u\": 3, \"v\": 4}]"
}
{
    "a": 1,
    "b": 2,
    "c": [
        {
            "x": 1,
            "y": 2
        },
        {
            "u": 3,
            "v": 4
        },
        {
            "m": 5,
            "n": 6
        }
    ]
}


In [1]:
my_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

# 切割成长度为10的小块
chunk_size = 10
chunks = [my_list[i:i+chunk_size] for i in range(0, len(my_list), chunk_size)]

print(chunks)


[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20], [21, 22]]
