In [1]:
import requests
import datetime
from bs4 import BeautifulSoup
from dateutil import parser
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

In [2]:
def get_ptt_date(url, timeout=10):
    """Fetch a PTT article page and return the parsed posting time.

    Args:
        url: Absolute URL of the article page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        Whatever ``dateutil.parser.parse`` produces for the text of the
        fourth ``article-meta-value`` element inside ``#main-container``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``#main-container`` element or fewer
            than four ``article-meta-value`` elements.
    """
    # NOTE(review): the over18 cookie presumably skips PTT's age-confirmation
    # page; confirm it is still required for this board.
    my_headers = {'cookie': 'over18=1;'}
    # A timeout keeps one stalled connection from blocking a worker forever.
    response = requests.get(url, headers=my_headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    main_container = soup.find(id='main-container')
    if main_container is None:
        raise ValueError('main-container not found in page: {}'.format(url))

    meta_values = main_container.find_all(class_='article-meta-value')
    if len(meta_values) < 4:
        raise ValueError('article date not found in page: {}'.format(url))

    # NOTE(review): index 3 is assumed to be the posting time (after author,
    # board and title); confirm against the current page layout.
    datetime_text = meta_values[3].text
    return parser.parse(datetime_text)

In [3]:
def process_post(args):
    """Turn one index-page title entry into a post record, if it qualifies.

    The two inputs arrive as a single tuple so the function can be handed
    directly to ``executor.map``.

    Args:
        args: ``(t, target_date)``. ``t`` is a title element exposing
            ``.text`` and an ``.a`` anchor carrying an ``href``;
            ``target_date`` is the latest posting time to accept.

    Returns:
        A dict with keys ``'title'``, ``'link'`` and ``'date'``
        (formatted ``YYYY/MM/DD``) when the title contains '食記' and the
        article was posted on or before ``target_date``. ``None`` in every
        other case: other kinds of post, entries without a link, posts newer
        than ``target_date``, or a failure while looking up the date.
    """
    t, target_date = args
    temp_title = t.text.strip()
    if '食記' not in temp_title:
        return None

    # Entries without an anchor (or without an href) have no article page to
    # visit; skip them instead of raising and aborting the whole batch.
    anchor = t.a
    if anchor is None or not anchor.get('href'):
        return None
    temp_link = 'https://www.ptt.cc' + anchor['href']

    try:
        temp_date = get_ptt_date(temp_link)
        if temp_date <= target_date:
            date_text = temp_date.strftime('%Y/%m/%d')
            return {'title': temp_title, 'link': temp_link, 'date': date_text}
    except Exception:
        # Best effort: one unreadable article must not stop the crawl, but
        # SystemExit/KeyboardInterrupt are deliberately not swallowed.
        print('Error in get date, url: ', temp_link)
    return None

In [4]:
def get_ptt_posts(url, target_date, post_list, timeout=10):
    """Scrape one board index page and collect its qualifying posts.

    Every ``div.title`` entry on the page is passed to ``process_post``
    together with ``target_date``; the non-``None`` results are appended to
    ``post_list`` in page order.

    Args:
        url: Absolute URL of the index page to read.
        target_date: Forwarded to ``process_post`` for each entry.
        post_list: List that is extended in place with the accepted records.
        timeout: Seconds to wait for the index-page request.

    Returns:
        The ``href`` of the link matched by ``a.btn.wide:nth-of-type(2)``
        (the site-relative path of the page to crawl next), or ``None`` when
        the page has no such link or the link carries no ``href``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    my_headers = {'cookie': 'over18=1;'}
    response = requests.get(url, headers=my_headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    titles = soup.find_all('div', class_='title')

    # The paging button may be missing or lack an href; report that as None
    # rather than raising before this page's posts have been collected.
    next_link = soup.select_one('a.btn.wide:nth-of-type(2)')
    next_page = next_link.get('href') if next_link is not None else None

    # Use a ThreadPoolExecutor thread pool so the per-article date lookups
    # run concurrently; executor.map keeps the results in input order.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_post, [(t, target_date) for t in titles]))

    post_list.extend(result for result in results if result is not None)
    return next_page

In [5]:
# Crawl state: accepted post records, the newest posting time to accept, and
# the index page to start from.
all_post = []
target_date = datetime.datetime(2022, 11, 30)
url = 'https://www.ptt.cc/bbs/Food/index.html'

# Keep following the link each index page hands back until at least 1000
# posts have been collected.
while len(all_post) < 1000:
    next_page = get_ptt_posts(url, target_date, all_post)
    if not next_page:
        # No further page to follow: stop instead of building an invalid URL
        # and looping on it.
        print('No more index pages, post count: ', len(all_post))
        break
    url = 'https://www.ptt.cc' + next_page
    print(url, 'post count: ', len(all_post))

https://www.ptt.cc/bbs/Food/index7002.html post count:  0
https://www.ptt.cc/bbs/Food/index7001.html post count:  0
https://www.ptt.cc/bbs/Food/index7000.html post count:  0
https://www.ptt.cc/bbs/Food/index6999.html post count:  0
https://www.ptt.cc/bbs/Food/index6998.html post count:  0
https://www.ptt.cc/bbs/Food/index6997.html post count:  0
https://www.ptt.cc/bbs/Food/index6996.html post count:  0
https://www.ptt.cc/bbs/Food/index6995.html post count:  0
https://www.ptt.cc/bbs/Food/index6994.html post count:  0
https://www.ptt.cc/bbs/Food/index6993.html post count:  0
https://www.ptt.cc/bbs/Food/index6992.html post count:  0
https://www.ptt.cc/bbs/Food/index6991.html post count:  0
https://www.ptt.cc/bbs/Food/index6990.html post count:  0
https://www.ptt.cc/bbs/Food/index6989.html post count:  0
https://www.ptt.cc/bbs/Food/index6988.html post count:  0
https://www.ptt.cc/bbs/Food/index6987.html post count:  0
https://www.ptt.cc/bbs/Food/index6986.html post count:  0
https://www.pt

In [6]:
# Build the post table from the crawled records, fixing the column order,
# then display it as the cell output.
post_df = pd.DataFrame(
    data=all_post,
    columns=['title', 'link', 'date'],
)
post_df

Unnamed: 0,title,link,date
0,[食記] 台北 辰壽司割烹~大閘蟹宴,https://www.ptt.cc/bbs/Food/M.1669642084.A.CE7...,2022/11/28
1,[食記] 新北 金大鋤壽喜燒烤鍋物新莊店下午茶,https://www.ptt.cc/bbs/Food/M.1669646872.A.E61...,2022/11/28
2,[食記] 新北林口｜林口三井的點點心,https://www.ptt.cc/bbs/Food/M.1669680859.A.700...,2022/11/29
3,[食記] 台北松山 夯・魯肉飯 原本的五分埔魯肉飯,https://www.ptt.cc/bbs/Food/M.1669706692.A.EFC...,2022/11/29
4,[食記] 桃園。麵屋虎千代,https://www.ptt.cc/bbs/Food/M.1669708111.A.E69...,2022/11/29
...,...,...,...
1001,[食記] 屏東潮州-牛大福。屏東牛肉料理,https://www.ptt.cc/bbs/Food/M.1664507458.A.84A...,2022/09/30
1002,[食記] 個人評比 肉次方VS夯下去,https://www.ptt.cc/bbs/Food/M.1664508970.A.AB0...,2022/09/30
1003,[食記] 桃園龍潭 糧園茶藝客家小館,https://www.ptt.cc/bbs/Food/M.1664512596.A.9FD...,2022/09/30
1004,[食記] 台北信義安和 吉可頌丹麥專賣店,https://www.ptt.cc/bbs/Food/M.1664512757.A.97B...,2022/09/30


In [7]:
def get_ptt_content(URL, timeout=10):
    """Download a PTT article and return its body as a list of text lines.

    The text of ``#main-container`` is cut at the first '※ 發信站:' marker,
    every occurrence of a newline followed by ``--`` is removed, the rest is
    split on newlines, empty lines are discarded and the first remaining
    line is dropped.

    Args:
        URL: Absolute URL of the article page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        List of non-empty text lines. An empty list when the page has no
        ``#main-container`` element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    my_headers = {'cookie': 'over18=1;'}
    response = requests.get(URL, headers=my_headers, timeout=timeout)
    response.raise_for_status()

    # Get content
    soup = BeautifulSoup(response.text, "html.parser")

    main_container = soup.find(id='main-container')
    if main_container is None:
        # No article body on this page; report "no content" rather than
        # crashing a whole DataFrame.apply run with an AttributeError.
        return []

    all_text = main_container.text
    pre_text = all_text.split('※ 發信站:')[0]
    pre_text = pre_text.strip().replace('\n--', '')

    # Split the text into lines on '\n' and drop the empty ones.
    # NOTE(review): the first remaining line is presumably the article
    # header (author/board/title/time); confirm before relying on it.
    texts = [line for line in pre_text.split('\n') if line][1:]
    return texts

In [8]:
# Fetch the article body for every row (one get_ptt_content call per link)
# and store the resulting line lists in a new 'content' column.
post_df['content'] = post_df['link'].apply(func=get_ptt_content)
post_df

Unnamed: 0,title,link,date,content
0,[食記] 台北 辰壽司割烹~大閘蟹宴,https://www.ptt.cc/bbs/Food/M.1669642084.A.CE7...,2022/11/28,"[ 餐廳名稱：辰壽司割烹, 消費時間：2022年/11月, 地址：台北市松山..."
1,[食記] 新北 金大鋤壽喜燒烤鍋物新莊店下午茶,https://www.ptt.cc/bbs/Food/M.1669646872.A.E61...,2022/11/28,"[鍋物最前線--金大鋤壽喜燒烤鍋物新莊店下午茶, 消費日期：2022 年 11 月, 有圖有..."
2,[食記] 新北林口｜林口三井的點點心,https://www.ptt.cc/bbs/Food/M.1669680859.A.700...,2022/11/29,"[ 餐廳名稱：林口三井OUTLET|放鬆心情 來吃港式的點點心, 消費時間：202..."
3,[食記] 台北松山 夯・魯肉飯 原本的五分埔魯肉飯,https://www.ptt.cc/bbs/Food/M.1669706692.A.EFC...,2022/11/29,"[ 餐廳名稱：夯・魯肉飯, 消費時間：2022年/10月, 110台北市信義區松山..."
4,[食記] 桃園。麵屋虎千代,https://www.ptt.cc/bbs/Food/M.1669708111.A.E69...,2022/11/29,"[ 餐廳名稱：麵屋虎千代, 消費時間：2022年/11月, 地址：桃園市桃園..."
...,...,...,...,...
1001,[食記] 屏東潮州-牛大福。屏東牛肉料理,https://www.ptt.cc/bbs/Food/M.1664507458.A.84A...,2022/09/30,"[ 餐廳名稱：屏東。潮州》牛大福。屏東牛肉料理, 消費時間：2022年/8月, ..."
1002,[食記] 個人評比 肉次方VS夯下去,https://www.ptt.cc/bbs/Food/M.1664508970.A.AB0...,2022/09/30,"[王品集團與馬辣集團系列的吃到飽，一直都是, 不雷的保證，以下為個人親身吃過的評比，不, 喜..."
1003,[食記] 桃園龍潭 糧園茶藝客家小館,https://www.ptt.cc/bbs/Food/M.1664512596.A.9FD...,2022/09/30,"[圖文網誌版請點：https://pse.is/4hknuu, 店名：糧園茶藝客家小館, 電..."
1004,[食記] 台北信義安和 吉可頌丹麥專賣店,https://www.ptt.cc/bbs/Food/M.1664512757.A.97B...,2022/09/30,[圖文網誌版請點：https://guessworld.com.tw/mycroissant...


In [9]:
# Select the rows whose 'content' list has no lines at all.
empty_content_rows = post_df.loc[post_df['content'].apply(len).eq(0)]

# Print how many rows matched.
print(empty_content_rows.shape[0])

0


In [10]:
# Save the table to CSV; the DataFrame index is written as a 'Pid' column.
# The 'utf-8-sig' codec prepends a UTF-8 byte-order mark to the file.
post_df.to_csv(
    path_or_buf='./Data/Ptt/ptt_food_post_list.csv',
    index=True,
    index_label='Pid',
    encoding='utf-8-sig',
)