In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import time
import random
import os
from tqdm import tqdm  # <--- Import ตัวนี้มาใช้
import re

In [2]:
# ================= Configuration =================
START_ID = 693036
TMP_END_ID = 2901545 # as of 22 december 2025
NUMBER_OF_ARTICLES = TMP_END_ID - START_ID
HTML_STORAGE_DIR = 'crawled_data/raw_html'      # เก็บ HTML
PARQUET_STORAGE_DIR = 'crawled_data/metadata'   # เก็บ Parquet ทีละไฟล์

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# =================================================

In [3]:
def setup_directories():
    """สร้างโฟลเดอร์ถ้ายังไม่มี"""
    os.makedirs(HTML_STORAGE_DIR, exist_ok=True)
    os.makedirs(PARQUET_STORAGE_DIR, exist_ok=True)

def save_raw_html(news_id, content_bytes):
    """บันทึก HTML ลงไฟล์"""
    filename = f"{news_id}.html"
    file_path = os.path.join(HTML_STORAGE_DIR, filename)
    with open(file_path, "wb") as f:
        f.write(content_bytes)
    return file_path

def save_metadata_parquet(data_dict):
    """
    บันทึก Metadata ของข่าว 1 ข่าว เป็นไฟล์ Parquet แยกต่างหาก
    """
    news_id = data_dict['id']
    file_path = os.path.join(PARQUET_STORAGE_DIR, f"{news_id}.parquet")
    
    # แปลง Dict เดียวให้เป็น DataFrame (1 row)
    df = pd.DataFrame([data_dict])
    
    # บันทึกเป็น Parquet
    # ใช้ compression='snappy' (default) เพื่อประหยัดเนื้อที่และเร็ว
    df.to_parquet(file_path, index=True, engine='pyarrow')
    
    return file_path

def get_news_content(news_id):
    url = f"https://www.thairath.co.th/news/{news_id}"
    
    try:
        # ตรวจสอบก่อนว่าเคยโหลดไปแล้วหรือยัง (Optional: เพื่อ Resume งานได้)
        parquet_path = os.path.join(PARQUET_STORAGE_DIR, f"{news_id}.parquet")
        if os.path.exists(parquet_path):
            # print(f"[*] ID {news_id}: Already exists, skipping.")
            # tqdm.write(f"[*] ID {news_id}: Skipped (Exists)")
            return "Skipped"

        response = requests.get(url, headers=HEADERS, timeout=10, allow_redirects=True)
        
        if response.status_code == 404:
            # tqdm.write(f"[-] ID {news_id}: Not Found")
            return None
        
        
        if response.status_code != 200:
            # tqdm.write(f"[-] ID {news_id}: Status {response.status_code}") # Show only errors
            return None

        # 1. Save HTML (Raw Data)
        local_html_path = save_raw_html(news_id, response.content)

        # 2. Parse Data (Metadata)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        title = soup.find('h1')
        title_text = title.text.strip() if title else "No Title"

        date_tag = soup.find(class_=re.compile(r'.*__item_article-date$'))
        date_text = date_tag.text.strip() if date_tag else "No Date"

        # --- Article Content (itemprop="articleBody") ---
        # หา container ใหญ่ที่มี attribute itemprop="articleBody"
        article_container = soup.find(attrs={"itemprop": "articleBody"})

        content_text = ""
        if article_container:
            # หา p ทั้งหมดที่อยู่ใน container นี้ (ไม่ว่าจะซ้อน div กี่ชั้นก็ตาม)
            paragraphs = article_container.find_all('p')
            
            # กรองเฉพาะ p ที่มีข้อความ (เผื่อเจอ p ว่างๆ)
            content_list = [p.text.strip() for p in paragraphs if p.text.strip()]
            content_text = ' '.join(content_list)
        else:
            content_text = "Content Not Found"
        
        # Data Dictionary
        data = {
            'id': news_id,
            'web_url': response.url,
            'source_url': local_html_path, # Path ไปยังไฟล์ HTML
            'title': title_text,
            'date': date_text,
            'crawled_timestamp': pd.Timestamp.now() # เก็บเวลาที่ Crawl ด้วย
        }

        # 3. Save Parquet immediately (Metadata) - บันทึกทันทีใน Step นี้
        save_metadata_parquet(data)

        # tqdm.write(f"[+] ID {news_id}: Saved")
        return data

    except Exception as e:
        tqdm.write(f"[!] ID {news_id}: Error - {e}")
        return None

In [None]:
def main():
    setup_directories()
    
    ids_to_crawl = list(range(START_ID, START_ID + NUMBER_OF_ARTICLES, 1))
    total_tasks = len(ids_to_crawl)
    
    print(f"Starting crawl. Data will be saved to '{PARQUET_STORAGE_DIR}/' step-by-step.")
    print(f"Storage: {PARQUET_STORAGE_DIR}")

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # ยิง Task เข้าไป
        future_to_id = {executor.submit(get_news_content, news_id): news_id for news_id in ids_to_crawl}
        
        for future in tqdm(concurrent.futures.as_completed(future_to_id), total=total_tasks, unit="news"):
            # แค่รอให้เสร็จ เพื่อคุม Flow หรือจับ Error แต่ไม่ต้องรอรวม List แล้ว เพราะ Save แยกไฟล์ไปแล้ว
            try:
                future.result()
            except Exception as exc:
                print(f"Generated an exception: {exc}")
            
            time.sleep(random.uniform(0.5, 1.5))

    print("\nCrawl Finished.")
    print("To merge all parquet files into one DataFrame, run the code snippet below.")


main()

Starting crawl. Data will be saved to 'crawled_data/metadata/' step-by-step.
Storage: crawled_data/metadata


  0%|          | 2714/2208509 [45:37<618:43:22,  1.01s/news]

[!] ID 1084830: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|          | 3009/2208509 [50:29<593:42:22,  1.03news/s]

[!] ID 1123399: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[!] ID 1123398: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|          | 6685/2208509 [1:51:55<560:57:26,  1.09news/s]

[!] ID 1705431: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[!] ID 1705432: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|          | 6842/2208509 [1:54:31<533:44:31,  1.15news/s]

[!] ID 1729861: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|          | 7218/2208509 [2:00:53<676:26:55,  1.11s/news]

[!] ID 1789438: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[!] ID 1789439: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[!] ID 1789443: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|          | 8087/2208509 [2:15:16<590:11:06,  1.04news/s]

[!] ID 1915970: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|          | 8499/2208509 [2:22:12<523:24:22,  1.17news/s]

[!] ID 1932940: Error - Response ended prematurely


  0%|          | 8545/2208509 [2:22:56<591:46:32,  1.03news/s]

[!] ID 1934277: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  0%|          | 8550/2208509 [2:23:01<674:20:36,  1.10s/news]

[!] ID 1934367: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|          | 12606/2208509 [3:30:12<518:46:54,  1.18news/s]

[!] ID 2536894: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|          | 13543/2208509 [3:45:59<662:52:33,  1.09s/news]

[!] ID 2670692: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|          | 14870/2208509 [4:07:45<566:56:54,  1.07news/s]

[!] ID 2852545: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  1%|          | 15041/2208509 [4:10:39<614:11:19,  1.01s/news]

[!] ID 2868856: Error - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


  2%|▏         | 48118/2208509 [13:21:50<680:34:09,  1.13s/news]