In [3]:
import csv
import os
import random
import time
from datetime import date, timedelta

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
# Folder used to build the per-article file paths and to hold the metadata TSV.
# Adjust if needed.
BASE_DIR = r"C:\Users\WINDOWS11\Desktop\kpop_agenda\Step1\Articles"

In [None]:
def fetch_html(url, timeout=30):
    """Fetch ``url`` with a polite delay and return the parsed HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend,
        or ``None`` if the request failed (network error, timeout, or an
        HTTP 4xx/5xx status). Failures are printed, not raised.
    """
    # Sleep 1-3 seconds. The `time` module has no random(); the jitter has to
    # come from the `random` module.
    time.sleep(random.uniform(1, 3))
    headers = {"User-Agent": "Mozilla/5.0"}  # Mimic a browser
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise exception for status codes 400, 500
        return BeautifulSoup(response.content, "html.parser")
    except requests.exceptions.RequestException as e:
        # Timeouts are a RequestException subclass, so they land here as well.
        print(f"Error fetching {url}: {e}")
        return None

In [6]:
def get_articles_for_day(ymd, max_articles=10):
    """Scrape the ranking page for one day and return its article metadata.

    The page is rendered in a Chrome instance driven by Selenium (the list is
    loaded dynamically), then the rendered HTML is parsed with BeautifulSoup.

    Args:
        ymd: Date string inserted into the ``rankingDate`` query parameter
            (the caller in this file passes ``YYYYMMDD``).
        max_articles: Maximum number of list items to return.

    Returns:
        A list of rows ``[0, rank, views, company, title, link, file_path]``.
        The leading ``0`` is a placeholder ID for the caller to overwrite.
        ``rank`` and ``views`` fall back to ``0`` and the string fields to
        ``""`` / ``"unknown"`` when the corresponding element is missing or
        unparsable. An empty list is returned if no article item appears
        within 10 seconds.

    Raises:
        selenium.common.exceptions.WebDriverException: For browser failures
            other than the wait timing out. The browser is always closed
            before the exception propagates.
    """
    ranking_url = f"https://m.entertain.naver.com/ranking?rankingDate={ymd}"
    item_selector = "li.NewsItem_news_item__fhEmd"

    # Use Selenium for dynamic content loading
    driver = webdriver.Chrome()  # Make sure ChromeDriver is in your PATH
    try:
        driver.get(ranking_url)
        try:
            # Wait for articles to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, item_selector))
            )
        except TimeoutException:
            print("Timed out waiting for element")
            return []  # Return empty list if scraping fails
        page_source = driver.page_source  # Get rendered HTML
    finally:
        # Runs on every path (success, timeout, unexpected error) so a Chrome
        # process is never left behind.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    article_nodes = soup.select(item_selector)

    articles = []
    for i, node in enumerate(article_nodes[:max_articles]):
        rank_tag = node.select_one("div.NewsItem_rank_area__wqmaA span.blind")
        try:
            rank = int(rank_tag.text) if rank_tag else 0
        except ValueError:
            # Non-numeric rank text should not abort the whole day.
            rank = 0

        a_tag = node.select_one("a")
        # .get() tolerates an anchor without an href attribute.
        link = a_tag.get("href", "") if a_tag else ""

        title_tag = node.select_one("em.NewsItem_title__BXkJ6")
        title = title_tag.text if title_tag else ""

        views_tag = node.select_one("span.NewsItem_view__Tep-c")
        views_str = views_tag.text.replace("조회수", "").replace(",", "").strip() if views_tag else ""
        views = int(views_str) if views_str.isdigit() else 0

        # Second-to-last "/"-separated segment of the link; guard against
        # links that contain no "/" at all.
        link_parts = link.split("/")
        company = link_parts[-2] if len(link_parts) >= 2 else "unknown"

        file_path = os.path.join(BASE_DIR, f"{ymd}_article_{i+1}.txt")
        articles.append([0, rank, views, company, title, link, file_path])

    return articles


In [None]:
def main(start_date=date(2024, 1, 1), end_date=date(2024, 12, 31)):
    """Scrape every day in a date range and write the metadata to a TSV file.

    Calls ``get_articles_for_day`` once per calendar day, assigns each
    returned row a sequential ID starting at 1, and writes all rows to
    ``articles_metadata.tsv`` inside ``BASE_DIR``.

    Args:
        start_date: First day to scrape (inclusive). Defaults to 2024-01-01.
        end_date: Last day to scrape (inclusive). Defaults to 2024-12-31.

    A failure on one day is printed and that day is skipped; the loop
    continues with the next date.
    """
    # Create the output folder before the long scrape: a missing directory
    # would otherwise only surface at open() time, after every day has been
    # fetched, and the collected rows would be lost.
    os.makedirs(BASE_DIR, exist_ok=True)
    tsv_path = os.path.join(BASE_DIR, "articles_metadata.tsv")

    all_articles = []
    article_id = 1

    current_date = start_date
    while current_date <= end_date:
        ymd = current_date.strftime("%Y%m%d")
        try:
            daily_articles = get_articles_for_day(ymd)
            for article in daily_articles:
                article[0] = article_id  # Set ID
                all_articles.append(article)
                article_id += 1
        except Exception as e:
            print(f"Error on date {ymd}: {e}")

        current_date += timedelta(days=1)

    # Write to TSV
    with open(tsv_path, "w", newline="", encoding="utf-8") as tsvfile:
        writer = csv.writer(tsvfile, delimiter="\t")
        writer.writerow(["ID", "rank", "views", "company", "title", "link", "file_path"])  # Header row
        writer.writerows(all_articles)

    print("Done! Saved TSV to:", tsv_path)


# Entry-point guard: start the scrape only when this code runs as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()