In [3]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urlparse

# ============================
# PATH FILE
# ============================
BASE_DIR = Path.cwd()
URL_FILE = BASE_DIR / "data" / "urls.txt"
OUTPUT_FILE = BASE_DIR / "data" / "scraped.csv"

OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

# ============================
# PARAMETER SCRAPING
# ============================
REQUEST_TIMEOUT = 15  # detik
SLEEP_BETWEEN_REQUESTS = 0.2  # jeda antar request biar sopan
MIN_WORDS = 40  # minimal jumlah kata supaya artikel dianggap valid

# User-Agent biar nggak keliatan terlalu "robot"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ============================
# BACA URL DARI urls.txt
# ============================
def load_urls(path: Path) -> list[str]:
    if not path.exists():
        raise FileNotFoundError(f"File URL tidak ditemukan: {path}")
    
    with path.open(encoding="utf-8") as f:
        raw = [line.strip() for line in f if line.strip()]
    
    # Hapus duplikat tapi jaga urutan
    seen = set()
    urls = []
    for u in raw:
        if u not in seen:
            seen.add(u)
            urls.append(u)
    
    print(f"[INFO] Total URL di {path.name}: {len(urls)}")
    return urls

# ============================
# FILTER: CUMA AMBIL URL ARTIKEL
# ============================
def is_article_url(url: str) -> bool:
    """
    Filter URL supaya:
    - Nggak ambil /copy/, /komentar/, /image/, /search/ (bukan artikel utama)
    - Nggak ambil homepage / halaman root
    """
    parsed = urlparse(url)
    path = parsed.path or ""

    # Buang path yang jelas bukan artikel
    blocked_fragments = [
        "/copy/",
        "/komentar/",
        "/image/",
        "/search/",
    ]
    if any(b in path for b in blocked_fragments):
        return False

    # Buang homepage dan path kosong
    if path in ["", "/"]:
        return False

    return True

# ============================
# EXTRACT TEKS DARI HTML
# ============================
def extract_article(url: str, html: str) -> dict:
    """
    Ambil title + content (gabungan <p>) dari halaman.
    Untuk detikTravel & KompasTravel, ambil semua <p> sudah cukup bagus.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Title
    title = None

    # Coba dari <meta property="og:title">
    og_title = soup.find("meta", attrs={"property": "og:title"})
    if og_title and og_title.get("content"):
        title = og_title["content"].strip()

    # Kalau belum ketemu, pakai <title>
    if not title:
        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.get_text(strip=True)

    if not title:
        title = "No Title"

    # Kumpulin semua <p>
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
    paragraphs = [p for p in paragraphs if p]
    content = "\n".join(paragraphs)

    # Hitung jumlah kata, buat ngecek minimal
    word_count = len(content.split())

    return {
        "url": url,
        "domain": urlparse(url).netloc,
        "title": title,
        "content": content,
        "word_count": word_count,
        "timestamp": time.time(),
    }

# ============================
# HTTP REQUEST
# ============================
def fetch(url: str) -> str | None:
    try:
        res = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        if res.status_code == 200:
            return res.text
        else:
            print(f"[STATUS {res.status_code}] {url}")
            return None
    except Exception as e:
        print(f"[ERROR] {url} -> {e}")
        return None

# ============================
# SCRAPE SATU URL
# ============================
def scrape_one(url: str) -> dict | None:
    print(f"[SCRAPE] {url}")
    html = fetch(url)
    if not html:
        return None

    article = extract_article(url, html)

    # Skip jika konten terlalu pendek
    if not article["content"] or article["word_count"] < MIN_WORDS:
        print(f"[SKIP - KONTEN PENDEK] {url}")
        return None

    print(f"[OK] {url} (≈ {article['word_count']} kata)")
    return article

# ============================
# MAIN: LOOP SCRAPING
# ============================
def scrape_all():
    urls = load_urls(URL_FILE)

    # Filter URL supaya cuma artikel utama
    article_urls = [u for u in urls if is_article_url(u)]
    print(f"[INFO] URL yang lolos filter pola artikel: {len(article_urls)}")

    results = []
    start_time = time.time()

    for idx, url in enumerate(article_urls, start=1):
        print(f"\n[{idx}/{len(article_urls)}]")
        data = scrape_one(url)
        if data:
            results.append(data)

        # Jeda kecil antar request
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    elapsed = time.time() - start_time
    print(f"\n[INFO] Scraping selesai dalam {elapsed:.2f} detik.")
    print(f"[INFO] Total artikel valid: {len(results)}")

    return results

# ============================
# EKSEKUSI UTAMA (BISA DI-RUN DI JUPYTER)
# ============================
if __name__ == "__main__":
    print(f"[INFO] Working Directory : {BASE_DIR}")
    print(f"[INFO] URL source       : {URL_FILE}")
    print(f"[INFO] Output CSV       : {OUTPUT_FILE}\n")

    results = scrape_all()

    if results:
        df = pd.DataFrame(results)
        df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
        print(f"\n[SUCCESS] Scraping selesai! File disimpan ke: {OUTPUT_FILE}")
        print(f"[INFO] Total data berhasil di-scrape: {len(results)}")

        # Preview
        print("\n[PREVIEW] 5 data pertama:")
        display(df.head())
    else:
        print("\n[WARNING] Tidak ada data yang berhasil di-scrape!")


[INFO] Working Directory : d:\KULYEAHUSK\Semester 5\Pi\mk\project\sipapa-sistem-pencarian-destinasi-pariwisata
[INFO] URL source       : d:\KULYEAHUSK\Semester 5\Pi\mk\project\sipapa-sistem-pencarian-destinasi-pariwisata\data\urls.txt
[INFO] Output CSV       : d:\KULYEAHUSK\Semester 5\Pi\mk\project\sipapa-sistem-pencarian-destinasi-pariwisata\data\scraped.csv

[INFO] Total URL di urls.txt: 10000
[INFO] URL yang lolos filter pola artikel: 4906

[1/4906]
[SCRAPE] https://travel.detik.com/domestic-destination
[SKIP - KONTEN PENDEK] https://travel.detik.com/domestic-destination

[2/4906]
[SCRAPE] https://travel.detik.com/international-destination
[SKIP - KONTEN PENDEK] https://travel.detik.com/international-destination

[3/4906]
[SCRAPE] https://travel.detik.com/travel-news
[SKIP - KONTEN PENDEK] https://travel.detik.com/travel-news

[4/4906]
[SCRAPE] https://travel.detik.com/travel-tips
[SKIP - KONTEN PENDEK] https://travel.detik.com/travel-tips

[5/4906]
[SCRAPE] https://travel.detik.com

Unnamed: 0,url,domain,title,content,word_count,timestamp
0,https://travel.kompas.com/read/2025/11/11/1900...,travel.kompas.com,Unik! Kamar Hotel Bertema Kereta Api di Jepang...,Unik! Kamar Hotel Bertema Kereta Api di Jepang...,561,1763449000.0
1,https://travel.kompas.com/read/2025/11/11/1925...,travel.kompas.com,"4 Rekomendasi Wisata di Banyuwangi, Cocok untu...","4 Rekomendasi Wisata di Banyuwangi, Cocok untu...",331,1763449000.0
2,https://travel.kompas.com/read/2025/05/03/1333...,travel.kompas.com,"Itinerary Seharian di Bromo Jawa Timur, dari S...","Itinerary Seharian di Bromo Jawa Timur, dari S...",515,1763449000.0
3,https://travel.kompas.com/read/2025/11/10/1851...,travel.kompas.com,"Promo Hari Pahlawan 2025, TMII Beri Diskon 30 ...","Promo Hari Pahlawan 2025, TMII Beri Diskon 30 ...",448,1763449000.0
4,https://travel.kompas.com/read/2025/11/11/1806...,travel.kompas.com,Gratis Tiket Wisata Trenggalek bagi Penumpang ...,Gratis Tiket Wisata Trenggalek bagi Penumpang ...,359,1763449000.0
