In [2]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

# ============================
# FILE PATHS (Jupyter-compatible)
# ============================
# Everything is anchored at the current working directory so the same code
# works when run from a notebook.
BASE_DIR = Path.cwd()
URL_FILE = BASE_DIR.joinpath("data", "urls.txt")
OUTPUT_FILE = BASE_DIR.joinpath("data", "scraped.csv")

# Create the data folder up front so writing the CSV later cannot fail on a
# missing directory.
OUTPUT_FILE.parent.mkdir(exist_ok=True, parents=True)

# ============================
# USER AGENT (so requests are not blocked)
# ============================
# Request headers sent with every page fetch; the User-Agent mimics a desktop
# Chrome browser on Windows.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0 Safari/537.36",
        )
    ),
}


# ============================
# SCRAPE A SINGLE PAGE
# ============================
def scrape_page(url: str):
    """Fetch one tourist-destination page and extract its key fields.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``url``, ``title``, ``description``,
        ``location`` and ``images``. A field is ``None`` when no matching
        element was found. ``images`` is one string holding every ``<img>``
        ``src`` that starts with ``http``, joined by ``", "``.
        Returns ``None`` (after printing an error line) when the request
        fails or the server answers with an HTTP error status.
    """
    print(f"Scraping: {url}")

    result = {
        "url": url,
        "title": None,
        "description": None,
        "location": None,
        "images": None,
    }

    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary of a long
        # batch run, so one bad URL must never abort the whole crawl.
        print(f"[ERROR] Gagal mengakses {url}: {e}")
        return None

    # requests falls back to ISO-8859-1 for text/* responses whose
    # Content-Type header carries no charset, which garbles UTF-8 pages when
    # response.text is used. Give BeautifulSoup the raw bytes instead so it
    # can honour the document's own <meta charset>; a charset declared in the
    # HTTP header is still passed along as the preferred encoding.
    content_type = response.headers.get("Content-Type", "")
    header_charset = response.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(
        response.content, "html.parser", from_encoding=header_charset
    )

    # ============================
    #  DATA SELECTORS (adjust to the target website's markup)
    # ============================

    # Destination title: first <h1>, otherwise first <h2>.
    title = soup.find("h1") or soup.find("h2")
    if title:
        result["title"] = title.get_text(strip=True)

    # Description: dedicated container if present, otherwise the first <p>.
    desc = soup.find("div", class_="description") or soup.find("p")
    if desc:
        result["description"] = desc.get_text(strip=True)

    # Location: try each known class/tag combination in order.
    loc = (
        soup.find("div", class_="location")
        or soup.find("span", class_="location")
        or soup.find("span", class_="lokasi")
    )
    if loc:
        result["location"] = loc.get_text(strip=True)

    # Collect every image whose src is an absolute http(s) URL; relative
    # paths and data: URIs are skipped.
    images = [
        src
        for src in (img.get("src") for img in soup.find_all("img"))
        if src and src.startswith("http")
    ]
    if images:
        result["images"] = ", ".join(images)

    return result


# ============================
# SCRAPE EVERY URL
# ============================
def scrape_all():
    """Scrape every URL listed in ``URL_FILE`` (one URL per line).

    Lines are stripped of surrounding whitespace and blank lines are ignored.
    Each URL is passed to ``scrape_page``; falsy results are dropped. The
    function pauses 0.7 seconds after every URL.

    Returns:
        A list of the truthy ``scrape_page`` results in file order, or an
        empty list when ``URL_FILE`` does not exist.
    """
    print(f"[INFO] Membaca {URL_FILE} ...")

    # Nothing to do without the URL list; tell the user where it belongs.
    if not URL_FILE.exists():
        print(f"[ERROR] File {URL_FILE} tidak ditemukan!")
        print(f"[INFO] Buat file urls.txt di folder: {URL_FILE.parent}")
        return []

    targets = []
    with open(URL_FILE, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                targets.append(cleaned)

    total = len(targets)
    print(f"[INFO] Total URL ditemukan: {total}")

    collected = []
    for index, target in enumerate(targets):
        print(f"[{index + 1}/{total}]")
        page = scrape_page(target)
        if page:
            collected.append(page)

        # Throttle between requests to stay polite to the server.
        time.sleep(0.7)

    return collected


# ============================
# MAIN PROGRAM
# ============================
# Entry point: scrape every listed URL, write the rows to OUTPUT_FILE as CSV
# and print a short preview, or warn when nothing was collected.
if __name__ == "__main__":
    print("[INFO] Mulai scraping...")
    print(f"[INFO] Working Directory: {BASE_DIR}")

    results = scrape_all()

    if not results:
        print("\n[WARNING] Tidak ada data yang berhasil di-scrape!")
    else:
        df = pd.DataFrame(results)
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(OUTPUT_FILE, encoding="utf-8-sig", index=False)
        print(f"\n[SUCCESS] Scraping selesai! File disimpan ke: {OUTPUT_FILE}")
        print(f"[INFO] Total data berhasil di-scrape: {len(results)}")

        # Preview the first rows of the scraped data.
        print("\n[PREVIEW] 5 data pertama:")
        print(df.head(5))

[INFO] Mulai scraping...
[INFO] Working Directory: d:\KULYEAHUSK\Semester 5\Pi\mk\project\sipapa-sistem-pencarian-destinasi-pariwisata
[INFO] Membaca d:\KULYEAHUSK\Semester 5\Pi\mk\project\sipapa-sistem-pencarian-destinasi-pariwisata\data\urls.txt ...
[INFO] Total URL ditemukan: 10000
[1/10000]
Scraping: https://travel.detik.com/
[2/10000]
Scraping: https://travel.detik.com/domestic-destination
[3/10000]
Scraping: https://travel.detik.com/international-destination
[4/10000]
Scraping: https://travel.detik.com/travel-news
[5/10000]
Scraping: https://travel.detik.com/travel-tips
[6/10000]
Scraping: https://travel.detik.com/travel-deals
[ERROR] Gagal mengakses https://travel.detik.com/travel-deals: 404 Client Error: Not Found for url: https://travel.detik.com/travel-deals
[7/10000]
Scraping: https://travel.detik.com/travel-food
[ERROR] Gagal mengakses https://travel.detik.com/travel-food: 404 Client Error: Not Found for url: https://travel.detik.com/travel-food
[8/10000]
Scraping: https://