In [6]:
import time
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

# ------------------ Settings ------------------
HEADLESS = True     # headless-browser switch; set False if you want to see the browser window
MAX_TRIPADVISOR_REVIEWS = 120   # cap passed to the TripAdvisor scraper to avoid too-long runs
PAUSE = 1.0         # polite delay (seconds) slept after each seed URL has been processed
OUTPUT_CSV = "bantul_reviews.csv"   # destination file for the raw scraped documents

# ------------------ helpers ------------------
def safe_get(url, headers=None, timeout=10):
    """Fetch *url* over HTTP and return the response body as text.

    This is a best-effort helper: it never raises for network problems, so
    callers can feed the result straight into an HTML parser.

    Args:
        url: Address to request.
        headers: Optional request headers. When omitted (or empty) a minimal
            browser-like ``User-Agent`` header is sent instead.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded response body, or ``""`` when the request fails or the
        server answers with a 4xx/5xx status.
    """
    try:
        r = requests.get(url, headers=headers or {"User-Agent":"Mozilla/5.0"}, timeout=timeout)
        # Treat HTTP error statuses as failures so that an error page is never
        # parsed and stored as if it were article text.
        r.raise_for_status()
        return r.text
    except requests.RequestException as e:
        print("requests error:", e)
        return ""

def split_paragraphs(text):
    """Break *text* into trimmed segments and keep only the long ones.

    The text is cut at runs of newlines, runs of carriage returns, and at a
    period followed by whitespace (the period and whitespace are consumed).
    Segments whose stripped length is 40 characters or fewer are discarded.
    """
    kept = []
    for piece in re.split(r'\n+|\r+|\.\s+', text):
        piece = piece.strip()
        if len(piece) > 40:
            kept.append(piece)
    return kept

# ------------------ Site-specific scrapers (Requests + BS4) ------------------

def scrape_article_paragraphs(url, source_name=None, selector=None):
    """Scrape an article page and return one document dict per text segment.

    The text of the page's ``<p>`` elements is joined and then cut into
    segments by ``split_paragraphs``.

    Args:
        url: Address of the article.
        source_name: Label stored in each document's ``"source"`` field. When
            omitted, the host name of *url* (without a leading ``www.``) is
            used, or ``"unknown"`` if no host can be determined.
        selector: Optional CSS selector for the main content container. Only
            ``<p>`` elements inside the first match are used; if it matches
            nothing, or yields no text, every ``<p>`` on the page is used.

    Returns:
        A list of dicts with the keys ``source``, ``url``, ``review_title``,
        ``review_text``, ``rating`` and ``date`` (the last two always ``None``).
    """
    html = safe_get(url)
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ""

    content = ""
    if selector:
        el = soup.select_one(selector)
        if el:
            content = " ".join(p.get_text(" ", strip=True) for p in el.find_all("p"))
    if not content:
        content = " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))

    # Resolve the source label once. An explicit source_name always wins;
    # otherwise derive the host from the URL, tolerating a missing scheme or
    # an empty/None URL instead of raising.
    if source_name:
        source = source_name
    else:
        host = re.match(r'(?:https?://)?(?:www\.)?([^/?#]+)', url or "", flags=re.IGNORECASE)
        source = host.group(1) if host else "unknown"

    return [
        {
            "source": source,
            "url": url,
            "review_title": title,
            "review_text": para,
            "rating": None,
            "date": None
        }
        for para in split_paragraphs(content)
    ]

# ------------------ Detik-specific (uses class "detail__body-text" often) ------------------
def scrape_detik_article(url):
    """Scrape a Detik article into a list of paragraph-level document dicts.

    The first ``<div>`` whose class matches ``detail__body-text`` or
    ``detail_text`` is treated as the article body, and each of its ``<p>``
    elements longer than 40 characters becomes one document. When no such
    container exists the work is delegated to ``scrape_article_paragraphs``.
    Any exception is reported on stdout and results in an empty list.
    """
    try:
        soup = BeautifulSoup(safe_get(url), "html.parser")
        heading = soup.find("h1")
        title = heading.get_text(strip=True) if heading else ""
        body = soup.find("div", class_=re.compile("detail__body-text|detail_text"))
        if not body:
            # No Detik-specific container found: use the generic <p> scraper.
            return scrape_article_paragraphs(url, source_name="detik.com")
        texts = [node.get_text(" ", strip=True) for node in body.find_all("p")]
        return [
            {
                "source":"detik.com",
                "url":url,
                "review_title": title,
                "review_text": text,
                "rating": None,
                "date": None
            }
            for text in texts
            if len(text) > 40
        ]
    except Exception as e:
        print("detik scrape error:", e)
        return []

# ------------------ IDN Times & Kompas & Alodiatour & Yogyes generic ------------------
def scrape_generic_article(url):
    """Scrape an article from a site without a dedicated scraper.

    The main content container is guessed by looking for a ``<div>`` whose
    class contains one of several common article class names, tried in
    priority order; failing that, ``<main>`` is used, and finally the whole
    document. Each ``<p>`` inside the container that is longer than 40
    characters becomes one document.

    Args:
        url: Address of the article.

    Returns:
        A list of dicts with the keys ``source``, ``url``, ``review_title``,
        ``review_text``, ``rating`` and ``date``. ``source`` is the host name
        of *url* without a leading ``www.``, or ``"unknown"`` if no host can
        be determined.
    """
    html = safe_get(url)
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ""

    # Common container class names, most specific first.
    cand = None
    for cls in ["read__content", "artikel-body", "article-content", "post-content", "entry-content", "td-post-content"]:
        cand = soup.find("div", class_=re.compile(cls))
        if cand:
            break
    if not cand:
        # Fall back to <main>, then to the whole document.
        cand = soup.find("main") or soup

    # Derive the source label once instead of per paragraph, tolerating a
    # missing scheme or an empty/None URL instead of raising IndexError.
    host = re.match(r'(?:https?://)?(?:www\.)?([^/?#]+)', url or "", flags=re.IGNORECASE)
    source = host.group(1) if host else "unknown"

    texts = (p.get_text(" ", strip=True) for p in cand.find_all("p"))
    return [
        {
            "source": source,
            "url": url,
            "review_title": title,
            "review_text": text,
            "rating": None,
            "date": None
        }
        for text in texts
        if len(text) > 40
    ]

# ------------------ TripAdvisor (Selenium) ------------------
def init_selenium(headless=True):
    """Start a Chrome WebDriver configured for scraping and return it.

    Args:
        headless: When true, Chrome is started with ``--headless=new`` (the
            newer headless mode) so no browser window is shown.

    Returns:
        A ``webdriver.Chrome`` instance. The matching chromedriver binary is
        obtained through ``ChromeDriverManager().install()``.
    """
    flags = [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1200,800",
        "--lang=id-ID",
    ]
    if headless:
        # Keep the headless flag first, as in the original argument order.
        flags.insert(0, "--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in flags:
        chrome_options.add_argument(flag)

    # Hand the driver binary to Chrome via an explicit Service object.
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

def scrape_tripadvisor_city_reviews(city_url, max_reviews=100, headless=None):
    """Collect review texts from a TripAdvisor page by scrolling it in Chrome.

    The page is scrolled down in steps of 800 px, at most 40 times. After each
    step every element matching ``div[data-automation='reviewCard']`` is
    inspected and the text of its ``span[class*='QewHA']`` child is recorded.
    Reviews are de-duplicated on their first 100 characters.

    NOTE(review): the ``QewHA`` class fragment looks auto-generated and is
    likely to change when the site is redeployed; confirm it still matches.

    Args:
        city_url: Address of the TripAdvisor page to open.
        max_reviews: Maximum number of reviews to return.
        headless: Whether to run Chrome without a window. ``None`` (default)
            defers to the module-level ``HEADLESS`` setting.

    Returns:
        A list of dicts with the keys ``source``, ``url``, ``review_title``,
        ``review_text``, ``rating`` and ``date``.
    """
    # Imported locally so the block stays self-contained; selenium is already
    # a dependency of this file.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    if headless is None:
        headless = HEADLESS

    collected = []
    seen = set()

    # Reuse the shared driver factory instead of duplicating the Chrome setup.
    driver = init_selenium(headless=headless)
    try:
        driver.get(city_url)
        time.sleep(3)  # give the initial page time to render

        # scroll & collect
        for _ in range(40):
            cards = driver.find_elements(By.CSS_SELECTOR, "div[data-automation='reviewCard']")
            for card in cards:
                if len(collected) >= max_reviews:
                    break
                try:
                    text = card.find_element(By.CSS_SELECTOR, "span[class*='QewHA']").text.strip()
                except (NoSuchElementException, StaleElementReferenceException):
                    # Card without review text, or re-rendered while being
                    # read: skip it and pick it up on a later pass.
                    continue
                key = text[:100]
                if not text or key in seen:
                    continue
                seen.add(key)
                collected.append({
                    "source": "tripadvisor",
                    "url": driver.current_url,
                    "review_title": "",
                    "review_text": text,
                    "rating": None,
                    "date": None
                })

            # Stop before scrolling/sleeping again once the cap is reached.
            if len(collected) >= max_reviews:
                break
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(2)
    finally:
        # Always shut the browser down, including on errors and Ctrl+C, so no
        # orphaned Chrome/chromedriver processes are left behind.
        driver.quit()

    print(f"✅ Ditemukan {len(collected)} ulasan TripAdvisor")
    return collected


# ------------------ Main orchestration ------------------
def collect_all(seed_urls):
    """Run the matching scraper for every seed and pool the results.

    seed_urls: list of dicts: {"url":..., "type": "detik|tripadvisor|generic"}
    A missing or unrecognised "type" is handled by the generic scraper.
    Progress is printed per seed, and PAUSE seconds are slept after each one.
    Returns a single flat list with the documents of all seeds.
    """
    def _scrape(url, typ):
        # Select the scraper for this seed type; generic is the fallback.
        if typ == "detik":
            return scrape_detik_article(url)
        if typ == "tripadvisor":
            return scrape_tripadvisor_city_reviews(url, max_reviews=MAX_TRIPADVISOR_REVIEWS)
        return scrape_generic_article(url)

    all_docs = []
    for seed in seed_urls:
        url, typ = seed["url"], seed.get("type","generic")
        print("Scraping:", url, "as", typ)
        found = _scrape(url, typ)
        print(" -> got", len(found))
        all_docs += found
        time.sleep(PAUSE)
    return all_docs

# ------------------ Example seeds (add more) ------------------
# Each seed is a dict with a "url" and an optional "type". collect_all uses
# "type" to pick the scraper: "detik", "tripadvisor", or the generic scraper
# for anything else.
seed_urls = [
    {"url":"https://www.tripadvisor.co.id/Tourism-g2304084-Bantul_Yogyakarta_Region_Java-Vacations.html", "type":"tripadvisor"}
]

# Run collection over all seeds and report how many documents came back.
docs = collect_all(seed_urls)
print("TOTAL DOCUMENTS:", len(docs))

# If low amount, we will break article paragraphs further to increase doc count
# NOTE(review): no such extra splitting step is implemented in this section.
# Save the raw, unprocessed documents. The "utf-8-sig" codec writes a BOM at
# the start of the file.
df = pd.DataFrame(docs)
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print("Saved raw to", OUTPUT_CSV)

# ------------------ Preprocessing ------------------
# Sastrawi stemmer, created once and shared by every preprocess_text call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Minimal stopword set: common function words plus place names that would
# otherwise dominate the term statistics. Extend with domain-specific words
# as needed.
DEFAULT_STOPWORDS = {
    "yang", "di", "ke", "dari", "dan", "dengan", "untuk", "sebagai", "atau",
    "pada", "ini", "itu", "sangat", "ada",
    "kawasan", "kota", "kabupaten", "bantul", "yogyakarta", "yogya", "jogja",
    "yg",
}

def preprocess_text(text, extra_stopwords=None):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: lower-case; blank out URLs and e-mail addresses; blank out every
    character other than a-z, 0-9 and whitespace; collapse whitespace; drop
    one-character tokens and stopwords; stem each remaining token.

    Args:
        text: Raw review text; ``None`` or empty is treated as "".
        extra_stopwords: Optional iterable of additional words to remove on
            top of ``DEFAULT_STOPWORDS``.

    Returns:
        The cleaned text, possibly an empty string.
    """
    cleaned = (text or "").lower()
    cleaned = re.sub(r'https?://\S+|www\.\S+|\S+@\S+', ' ', cleaned)
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    blocked = set(DEFAULT_STOPWORDS)
    if extra_stopwords:
        blocked.update(extra_stopwords)

    # Stopwords are matched against the unstemmed token, then the survivors
    # are stemmed.
    stems = [
        stemmer.stem(word)
        for word in cleaned.split()
        if len(word) > 1 and word not in blocked
    ]
    return " ".join(stems)

# create preprocessed column
# create preprocessed column
# A run that scraped nothing yields pd.DataFrame([]), which has no columns at
# all, so check before touching 'review_text' instead of failing with KeyError.
has_docs = (not df.empty) and 'review_text' in df.columns
if has_docs:
    df['clean'] = df['review_text'].fillna("").apply(preprocess_text)
    # remove rows that are (nearly) empty after cleaning
    df = df[df['clean'].str.strip().str.len() > 3].reset_index(drop=True)
    print("After cleaning:", len(df), "documents")
    df.to_csv("bantul_reviews_clean.csv", index=False, encoding="utf-8-sig")
    # Cleaning may have removed every row.
    has_docs = not df.empty

if not has_docs:
    # TfidfVectorizer and WordCloud both raise on an empty corpus, so stop
    # here with a clear message rather than an obscure traceback.
    print("No usable documents; skipping TF-IDF, word cloud and charts.")
else:
    # ------------------ TF-IDF & Top terms ------------------
    tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
    X = tfidf.fit_transform(df['clean'])
    terms = tfidf.get_feature_names_out()
    # Rank terms by their mean TF-IDF weight across all documents.
    means = np.asarray(X.mean(axis=0)).ravel()
    top_n = 15
    top_idx = means.argsort()[::-1][:top_n]
    top_terms = terms[top_idx]
    top_scores = means[top_idx]
    top_df = pd.DataFrame({"term":top_terms, "score":top_scores})
    print("Top TF-IDF terms:\n", top_df)

    # save tfidf top
    top_df.to_csv("bantul_tfidf_top_terms.csv", index=False, encoding="utf-8-sig")

    # ------------------ WordCloud & bar chart ------------------
    all_text = " ".join(df['clean'].tolist())
    wc = WordCloud(width=1000, height=400, collocations=False, background_color="white").generate(all_text)
    plt.figure(figsize=(12,5))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("WordCloud - Opini Wisata Bantul")
    plt.show()

    # bar chart (reversed so the highest-scoring term is drawn at the top)
    plt.figure(figsize=(8,5))
    plt.barh(top_df['term'][::-1], top_df['score'][::-1])
    plt.xlabel("Mean TF-IDF")
    plt.title(f"Top TF-IDF terms - Bantul (Top {top_n})")
    plt.tight_layout()
    plt.show()

    # Save final dataframe and results
    df.to_csv("bantul_reviews_final.csv", index=False, encoding="utf-8-sig")
    print("Saved cleaned and final results.")


Scraping: https://travel.detik.com/domestic-destination/d-8108546/rekomendasi-5-desa-wisata-terbaik-di-bantul as detik
 -> got 25
Scraping: https://www.detik.com/jogja/plesir/d-7089360/10-tempat-wisata-hits-di-bantul-selain-pantai-dari-alam-hingga-religi as detik
 -> got 27
Scraping: https://jogja.idntimes.com/travel/destination/wisata-di-bantul-yang-lagi-hits-c1c2-01-llcyy-6nm29s as generic
 -> got 18
Scraping: https://travel.kompas.com/read/2022/05/19/192720927/15-wisata-bantul-yogyakarta-dengan-pemandangan-alam-instagramable?page=all as generic
 -> got 101
Scraping: https://alodiatour.com/blog/wisata-anak-bantul/ as generic
 -> got 42


KeyboardInterrupt: 