In [6]:
# =============================================
# Analisis Opini Publik tentang Wisata Bantul (Twitter)
# =============================================

import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tweepy
from wordcloud import WordCloud

import snscrape.modules.twitter as sntwitter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

# ------------------ Settings ------------------
MAX_TWEETS = 1        # batas tweet
QUERY = "(wisata bantul OR parangtritis OR hutan pinus OR mangunan OR parangkusumo OR pantai bantul) lang:id"
OUTPUT_RAW = "bantul_tweets_raw.csv"

# ------------------ Scraping dari Twitter ------------------
def scrape_twitter_api(query, max_tweets=1):
    client = tweepy.Client(bearer_token="AAAAAAAAAAAAAAAAAAAAAG205QEAAAAALf7jXB7mo1d5uYkdwRElM2sk%2Bpo%3DXjpv9ACBwRCsw0UQhJfcelYwc0GfjIouYewYu0n0bdUC8EAMvf")
    tweets = []
    for tweet in tweepy.Paginator(
        client.search_recent_tweets,
        query=query,
        max_results=1,
        tweet_fields=["created_at", "lang"],
    ).flatten(limit=max_tweets):
        time.sleep(3)
        if tweet.lang == "in":
            tweets.append({
                "source": "twitter",
                "url": f"https://twitter.com/i/web/status/{tweet.id}",
                "review_title": "",
                "review_text": tweet.text,
                "rating": None,
                "date": tweet.created_at
            })
    print(f"Total tweet diambil: {len(tweets)}")
    return tweets

# ------------------ Main: Collect data ------------------
docs = scrape_twitter_api(QUERY, MAX_TWEETS)
df = pd.DataFrame(docs)
df.to_csv(OUTPUT_RAW, index=False, encoding="utf-8-sig")
print("Data mentah disimpan ke:", OUTPUT_RAW)

# ------------------ Preprocessing ------------------
factory = StemmerFactory()
stemmer = factory.create_stemmer()

DEFAULT_STOPWORDS = set([
    "yang","di","ke","dari","dan","dengan","untuk","sebagai","atau","pada","ini",
    "itu","sangat","ada","kawasan","kota","kabupaten","bantul","yogyakarta",
    "yogya","jogja","yg","rt","via","aja","nih","ya","loh","dong","deh"
])

def preprocess_text(text, extra_stopwords=None):
    text = (text or "").lower()
    text = re.sub(r'https?://\S+|www\.\S+|\S+@\S+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = [t for t in text.split() if len(t)>1]
    stopwords = DEFAULT_STOPWORDS.copy()
    if extra_stopwords:
        stopwords |= set(extra_stopwords)
    tokens = [t for t in tokens if t not in stopwords]
    tokens = [stemmer.stem(t) for t in tokens]
    return " ".join(tokens)

df['clean'] = df['review_text'].fillna("").apply(preprocess_text)
df = df[df['clean'].str.strip().str.len() > 3].reset_index(drop=True)
print("Setelah pembersihan:", len(df), "dokumen")
df.to_csv("bantul_tweets_clean.csv", index=False, encoding="utf-8-sig")

# ------------------ TF-IDF & Top terms ------------------
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X = tfidf.fit_transform(df['clean'])
terms = tfidf.get_feature_names_out()
means = np.asarray(X.mean(axis=0)).ravel()
top_n = 15
top_idx = means.argsort()[::-1][:top_n]
top_terms = terms[top_idx]
top_scores = means[top_idx]
top_df = pd.DataFrame({"term":top_terms, "score":top_scores})
print("\nTop TF-IDF terms:\n", top_df)

top_df.to_csv("bantul_tfidf_top_terms.csv", index=False, encoding="utf-8-sig")

# ------------------ WordCloud & Grafik ------------------
all_text = " ".join(df['clean'].tolist())
wc = WordCloud(width=1000, height=400, collocations=False, background_color="white").generate(all_text)

plt.figure(figsize=(12,5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud - Opini Wisata Bantul (Twitter)")
plt.show()

plt.figure(figsize=(8,5))
plt.barh(top_df['term'][::-1], top_df['score'][::-1])
plt.xlabel("Mean TF-IDF")
plt.title("Top TF-IDF terms - Wisata Bantul (Twitter, Top {})".format(top_n))
plt.tight_layout()
plt.show()

df.to_csv("bantul_tweets_final.csv", index=False, encoding="utf-8-sig")
print("Hasil akhir disimpan di bantul_tweets_final.csv")


TooManyRequests: 429 Too Many Requests
Too Many Requests

In [2]:
pip install wordcloud

Collecting wordcloud
  Using cached wordcloud-1.9.4-cp311-cp311-win_amd64.whl (299 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install tweepy

Collecting tweepy
  Downloading tweepy-4.16.0-py3-none-any.whl (98 kB)
     ---------------------------------------- 98.8/98.8 kB 1.9 MB/s eta 0:00:00
Collecting oauthlib<4,>=3.2.0
  Downloading oauthlib-3.3.1-py3-none-any.whl (160 kB)
     -------------------------------------- 160.1/160.1 kB 4.8 MB/s eta 0:00:00
Collecting requests-oauthlib<3,>=1.2.0
  Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl (24 kB)
Installing collected packages: oauthlib, requests-oauthlib, tweepy
Successfully installed oauthlib-3.3.1 requests-oauthlib-2.0.0 tweepy-4.16.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
