# Analisis Sentimen Ulasan Gojek

## Tahap 1 Data Acquisition

Mengambil ulasan aplikasi Gojek dari Google Play Store menggunakan library `google-play-scraper`. Label sentimen ditentukan berdasarkan rating bintang:
- Bintang 4–5 untuk **Positif**
- Bintang 3 untuk **Netral**
- Bintang 1–2 untuk **Negatif**

In [1]:
import pandas as pd
from google_play_scraper import reviews, Sort
import time

# --- Scraping configuration ---
APP_ID = "com.gojek.app"                 # application id passed to reviews()
TARGET = 2000                            # number of reviews to collect
OUTPUT = "data/ulasan_gojek_mentah.csv"  # raw-review CSV written by a later cell

semua_ulasan = []  # review records accumulated across all requests
token = None       # no continuation token exists yet for the first request

# The whole loop sits inside one try block on purpose: if a request fails
# midway, the reviews collected so far are kept and only the error is printed,
# so the notebook can continue with a partial sample.
try:
    while len(semua_ulasan) < TARGET:
        sisa = TARGET - len(semua_ulasan)
        batch = min(200, sisa)  # request at most 200 reviews at a time

        hasil, token = reviews(
            APP_ID,
            lang="id",
            country="id",
            sort=Sort.NEWEST,
            count=batch,
            continuation_token=token
        )

        # Stop as soon as a request returns no reviews.
        if not hasil:
            break

        # Keep at most `sisa` records so the total can never exceed TARGET,
        # even if a page comes back larger than the requested `count`.
        semua_ulasan.extend(hasil[:sisa])
        print(f"Mengambil ulasan: {len(semua_ulasan)}/{TARGET}")

        if token is None:
            break

        time.sleep(1)  # wait one second before the next request

except Exception as e:
    print(f"Error: {e}")

print(f"\nTotal ulasan berhasil diambil: {len(semua_ulasan)}")

Mengambil ulasan: 200/2000
Mengambil ulasan: 400/2000
Mengambil ulasan: 600/2000
Mengambil ulasan: 800/2000
Mengambil ulasan: 1000/2000
Mengambil ulasan: 1200/2000
Mengambil ulasan: 1400/2000
Mengambil ulasan: 1600/2000
Mengambil ulasan: 1800/2000
Mengambil ulasan: 2000/2000

Total ulasan berhasil diambil: 2000


In [None]:
# Flatten every scraped review into a plain record holding only the fields
# used by this analysis; absent keys fall back to "" (text) or 0 (numbers).
daftar_data = [
    {
        "id_ulasan": item.get("reviewId", ""),
        "nama_pengguna": item.get("userName", ""),
        "isi_ulasan": item.get("content", ""),
        "bintang": item.get("score", 0),
        "tanggal_ulasan": item.get("at", ""),
        "jumlah_like": item.get("thumbsUpCount", 0),
    }
    for item in semua_ulasan
]

df = pd.DataFrame(daftar_data)

def tentukan_sentimen(bintang):
    """Translate a star rating into a sentiment label.

    Args:
        bintang: Numeric star rating of a review.

    Returns:
        "Positif" for ratings of 4 and above, "Netral" for exactly 3, and
        "Negatif" for every other value (including 0 and NaN, because none
        of the comparisons match them).
    """
    if bintang == 3:
        return "Netral"
    return "Positif" if bintang >= 4 else "Negatif"

import os

# Label every review from its star rating.
df["sentimen"] = df["bintang"].apply(tentukan_sentimen)

# Discard reviews without usable text: the first filter removes empty or
# whitespace-only strings, the second removes missing values.
df = df[df["isi_ulasan"].str.strip() != ""]
df = df.dropna(subset=["isi_ulasan"])

# Create the destination folder first: to_csv does not create missing
# directories, so a fresh checkout without "data/" would fail here.
os.makedirs(os.path.dirname(OUTPUT) or ".", exist_ok=True)
df.to_csv(OUTPUT, index=False, encoding="utf-8-sig")
print(f"Data disimpan ke: {OUTPUT}")
print(f"Total: {len(df)} ulasan")
print("\nDistribusi Sentimen:")
print(df["sentimen"].value_counts())

---
## Tahap 2 Text Cleaning & Pre-processing

Pipeline pembersihan teks yang dilakukan:
1. Lowercase
2. Hapus URL, mention, hashtag
3. Hapus emoji dan karakter non-ASCII
4. Hapus angka
5. Hapus tanda baca
6. Normalisasi kata slang/gaul
7. Stopword removal (PySastrawi)
8. Stemming (PySastrawi)

In [None]:
import re
import nltk
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Download the NLTK resources with quiet=True so no progress log is printed.
for sumber_nltk in ("punkt", "stopwords"):
    nltk.download(sumber_nltk, quiet=True)

# Stage 2 reads the raw CSV and writes the cleaned one; OUTPUT is rebound
# here to the cleaned-data path.
INPUT, OUTPUT = "data/ulasan_gojek_mentah.csv", "data/ulasan_gojek_bersih.csv"

df = pd.read_csv(INPUT)
print(f"Data dimuat: {len(df)} baris")

In [None]:
print("Memuat Stemmer & StopWord Bahasa Indonesia...")
# Each Sastrawi helper is obtained from its factory; both objects are built
# once here and reused for every review by bersihkan_teks.
pabrik_stopword = StopWordRemoverFactory()
hapus_sw = pabrik_stopword.create_stop_word_remover()
pabrik_stemmer = StemmerFactory()
stemmer = pabrik_stemmer.create_stemmer()

# Slang/abbreviation -> standard-word lookup used by normalisasi_slang.
# The lookup is single-pass (one dict.get per token), so every value must
# already be the final form: a value that is itself a key would be left
# half-normalized. That is why "bgt" maps straight to "sangat" instead of
# going through "banget".
kamus_slang = {
    "gk": "tidak", "ga": "tidak", "gak": "tidak",
    "ngga": "tidak", "nggak": "tidak", "g": "tidak",
    "yg": "yang", "yng": "yang", "drpd": "daripada",
    "dgn": "dengan", "dg": "dengan", "sy": "saya",
    "sdh": "sudah", "udh": "sudah", "udah": "sudah",
    "blm": "belum", "blom": "belum", "blum": "belum",
    "dpt": "dapat", "gmn": "bagaimana", "gimana": "bagaimana",
    "hrs": "harus", "krn": "karena", "karna": "karena",
    "tp": "tapi", "tpi": "tapi", "ttg": "tentang",
    "dl": "dulu", "dlu": "dulu", "bnyk": "banyak",
    "byk": "banyak", "msh": "masih", "masi": "masih",
    "bgt": "sangat", "banget": "sangat", "aja": "saja",
    "aj": "saja", "jg": "juga", "spt": "seperti",
    "ok": "oke", "lg": "lagi", "klo": "kalau",
    "kalo": "kalau", "bs": "bisa", "bsa": "bisa",
    "dr": "dari", "utk": "untuk", "u": "untuk",
    "tdk": "tidak", "pd": "pada", "km": "kamu",
    "lo": "kamu", "lu": "kamu", "gue": "saya",
    "gw": "saya", "abis": "habis", "lbh": "lebih",
    "lbih": "lebih", "susah": "sulit",
    "aplikasinya": "aplikasi", "appnya": "aplikasi",
}

def normalisasi_slang(teks):
    """Replace slang tokens in *teks* with their entries from kamus_slang.

    The text is split on whitespace; each token found in kamus_slang is
    swapped for its mapped word and all other tokens are kept as they are.
    The tokens are then joined back with single spaces.
    """
    kata_baku = (kamus_slang.get(kata, kata) for kata in teks.split())
    return " ".join(kata_baku)

def bersihkan_teks(teks):
    """Clean one raw review and return its normalized, stemmed text.

    Steps, in order: lowercase; drop URLs, mentions and hashtags; drop
    non-ASCII characters (this also removes emoji); drop digits; replace
    punctuation with spaces; collapse whitespace; normalize slang; remove
    stopwords; stem.

    Args:
        teks: Raw review text. Missing values and blank strings are accepted.

    Returns:
        The cleaned text, or "" when the input is missing or blank.

    NOTE(review): slang negations are mapped to "tidak" before stopword
    removal. If the Sastrawi default stopword list contains "tidak", the
    negation is dropped from the text — confirm this is intended for
    sentiment analysis.
    """
    if pd.isna(teks) or str(teks).strip() == "":
        return ""
    teks = str(teks).lower()
    teks = re.sub(r"http\S+|www\.\S+", "", teks)
    teks = re.sub(r"@\w+|#\w+", "", teks)
    teks = teks.encode("ascii", "ignore").decode("ascii")
    teks = re.sub(r"\d+", "", teks)
    # "_" counts as a word character for \w, so it is listed explicitly;
    # otherwise underscores would survive the punctuation step.
    teks = re.sub(r"[^\w\s]|_", " ", teks)
    teks = re.sub(r"\s+", " ", teks).strip()
    teks = normalisasi_slang(teks)
    teks = hapus_sw.remove(teks)
    teks = stemmer.stem(teks)
    return teks.strip()

print("Fungsi pre-processing siap.")

In [None]:
print("Memproses teks... (mungkin beberapa menit)")
total = len(df)
hasil_bersih = []

# Progress is counted with enumerate rather than with the DataFrame index
# label: the label only equals the row position for a fresh 0..n-1 index, so
# a filtered or re-indexed frame would report wrong (or no) progress.
for urutan, isi in enumerate(df["isi_ulasan"], start=1):
    hasil_bersih.append(bersihkan_teks(isi))
    if urutan % 200 == 0:
        print(f"  {urutan}/{total} selesai...")

df["teks_bersih"] = hasil_bersih
# Reviews whose text became empty after cleaning are dropped before saving.
df = df[df["teks_bersih"].str.strip() != ""]
df.to_csv(OUTPUT, index=False, encoding="utf-8-sig")
print(f"\nSelesai. {len(df)} baris disimpan ke {OUTPUT}")

---
## Tahap 3 Feature Engineering, Modeling & Evaluation

- **Feature Engineering:** TF-IDF Vectorizer (maksimal 5.000 fitur, unigram + bigram, `min_df=2`)
- **Modeling:** Logistic Regression
- **Split data:** 80% latih, 20% uji (stratified)
- **Evaluasi:** Accuracy, Precision, Recall, F1-Score, Confusion Matrix

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import json
import os
import warnings
warnings.filterwarnings("ignore")

# Output folders for the saved model files and the figures.
for folder in ("model", "gambar"):
    os.makedirs(folder, exist_ok=True)

# Load the cleaned reviews and keep only rows that have both a label and
# non-blank cleaned text.
df = pd.read_csv("data/ulasan_gojek_bersih.csv").dropna(subset=["teks_bersih", "sentimen"])
ada_teks = df["teks_bersih"].str.strip() != ""
df = df[ada_teks]

distribusi = df["sentimen"].value_counts()
print(f"Data: {len(df)} baris")
print("\nDistribusi Sentimen:")
print(distribusi)

In [None]:
# Features are the cleaned texts, targets are the sentiment labels.
X, y = df["teks_bersih"], df["sentimen"]

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)

pengaturan_tfidf = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "min_df": 2,
    "sublinear_tf": True,
}
vektorizer = TfidfVectorizer(**pengaturan_tfidf)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vocabulary.
X_train_tfidf = vektorizer.fit_transform(X_train)
X_test_tfidf = vektorizer.transform(X_test)

print(
    f"Data latih : {len(X_train)} sampel",
    f"Data uji   : {len(X_test)} sampel",
    f"Jumlah fitur TF-IDF : {X_train_tfidf.shape[1]}",
    sep="\n",
)

In [None]:
# Logistic regression over the sentiment labels, trained on the TF-IDF
# features. `multi_class` is deliberately not passed: the parameter is
# deprecated in scikit-learn (1.5+), and with the lbfgs solver a problem with
# more than two classes is already fitted as a multinomial model, so the
# result is unchanged while the call stays valid on newer releases.
model = LogisticRegression(
    max_iter=1000,
    C=1.0,
    solver="lbfgs",
    random_state=42
)
model.fit(X_train_tfidf, y_train)
print("Model selesai dilatih.")

### Evaluasi Model

In [None]:
prediksi = model.predict(X_test_tfidf)
akurasi  = accuracy_score(y_test, prediksi)

print(f"Akurasi: {akurasi * 100:.2f}%")
print("\nLaporan Klasifikasi:")
# `labels` is passed together with `target_names` (as confusion_matrix does
# in the next cell) so each row name is tied to its class explicitly instead
# of relying on which labels happen to occur in y_test / prediksi.
print(classification_report(y_test, prediksi,
                            labels=model.classes_, target_names=model.classes_))

# Same report as a dict: saved to CSV here and reused for the metadata file.
laporan = classification_report(y_test, prediksi,
                                labels=model.classes_, target_names=model.classes_,
                                output_dict=True)
pd.DataFrame(laporan).transpose().to_csv("data/laporan_evaluasi.csv", encoding="utf-8-sig")

### Visualisasi Confusion Matrix

In [None]:
# Rows and columns of the matrix follow the model's own class order.
label_kelas = model.classes_
cm = confusion_matrix(y_test, prediksi, labels=label_kelas)

fig, ax = plt.subplots(figsize=(8, 6))
opsi_heatmap = dict(
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_kelas,
    yticklabels=label_kelas,
    linewidths=0.5,
)
sns.heatmap(cm, ax=ax, **opsi_heatmap)
ax.set_title("Confusion Matrix — Analisis Sentimen Ulasan Gojek", fontsize=14, fontweight="bold", pad=15)
ax.set_xlabel("Prediksi", fontsize=12)
ax.set_ylabel("Aktual", fontsize=12)
plt.tight_layout()

# Save before show() so the file is written from the finished figure.
berkas_cm = "gambar/confusion_matrix.png"
plt.savefig(berkas_cm, dpi=150, bbox_inches="tight")
plt.show()
print(f"Confusion matrix disimpan ke: {berkas_cm}")

### Visualisasi Distribusi Sentimen

In [None]:
# Fixed colour per sentiment; any label outside the palette falls back to grey.
palet = {"Positif": "#2ecc71", "Netral": "#f39c12", "Negatif": "#e74c3c"}
warna_bar = [palet.get(nama, "#95a5a6") for nama in distribusi.index]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle("Distribusi Sentimen Ulasan Gojek", fontsize=14, fontweight="bold")

# Left panel: review count per sentiment, with the count written above each bar.
batang = ax1.bar(distribusi.index, distribusi.values, color=warna_bar, edgecolor="white", linewidth=1.5)
ax1.set_title("Jumlah Ulasan per Sentimen")
ax1.set_xlabel("Sentimen")
ax1.set_ylabel("Jumlah Ulasan")
for kotak in batang:
    tinggi = kotak.get_height()
    tengah_x = kotak.get_x() + kotak.get_width() / 2.
    ax1.annotate(f"{tinggi:,}", (tengah_x, tinggi),
                 ha="center", va="bottom", fontweight="bold")

# Right panel: the same counts shown as percentages.
ax2.pie(
    distribusi.values,
    labels=distribusi.index,
    colors=warna_bar,
    autopct="%1.1f%%",
    startangle=90,
    wedgeprops=dict(edgecolor="white", linewidth=2),
)
ax2.set_title("Persentase Sentimen")

plt.tight_layout()
berkas_distribusi = "gambar/distribusi_sentimen.png"
plt.savefig(berkas_distribusi, dpi=150, bbox_inches="tight")
plt.show()
print(f"Distribusi sentimen disimpan ke: {berkas_distribusi}")

### Visualisasi Word Cloud per Sentimen

In [None]:
fig, sumbu = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("Word Cloud Ulasan Gojek per Sentimen", fontsize=14, fontweight="bold")

# One panel per sentiment, left to right, each with its own colour map.
wc_config = {
    nama: {"colormap": peta_warna, "ax": panel}
    for (nama, peta_warna), panel in zip(
        (("Positif", "Greens"), ("Netral", "Oranges"), ("Negatif", "Reds")),
        sumbu,
    )
}

for label, cfg in wc_config.items():
    panel = cfg["ax"]
    # All cleaned reviews of this sentiment joined into one text.
    korpus = " ".join(df.loc[df["sentimen"] == label, "teks_bersih"].tolist())
    # A word cloud is drawn only when there is text; the panel title is set
    # and the axes are hidden either way.
    if korpus.strip():
        wc = WordCloud(
            width=500,
            height=300,
            background_color="white",
            colormap=cfg["colormap"],
            max_words=80,
            collocations=False,
        ).generate(korpus)
        panel.imshow(wc, interpolation="bilinear")
    panel.set_title(f"Sentimen: {label}", fontsize=12, fontweight="bold")
    panel.axis("off")

plt.tight_layout()
berkas_wc = "gambar/wordcloud_sentimen.png"
plt.savefig(berkas_wc, dpi=150, bbox_inches="tight")
plt.show()
print(f"Word cloud disimpan ke: {berkas_wc}")

### Simpan Model & Metadata

In [None]:
# Persist the trained classifier and the fitted vectorizer.
for objek, tujuan in (
    (model, "model/model_sentimen.pkl"),
    (vektorizer, "model/vektorizer_tfidf.pkl"),
):
    joblib.dump(objek, tujuan)

# Summary of the run: accuracy, data sizes, classes and label distribution.
metadata = {
    "akurasi": akurasi,
    "jumlah_data": len(df),
    "jumlah_latih": len(X_train),
    "jumlah_uji": len(X_test),
    "jumlah_fitur": X_train_tfidf.shape[1],
    "kelas": list(model.classes_),
    "distribusi": distribusi.to_dict(),
}

# Per-class precision / recall / F1 taken from the classification report;
# a class missing from the report contributes zeros.
for nama_kelas in ("Positif", "Negatif", "Netral"):
    skor = laporan.get(nama_kelas, {})
    akhiran = nama_kelas.lower()
    metadata[f"presisi_{akhiran}"] = skor.get("precision", 0)
    metadata[f"recall_{akhiran}"] = skor.get("recall", 0)
    metadata[f"f1_{akhiran}"] = skor.get("f1-score", 0)

with open("model/metadata_model.json", "w", encoding="utf-8") as berkas:
    json.dump(metadata, berkas, ensure_ascii=False, indent=2)

print("Model  → model/model_sentimen.pkl")
print("Vektor → model/vektorizer_tfidf.pkl")
print("Meta   → model/metadata_model.json")
print(f"\nAkurasi akhir: {akurasi * 100:.2f}%")