# Import Library

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
seed = 0
np.random.seed(seed)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

import datetime as dt
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

!pip install sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

import nltk
nltk.download('punkt')
nltk.download('stopwords')

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Load Dataset

In [3]:
ripiu_clean_df = pd.read_csv('webtoon_ripiu.csv')

print(f"Jumlah data: {len(ripiu_clean_df)}")
ripiu_clean_df.head()

Jumlah data: 15000


Unnamed: 0,userName,content,score,at
0,Pengguna Google,saya menggunakan aplikasi ini sudah lama. Vitu...,5,2025-03-29 11:32:43
1,Pengguna Google,rate 2 karena makin banyak cerita smut yang ma...,2,2025-04-10 07:42:24
2,Pengguna Google,Tampilan berandanya(home) aneh.. kayak ga bera...,4,2025-03-06 22:01:26
3,Pengguna Google,"Untuk aplikasinya memang bagus, dan banyak fit...",5,2025-03-05 08:00:52
4,Pengguna Google,padahal tampilan awal 'home' udah bagus dan te...,3,2025-03-01 00:39:18


# Pre Processing

jadi fungsi hps_teks() dan brshn_teks() gunanya untuk membersihkan dan menormalkan teks ulasan para readers. seperti menghapus mention, hastag, link, angka, tanda baca, dan simbol lainnya. lalu brshn_teks itu gunanya untuk mengubah teks jadi kecil, dipisah kata katanya, dibuang kata kata umumnya, terus di ubah ke bentuk dasar.

In [4]:
def hps_teks(teks):
    teks = re.sub(r'@[A-Za-z0-9]+', '', teks)
    teks = re.sub(r'#[A-Za-z0-9]+', '', teks)
    teks = re.sub(r'RT[\s]', '', teks)
    teks = re.sub(r"http\S+", '', teks)
    teks = re.sub(r'[0-9]+', '', teks)
    teks = re.sub(r'[^\w\s]', '', teks)
    teks = teks.replace('\n', ' ')
    teks = teks.translate(str.maketrans('', '', string.punctuation))
    return teks.strip()

def brshn_teks(teks):
    teks = teks.lower()
    teks = hps_teks(teks)
    tokens = word_tokenize(teks)

    stopword_ind = set(stopwords.words('indonesian'))
    stopword_eng = set(stopwords.words('english'))
    stopword_kustom = {'iyaa', 'gak', 'loh', 'ga', 'ya', 'kocak', 'oke'}
    all_stopwords = stopword_ind.union(stopword_eng, stopword_kustom)

    tokens_bersih = [t for t in tokens if t not in all_stopwords]

    stemmer = StemmerFactory().create_stemmer()
    tokens_stemmed = [stemmer.stem(kata) for kata in tokens_bersih]

    return ' '.join(tokens_stemmed)


In [5]:
slangwords = {
    "yg": "yang", "oke": "baik", "hnya": "hanya", "krna": "karena", "tpi": "tapi", "tp": "tapi", "trus": "terus", "trs": "terus", "nggak": "tidak", "ga": "tidak", "gak": "nggak",
    "gk": "nggak", "ngga": "nggak", "tdk": "tidak", "eps": "episode", "yaa": "ya", "yahh": "ya", "yaaa": "ya", "iyaah": "ya", "t_t": "sedih", "mmng": "memang","emg": "memang",
    "sangattt": "sangat", "iniii": "ini", "kadang2": "kadang-kadang","kdng": "kadang", "jangan2": "jangan-jangan", "lahh": "lah", "loo": "kamu", "lu": "kamu", "elo": "kamu",
    "lo": "kamu", "dehh": "deh", "gituuu": "gitu", "bngtt": "banget", "bgt": "banget", "bgtt": "banget", "bgttt": "banget", "bgtbanget": "banget", "bgtu": "banget itu",
    "anjir": "astaga", "anj": "anjing", "anjg": "anjing", "ajg": "anjing", "plis": "tolong", "pliss": "tolong", "plsss": "tolong", "plisss": "tolong", "btw": "ngomong-ngomong",
    "cuy": "teman", "gw": "aku", "gua": "aku", "gue": "aku", "dmn": "di mana", "dmana": "di mana", "dr": "dari", "sm": "sama", "sama2": "sama-sama", "aja": "saja", "ajah": "saja",
    "udh": "sudah", "dah": "sudah", "dlu": "dulu", "pdhl": "padahal", "masi": "masih", "msh": "masih", "knp": "kenapa", "gmn": "gimana", "gimn": "gimana", "dlm": "dalam", "cm": "cuma",
    "cma": "cuma", "bs": "bisa", "tetep": "tetap", "app": "aplikasi","apps": "aplikasi", "lemot": "lama", "smpe": "sampai", "sblm": "sebelum", "ngunduh": "download",
    "ngulang": "mengulang", "ttg": "tentang", "kayak": "seperti", "kyk": "seperti", "kalo": "kalau", "klo": "kalau", "jg": "juga", "si": "sih", "sihh": "sih", "tuh": "itu",
    "dong": "ya", "ntr": "nanti", "skrg": "sekarang", "sekrg": "sekarang", "napa": "kenapa", "ngapa": "kenapa", "ngapain": "apa", "wkwk": "tertawa", "wkwkwk": "tertawa",
    "wkwwk": "tertawa", "wle": "bercanda", "up": "update", "pgn": "ingin", "pengen": "ingin", "ngertii": "mengerti", "ngerti": "mengerti", "moga": "semoga", "brarti": "berarti",
    "bkn": "bukan","brg": "barang"
}


def normalize_slang(text):
    text = str(text).lower()
    for slang, formal in slangwords.items():
        text = re.sub(r'\b' + re.escape(slang) + r'\b', formal, text)
    return text

ripiu_clean_df['normalized_content'] = ripiu_clean_df['content'].apply(normalize_slang)

ripiu_clean_df[['content', 'normalized_content']].head()


Unnamed: 0,content,normalized_content
0,saya menggunakan aplikasi ini sudah lama. Vitu...,saya menggunakan aplikasi ini sudah lama. vitu...
1,rate 2 karena makin banyak cerita smut yang ma...,rate 2 karena makin banyak cerita smut yang ma...
2,Tampilan berandanya(home) aneh.. kayak ga bera...,tampilan berandanya(home) aneh.. seperti tidak...
3,"Untuk aplikasinya memang bagus, dan banyak fit...","untuk aplikasinya memang bagus, dan banyak fit..."
4,padahal tampilan awal 'home' udah bagus dan te...,padahal tampilan awal 'home' udah bagus dan te...


# Labelling

bagian labelling ini aku labelling otomatis, disini aku menggunakan kata kata yang relevan dan banyak di tulis di dalam ulasan oleh para readers. aku baca satu satu komen mereka lalu aku tarik kesimpulan apakah ini masuk ke positif, netral atau negatif. masih ada beberapa yang tidak terdeteksi.

In [6]:
positif_keywords = [
    "bagus", "mantap", "suka", "cepat", "enak", "keren", "hebat",
    "nyaman", "baik", "lancar", "recommended", "top", "senang", "kerenn",
    "cocok", "seru", "gemes", "bagus banget", "menghibur", "love", "asik",
    "menarik", "top banget", "favorit", "worth", "bagusss", "sukaaa",
    "lucu", "ngakak", "greget", "ga nyesel", "baguss", "seruu", "rekomendasi",
    "super", "terbaik", "bagus deh", "kocak", "nangis bahagia", "pinter",
    "bagus parah", "cerita keren", "alur bagus", "visual bagus", "epik",
    "plot twist", "lucu banget", "kereennn", "satisfying", "happy", "amazing",
    "puas", "full bintang", "100/10", "makasih", "terima kasih", "top markotop",
    "kerenn abiezzz", "ceritanya bagus","wajib download", "bagus bgt", "no iklan",
    "mantap jiwa", "kece", "worth it", "sangat bagus", "terharu", "komplit", "solid", "favoritku",
    "paling best!", "bermanfaat", "dihati", "bgsss", "lengkap", "inspirasi", "senang", "tidak ada keluhan",
    "jatuh hati", "memuaskan", "menghibur", "sempurna", "suami ku bertambah", "gantengnya pacar ku", "pacar gepeng ku"

]

negatif_keywords = [
    "jelek", "buruk", "lama", "parah", "gak suka", "tidak suka",
    "mengecewakan", "benci", "lelet", "jelek banget", "nggak banget", "payah",
    "error", "force close", "crash", "ngelag", "lemot", "bosen", "klise",
    "maksa", "berat", "koin mahal", "mahal banget", "kebanyakan iklan",
    "iklan mulu", "bikin kesel", "bikin nangis", "gaje", "ga jelas",
    "nggak nyambung", "ngerasa rugi", "ga sesuai", "kurang greget",
    "tidak logis", "dipaksain", "ending jelek", "ending menggantung",
    "basi", "norak", "repetitif", "kurang bagus", "lambat",
    "beli mahal", "kecewa", "tidak puas", "bayar tapi", "udah bayar",
    "ga worth", "ga puas", "nggak lucu", "nggak seru", "sangat kecewa",
    "alurnya ketebak", "koin terus", "terlalu banyak iklan", "susah login",
    "bikin kesel", "boros kuota", "sering keluar sendiri", "nggak work", "gagal terus",
    "iklan berlebihan", "minus", "vip", "iklan", "kenapa", "koin", "gilaa bgs", "jernih",
    "menyenangkan", "bodoh", "kesal", "gak masuk", "sedih", "ilang", "iklan", "gabisa", "dah lah",
    "uninstall", "unsur vulgar", "kurang lengkap", "gk pernah bisa", "kesalahan", "stuck", "ribet", "ilangin",
    "sering eror", "tidak terlalu lengkap", "ga ada", "kenapa sihh", "hapus", "ngecewain", "gagal", "kumat", "jenuh",
    "keluar sendiri", "sulit", "loading", "males", "aneh", "hadehh", "gak bisa", "keluar", "ngebug", "sampah", "koin mahal",
    "sering keluar sendiri", "iklan berlebihan", "meresahkan", "cape", "iklan alay", "pusing kepala", "bohong", "lelot", "ilang",

]

netral_keywords = [
    "biasa", "oke", "lumayan", "yaudah", "gitu aja", "standar",
    "so so", "cukup", "netral", "menengah", "gak jelek", "gak bagus",
    "seadanya", "gpp", "gimana ya", "not bad", "boleh lah", "slow",
    "cukupan", "ok", "ya", "nggak tau", "skip", "ya sih",
    "tergantung", "di tengah", "antara", "campur", "komentar netral",
    "gitu doang", "flat", "datar", "ga ngaruh", "nggak penting",
    "begitu doang", "cerita biasa", "oke lah", "lumayan bagus", "lumayan jelek",
    "so far so good", "boleh dicoba", "ya lumayan", "seadanya aja", "oke tapi",
    "kurang lebih", "antara suka dan enggak", "50:50", "baru", "nyoba", "gabut", "lengkap",
    "saran", "cuma", "baca", "gaada kritik", "fitur ss", "halu",

]


In [8]:
def keyword_label(text):
    text = text.lower()
    for kata in positif_keywords:
        if kata in text:
            return 'positif'
    for kata in negatif_keywords:
        if kata in text:
            return 'negatif'
    for kata in netral_keywords:
        if kata in text:
            return 'netral'
    return 'tidak diketahui'


In [17]:
ripiu_clean_df['label'] = ripiu_clean_df['normalized_content'].apply(keyword_label)
ripiu_clean_df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
positif,12293
negatif,1966
netral,685
tidak diketahui,56


# Data Split

bagian ini bagian aku mengubah teks menjadi matrix angka dan membagi data test dan training

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = ripiu_clean_df['normalized_content']

vec = TfidfVectorizer(max_features=300, min_df=10, max_df=0.75)
hasil_tfidf = vec.fit_transform(X)

y = ripiu_clean_df['label']

X_train, X_test, y_train, y_test = train_test_split(
    hasil_tfidf, y, test_size=0.2, random_state=1
)

# Build Model

# Naive bayes

In [11]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

nb_model = BernoulliNB()

# trening modelnye
nb_model.fit(X_train.toarray(), y_train)

# prediksi hasil training dan testing
pred_train = nb_model.predict(X_train.toarray())
pred_test = nb_model.predict(X_test.toarray())

# itung akurasinye
train_acc = accuracy_score(y_train, pred_train)
test_acc = accuracy_score(y_test, pred_test)

# print hasil evaluasinya
print("Akurasi training (NB):", train_acc)
print("Akurasi testing  (NB):", test_acc)


Akurasi training (NB): 0.8868333333333334
Akurasi testing  (NB): 0.8833333333333333


# Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier()

# trening model
rf_model.fit(X_train.toarray(), y_train)

# prediksi hasil training dan testing
rf_train_pred = rf_model.predict(X_train.toarray())
rf_test_pred = rf_model.predict(X_test.toarray())

# itung akurasinye
rf_train_acc = accuracy_score(y_train, rf_train_pred)
rf_test_acc = accuracy_score(y_test, rf_test_pred)

# print hasil evaluasinye
print("Akurasi Training (RF):", rf_train_acc)
print("Akurasi Testing  (RF):", rf_test_acc)


Akurasi Training (RF): 0.9988333333333334
Akurasi Testing  (RF): 0.8706666666666667


# Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg_model = LogisticRegression()

# trening model
logreg_model.fit(X_train.toarray(), y_train)

# prediksi hasil training dan testing
logreg_train_pred = logreg_model.predict(X_train.toarray())
logreg_test_pred = logreg_model.predict(X_test.toarray())

# itung akurasinye
logreg_train_acc = accuracy_score(y_train, logreg_train_pred)
logreg_test_acc = accuracy_score(y_test, logreg_test_pred)

# print hasil akurasinye
print("Akurasi Training (LogReg):", logreg_train_acc)
print("Akurasi Testing  (LogReg):", logreg_test_acc)


Akurasi Training (LogReg): 0.9085
Akurasi Testing  (LogReg): 0.8913333333333333


# Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_model = DecisionTreeClassifier()

# trening model
dt_model.fit(X_train.toarray(), y_train)

# prediksi hasil training dan testing
dt_pred_train = dt_model.predict(X_train.toarray())
dt_pred_test = dt_model.predict(X_test.toarray())

# itung akurasinye
dt_train_score = accuracy_score(y_train, dt_pred_train)
dt_test_score = accuracy_score(y_test, dt_pred_test)

# print hasil evaluasinye
print("Akurasi Train (Decision Tree):", dt_train_score)
print("Akurasi Test  (Decision Tree):", dt_test_score)

Akurasi Train (Decision Tree): 0.9988333333333334
Akurasi Test  (Decision Tree): 0.888


In [15]:
import pandas as pd

results_df = pd.DataFrame({
    'Model': ['Naive Bayes', 'Random Forest', 'Logistic Regression', 'Decision Tree'],
    'Accuracy Train': [train_acc, rf_train_acc, logreg_train_acc, dt_train_score],
    'Accuracy Test': [test_acc, rf_test_acc, logreg_test_acc, dt_test_score]
})

accuracy_test_only = results_df[['Model', 'Accuracy Test']]
print(accuracy_test_only)

                 Model  Accuracy Test
0          Naive Bayes       0.883333
1        Random Forest       0.870667
2  Logistic Regression       0.891333
3        Decision Tree       0.888000


In [16]:
accuracy_test_sorted = accuracy_test_only.sort_values(by='Accuracy Test', ascending=False)

print(accuracy_test_sorted)


                 Model  Accuracy Test
2  Logistic Regression       0.891333
3        Decision Tree       0.888000
0          Naive Bayes       0.883333
1        Random Forest       0.870667


aku melakukan percobaan pada 4 model berbeda. setelah di lihat hasil yang paling tinggi di adalah menggunakan model Decision Tree.