**Import Library**

In [3]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rizki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rizki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocessing

In [4]:
reviews_df = pd.read_csv('ulasan_flip.csv')
reviews_df.shape
reviews_df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,01b6a990-f9cb-4288-9cd5-8f8b6ea6982b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Sangat lama banget ya dalam pengecekan nya, ti...",5,148,3.28.1,2025-02-19 12:10:40,,,3.28.1
1,4bf2ac74-de45-4334-bd7f-e0ffc8f9f248,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Kok sekarang lewat BCA sih,, otomatis semua tr...",3,3,3.29.0,2025-02-28 23:01:54,"Halo, Kak Rudi Harianto\n\nMohon maaf atas ket...",2025-03-03 08:22:43,3.29.0
2,371e0ba3-5c65-42c7-ab3c-147e0fe1f012,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Sangat kecewa. Dari jam 11 kemaren siang trans...,1,0,3.29.0,2025-03-04 12:26:00,"Halo, Kak Cikal Damar\n\nMohon maaf atas ketid...",2025-03-06 08:25:09,3.29.0
3,d75399ca-d3ca-4943-b5ea-6f4a284296e9,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Tidak menyarankan membayar kartu kredit memaka...,5,1,3.29.0,2025-03-04 00:09:27,,,3.29.0
4,75ec2f91-ec42-415c-a867-de1e16a84f39,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Kenapa top up google play ribet sekali sekaran...,4,1,3.29.0,2025-03-02 14:02:46,,,3.29.0


In [5]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121500 entries, 0 to 121499
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              121500 non-null  object
 1   userName              121500 non-null  object
 2   userImage             121500 non-null  object
 3   content               121500 non-null  object
 4   score                 121500 non-null  int64 
 5   thumbsUpCount         121500 non-null  int64 
 6   reviewCreatedVersion  109042 non-null  object
 7   at                    121500 non-null  object
 8   replyContent          18684 non-null   object
 9   repliedAt             18684 non-null   object
 10  appVersion            109042 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.2+ MB


In [6]:
reviews_df.drop(['reviewId','appVersion','repliedAt','replyContent','userImage','reviewCreatedVersion','userName','thumbsUpCount','at'], axis=1, inplace=True)

In [7]:
clean_df = reviews_df.dropna()

In [8]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121500 entries, 0 to 121499
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   content  121500 non-null  object
 1   score    121500 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [9]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r'RT[\s]', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip(' ')
    return text

def casefoldingText(text):
    text = text.lower()
    return text

def tokenizingText(text):
    text = word_tokenize(text)
    return text

def filteringText(text):
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_text = ' '.join(stemmed_words)

    return stemmed_text

def toSentence(list_words):
    sentence = ' '.join(word for word in list_words)
    return sentence

In [10]:
slangwords = {"@": "di", "abis": "habis", "wtb": "beli", "masi": "masih", "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal", "plisss": "tolong", "bgttt": "banget", "indo": "indonesia", "bgtt": "banget", "ad": "ada", "plis": "tolong", "pls": "tolong", "cr": "sumber", "cod": "bayar ditempat", "adlh": "adalah", "afaik": "as far as i know", "ahaha": "haha", "aj": "saja", "ajep-ajep": "dunia gemerlap", "ak": "saya", "akika": "aku", "akkoh": "aku", "akuwh": "aku", "alay": "norak", "alow": "halo", "ambilin": "ambilkan", "ancur": "hancur", "anjrit": "anjing", "anter": "antar", "ap2": "apa-apa", "apasih": "apa sih", "apes": "sial", "aps": "apa", "aq": "saya", "aquwh": "aku", "asbun": "asal bunyi", "aseekk": "asyik", "asekk": "asyik", "asem": "asam", "aspal": "asli tetapi palsu", "astul": "asal tulis", "ato": "atau", "au ah": "tidak mau tahu", "awak": "saya", "ay": "sayang", "ayank": "sayang", "b4": "sebelum", "bakalan": "akan", "bandes": "bantuan desa", "bangedh": "banget", "banpol": "bantuan polisi", "banpur": "bantuan tempur", "basbang": "basi", "bcanda": "bercanda", "bdg": "bandung", "begajulan": "nakal", "beliin": "belikan", "bencong": "banci", "bentar": "sebentar", "ber3": "bertiga", "beresin": "membereskan", "bete": "bosan", "beud": "banget", "bg": "abang", "bgmn": "bagaimana", "bgt": "banget", "bijimane": "bagaimana", "bintal": "bimbingan mental", "bkl": "akan", "bknnya": "bukannya", "blegug": "bodoh", "blh": "boleh", "bln": "bulan", "blum": "belum", "bnci": "benci", "bnran": "yang benar", "bodor": "lucu", "bokap": "ayah", "boker": "buang air besar", "bokis": "bohong", "boljug": "boleh juga", "bonek": "bocah nekat", "boyeh": "boleh", "br": "baru", "brg": "bareng", "bro": "saudara laki-laki", "bru": "baru", "bs": "bisa", "bsen": "bosan", "bt": "buat", "btw": "ngomong-ngomong", "buaya": "tidak setia", "bubbu": "tidur", "bubu": "tidur", "bumil": "ibu hamil", "bw": "bawa", "bwt": "buat", "byk": "banyak", "byrin": "bayarkan", "cabal": "sabar", "cadas": "keren", "calo": "makelar", "can": "belum", "capcus": "pergi", "caper": "cari perhatian", "ce": "cewek", "cekal": "cegah tangkal", "cemen": "penakut", "cengengesan": "tertawa", "cepet": "cepat", "cew": "cewek", "chuyunk": "sayang", "cimeng": "ganja", "cipika cipiki": "cium pipi kanan cium pipi kiri", "ciyh": "sih", "ckepp": "cakep", "ckp": "cakep", "cmiiw": "correct me if i'm wrong", "cmpur": "campur", "cong": "banci", "conlok": "cinta lokasi", "cowwyy": "maaf", "cp": "siapa", "cpe": "capek", "cppe": "capek", "cucok": "cocok", "cuex": "cuek", "cumi": "Cuma miscall", "cups": "culun", "curanmor": "pencurian kendaraan bermotor", "curcol": "curahan hati colongan", "cwek": "cewek", "cyin": "cinta", "d": "di", "dah": "deh", "dapet": "dapat", "de": "adik", "dek": "adik", "demen": "suka", "deyh": "deh", "dgn": "dengan", "diancurin": "dihancurkan", "dimaafin": "dimaafkan", "dimintak": "diminta", "disono": "di sana", "dket": "dekat", "dkk": "dan kawan-kawan", "dll": "dan lain-lain", "dlu": "dulu", "dngn": "dengan", "dodol": "bodoh", "doku": "uang", "dongs": "dong", "dpt": "dapat", "dri": "dari", "drmn": "darimana", "drtd": "dari tadi", "dst": "dan seterusnya", "dtg": "datang", "duh": "aduh", "duren": "durian", "ed": "edisi", "egp": "emang gue pikirin", "eke": "aku", "elu": "kamu", "emangnya": "memangnya", "emng": "memang", "endak": "tidak", "enggak": "tidak", "envy": "iri", "ex": "mantan", "fax": "facsimile", "fifo": "first in first out", "folbek": "follow back", "fyi": "sebagai informasi", "gaada": "tidak ada uang", "gag": "tidak", "gaje": "tidak jelas", "gak papa": "tidak apa-apa", "gan": "juragan", "gaptek": "gagap teknologi", "gatek": "gagap teknologi", "gawe": "kerja", "gbs": "tidak bisa", "gebetan": "orang yang disuka", "geje": "tidak jelas", "gepeng": "gelandangan dan pengemis", "ghiy": "lagi", "gile": "gila", "gimana": "bagaimana", "gino": "gigi nongol", "githu": "gitu", "gj": "tidak jelas", "gmana": "bagaimana", "gn": "begini", "goblok": "bodoh", "golput": "golongan putih", "gowes": "mengayuh sepeda", "gpny": "tidak punya", "gr": "gede rasa", "gretongan": "gratisan", "gtau": "tidak tahu", "gua": "saya", "guoblok": "goblok", "gw": "saya", "ha": "tertawa", "haha": "tertawa", "hallow": "halo", "hankam": "pertahanan dan keamanan", "hehe": "he", "helo": "halo", "hey": "hai", "hlm": "halaman", "hny": "hanya", "hoax": "isu bohong", "hr": "hari", "hrus": "harus", "hubdar": "perhubungan darat", "huff": "mengeluh", "hum": "rumah", "humz": "rumah", "ilang": "hilang", "ilfil": "tidak suka", "imho": "in my humble opinion", "imoetz": "imut", "item": "hitam", "itungan": "hitungan", "iye": "iya", "ja": "saja", "jadiin": "jadi", "jaim": "jaga image", "jayus": "tidak lucu", "jdi": "jadi", "jem": "jam", "jga": "juga", "jgnkan": "jangankan", "jir": "anjing", "jln": "jalan", "jomblo": "tidak punya pacar", "jubir": "juru bicara", "jutek": "galak", "k": "ke", "kab": "kabupaten", "kabor": "kabur", "kacrut": "kacau", "kadiv": "kepala divisi", "kagak": "tidak", "kalo": "kalau", "kampret": "sialan", "kamtibmas": "keamanan dan ketertiban masyarakat", "kamuwh": "kamu", "kanwil": "kantor wilayah", "karna": "karena", "kasubbag": "kepala subbagian", "katrok": "kampungan", "kayanya": "kayaknya", "kbr": "kabar", "kdu": "harus", "kec": "kecamatan", "kejurnas": "kejuaraan nasional", "kekeuh": "keras kepala", "kel": "kelurahan", "kemaren": "kemarin", "kepengen": "mau", "kepingin": "mau", "kepsek": "kepala sekolah", "kesbang": "kesatuan bangsa", "kesra": "kesejahteraan rakyat", "ketrima": "diterima", "kgiatan": "kegiatan", "kibul": "bohong", "kimpoi": "kawin", "kl": "kalau", "klianz": "kalian", "kloter": "kelompok terbang", "klw": "kalau", "km": "kamu", "kmps": "kampus", "kmrn": "kemarin", "knal": "kenal", "knp": "kenapa", "kodya": "kota madya", "komdis": "komisi disiplin", "komsov": "komunis sovyet", "kongkow": "kumpul bareng teman-teman", "kopdar": "kopi darat", "korup": "korupsi", "kpn": "kapan", "krenz": "keren", "krm": "kirim", "kt": "kita", "ktmu": "ketemu", "ktr": "kantor", "kuper": "kurang pergaulan", "kw": "imitasi", "kyk": "seperti", "la": "lah", "lam": "salam", "lamp": "lampiran", "lanud": "landasan udara", "latgab": "latihan gabungan", "lebay": "berlebihan", "leh": "boleh", "lelet": "lambat", "lemot": "lambat", "lgi": "lagi", "lgsg": "langsung", "liat": "lihat", "litbang": "penelitian dan pengembangan", "lmyn": "lumayan", "lo": "kamu", "loe": "kamu", "lola": "lambat berfikir", "louph": "cinta", "low": "kalau", "lp": "lupa", "luber": "langsung, umum, bebas, dan rahasia", "luchuw": "lucu", "lum": "belum", "luthu": "lucu", "lwn": "lawan", "maacih": "terima kasih", "mabal": "bolos", "macem": "macam", "macih": "masih", "maem": "makan", "magabut": "makan gaji buta", "maho": "homo", "mak jang": "kaget", "maksain": "memaksa", "malem": "malam", "mam": "makan", "maneh": "kamu", "maniez": "manis", "mao": "mau", "masukin": "masukkan", "melu": "ikut", "mepet": "dekat sekali", "mgu": "minggu", "migas": "minyak dan gas bumi", "mikol": "minuman beralkohol", "miras": "minuman keras", "mlah": "malah", "mngkn": "mungkin", "mo": "mau", "mokad": "mati", "moso": "masa", "mpe": "sampai", "msk": "masuk", "mslh": "masalah", "mt": "makan teman", "mubes": "musyawarah besar", "mulu": "melulu", "mumpung": "selagi", "munas": "musyawarah nasional", "muntaber": "muntah dan berak", "musti": "mesti", "muupz": "maaf", "mw": "now watching", "n": "dan", "nanam": "menanam", "nanya": "bertanya", "napa": "kenapa", "napi": "narapidana", "napza": "narkotika, alkohol, psikotropika, dan zat adiktif ", "narkoba": "narkotika, psikotropika, dan obat terlarang", "nasgor": "nasi goreng", "nda": "tidak", "ndiri": "sendiri", "ne": "ini", "nekolin": "neokolonialisme", "nembak": "menyatakan cinta", "ngabuburit": "menunggu berbuka puasa", "ngaku": "mengaku", "ngambil": "mengambil", "nganggur": "tidak punya pekerjaan", "ngapah": "kenapa", "ngaret": "terlambat", "ngasih": "memberikan", "ngebandel": "berbuat bandel", "ngegosip": "bergosip", "ngeklaim": "mengklaim", "ngeksis": "menjadi eksis", "ngeles": "berkilah", "ngelidur": "menggigau", "ngerampok": "merampok", "ngga": "tidak", "ngibul": "berbohong", "ngiler": "mau", "ngiri": "iri", "ngisiin": "mengisikan", "ngmng": "bicara", "ngomong": "bicara", "ngubek2": "mencari-cari", "ngurus": "mengurus", "nie": "ini", "nih": "ini", "niyh": "nih", "nmr": "nomor", "nntn": "nonton", "nobar": "nonton bareng", "np": "now playing", "ntar": "nanti", "ntn": "nonton", "numpuk": "bertumpuk", "nutupin": "menutupi", "nyari": "mencari", "nyekar": "menyekar", "nyicil": "mencicil", "nyoblos": "mencoblos", "nyokap": "ibu", "ogah": "tidak mau", "ol": "online", "ongkir": "ongkos kirim", "oot": "out of topic", "org2": "orang-orang", "ortu": "orang tua", "otda": "otonomi daerah", "otw": "on the way, sedang di jalan", "pacal": "pacar", "pake": "pakai", "pala": "kepala", "pansus": "panitia khusus", "parpol": "partai politik", "pasutri": "pasangan suami istri", "pd": "pada", "pede": "percaya diri", "pelatnas": "pemusatan latihan nasional", "pemda": "pemerintah daerah", "pemkot": "pemerintah kota", "pemred": "pemimpin redaksi", "penjas": "pendidikan jasmani", "perda": "peraturan daerah", "perhatiin": "perhatikan", "pesenan": "pesanan", "pgang": "pegang", "pi": "tapi", "pilkada": "pemilihan kepala daerah", "pisan": "sangat", "pk": "penjahat kelamin", "plg": "paling", "pmrnth": "pemerintah", "polantas": "polisi lalu lintas", "ponpes": "pondok pesantren", "pp": "pulang pergi", "prg": "pergi", "prnh": "pernah", "psen": "pesan", "pst": "pasti", "pswt": "pesawat", "pw": "posisi nyaman", "qmu": "kamu", "rakor": "rapat koordinasi", "ranmor": "kendaraan bermotor", "re": "reply", "ref": "referensi", "rehab": "rehabilitasi", "rempong": "sulit", "repp": "balas",  "rhs": "rahasia", "rmh": "rumah", "ru": "baru", "ruz": "terus", "saia": "saya", "salting": "salah tingkah", "sampe": "sampai", "samsek": "sama sekali", "sapose": "siapa", "satpam": "satuan pengamanan", "sbb": "sebagai berikut", "sbh": "sebuah", "sbnrny": "sebenarnya", "scr": "secara", "sdgkn": "sedangkan", "sdkt": "sedikit", "se7": "setuju", "sebelas dua belas": "mirip", "sj": "saja", "skalian": "sekalian", "sklh": "sekolah", "skt": "sakit", "slesai": "selesai", "sll": "selalu", "slma": "selama", "slsai": "selesai", "smpt": "sempat", "smw": "semua", "sndiri": "sendiri", "songong": "sombong", "sory": "maaf", "sotoy": "sok tahu", "spa": "siapa", "sppa": "siapa", "spt": "seperti", "stiap": "setiap", "stlh": "setelah", "suk": "masuk", "sumpek": "sempit", "syg": "sayang", "t4": "tempat", "tajir": "kaya", "tau": "tahu", "taw": "tahu", "td": "tadi", "tdk": "tidak", "teh": "kakak perempuan", "telat": "terlambat", "telmi": "telat berpikir", "temen": "teman", "tggu": "tunggu", "tgu": "tunggu", "thankz": "terima kasih", "thn": "tahun", "tks": "terima kasih", "tlp": "telepon", "tls": "tulis", "tmbah": "tambah", "tmen2": "teman-teman", "tmpah": "tumpah", "tmpt": "tempat", "tngu": "tunggu", "tnyta": "ternyata", "tokai": "tai", "toserba": "toko serba ada", "tpi": "tapi", "trdhulu": "terdahulu", "trima": "terima kasih", "trm": "terima", "trs": "terus", "trutama": "terutama", "ts": "penulis", "tst": "tahu sama tahu", "ttg": "tentang", "tuch": "tuh", "tuir": "tua", "tw": "tahu", "u": "kamu", "ud": "sudah", "udah": "sudah", "ujg": "ujung", "ul": "ulangan", "unyu": "lucu", "uplot": "unggah", "urang": "saya", "usah": "perlu", "utk": "untuk",  "wat": "buat", "wkt": "waktu", "wtf": "what the fuck", "xixixi": "tertawa", "ya": "iya", "yap": "iya", "yaudah": "ya sudah", "yawdah": "ya sudah", "yg": "yang", "yl": "yang lain", "yo": "iya", "yowes": "ya sudah", "yup": "iya", "7an": "tujuan", "ababil": "abg labil", "acc": "accord", "adlah": "adalah", "adoh": "aduh", "aha": "tertawa", "aing": "saya", "aja": "saja", "ajj": "saja", "aka": "dikenal juga sebagai", "akko": "aku", "akku": "aku", "akyu": "aku", "aljasa": "asal jadi saja", "ama": "sama", "ambl": "ambil", "anjir": "anjing", "ank": "anak", "ap": "apa", "apaan": "apa", "ape": "apa", "aplot": "unggah", "apva": "apa", "aqu": "aku", "asap": "sesegera mungkin", "aseek": "asyik", "asek": "asyik", "aseknya": "asyiknya", "asoy": "asyik", "ath": "kalau begitu", "atuh": "kalau begitu", "ava": "avatar", "aws": "awas", "ayang": "sayang", "ayok": "ayo", "bacot": "banyak bicara", "bales": "balas", "bangdes": "pembangunan desa", "bangkotan": "tua", "banpres": "bantuan presiden", "bansarkas": "bantuan sarana kesehatan", "bazis": "badan amal, zakat, infak, dan sedekah", "bcoz": "karena", "beb": "sayang", "bejibun": "banyak", "belom": "belum", "bener": "benar", "ber2": "berdua", "berdikari": "berdiri di atas kaki sendiri", "bet": "banget", "beti": "beda tipis", "beut": "banget", "bgd": "banget", "bgs": "bagus", "bhubu": "tidur", "bimbuluh": "bimbingan dan penyuluhan", "bisi": "kalau-kalau", "bkn": "bukan", "bl": "beli", "blg": "bilang", "blm": "belum", "bls": "balas", "bnchi": "benci", "bngung": "bingung", "bnyk": "banyak", "bohay": "badan aduhai", "bole": "boleh", "bolot": "bodoh", "bonyok": "ayah ibu", "bpk": "bapak", "brb": "segera kembali", "brngkt": "berangkat", "brp": "berapa", "brur": "saudara laki-laki", "bsa": "bisa", "bsk": "besok", "bu_bu": "tidur", "bubarin": "bubarkan", "buber": "buka bersama", "bujubune": "luar biasa", "buser": "buru sergap", "bwhn": "bawahan", "byar": "bayar", "byr": "bayar", "c8": "chat", "cabut": "pergi", "caem": "cakep", "cama-cama": "sama-sama", "cangcut": "celana dalam", "cape": "capek", "caur": "jelek", "cekak": "tidak ada uang", "cekidot": "coba lihat", "cemplungin": "cemplungkan", "ceper": "pendek", "ceu": "kakak perempuan", "cewe": "cewek", "cibuk": "sibuk", "cin": "cinta", "ciye": "cie", "ckck": "ck", "clbk": "cinta lama bersemi kembali", "cmpr": "campur", "cnenk": "senang", "congor": "mulut", "cow": "cowok", "coz": "karena", "cpa": "siapa", "gokil": "gila", "gombal": "suka merayu", "gpl": "tidak pakai lama", "gpp": "tidak apa-apa", "gretong": "gratis", "gt": "begitu", "gtw": "tidak tahu", "gue": "saya", "guys": "teman-teman", "gws": "cepat sembuh", "haghaghag": "tertawa", "hakhak": "tertawa", "handak": "bahan peledak", "hansip": "pertahanan sipil", "hellow": "halo", "helow": "halo", "hi": "hai", "hlng": "hilang", "hnya": "hanya", "houm": "rumah", "hrs": "harus", "hubad": "hubungan angkatan darat", "hubla": "perhubungan laut", "huft": "mengeluh", "humas": "hubungan masyarakat", "idk": "saya tidak tahu", "ilfeel": "tidak suka", "imba": "jago sekali", "imoet": "imut", "info": "informasi", "itung": "hitung", "isengin": "bercanda", "iyala": "iya lah", "iyo": "iya", "jablay": "jarang dibelai", "jadul": "jaman dulu", "jancuk": "anjing", "jd": "jadi", "jdikan": "jadikan", "jg": "juga", "jgn": "jangan", "jijay": "jijik", "jkt": "jakarta", "jnj": "janji", "jth": "jatuh", "jurdil": "jujur adil", "jwb": "jawab", "ka": "kakak", "kabag": "kepala bagian", "kacian": "kasihan", "kaga": "tidak", "kaka": "kakak", "kamuh": "kamu", "kamyu": "kamu", "kau": "kamu", "kbar": "kabar", "kcian": "kasihan", "keburu": "terlanjur", "kedubes": "kedutaan besar", "kek": "seperti", "keknya": "kayaknya", "keliatan": "kelihatan", "keneh": "masih", "kepikiran": "terpikirkan", "kepo": "mau tahu urusan orang", "kere": "tidak punya uang", "kesian": "kasihan", "ketauan": "ketahuan", "keukeuh": "keras kepala", "khan": "kan", "kk": "kakak", "klian": "kalian", "klo": "kalau", "kluarga": "keluarga", "klwrga": "keluarga", "kmari": "kemari", "kmpus": "kampus", "kn": "kan", "knl": "kenal", "knpa": "kenapa", "kog": "kok", "kompi": "komputer", "koq": "kok", "kpd": "kepada", "kptsan": "keputusan", "krik": "garing", "krn": "karena", "ktauan": "ketahuan", "ktny": "katanya", "kudu": "harus", "kuq": "kok", "ky": "seperti", "kykny": "kayanya", "laka": "kecelakaan", "lambreta": "lambat", "lansia": "lanjut usia", "lbur": "libur", "lekong": "laki-laki", "lg": "lagi", "lgkp": "lengkap", "lht": "lihat", "lmyan": "lumayan", "lngkp": "lengkap", "loch": "loh", "lol": "tertawa", "lom": "belum", "loupz": "cinta", "lowh": "kamu", "lu": "kamu", "luchu": "lucu", "luff": "cinta", "luph": "cinta", "lw": "kamu", "lwt": "lewat", "maaciw": "terima kasih", "mabes": "markas besar", "macem-macem": "macam-macam", "madesu": "masa depan suram", "maen": "main", "mahatma": "maju sehat bersama", "mak": "ibu", "makasih": "terima kasih", "malah": "bahkan", "malu2in": "memalukan", "mamz": "makan", "manies": "manis", "mantep": "mantap", "markus": "makelar kasus", "mba": "mbak", "mending": "lebih baik", "mgkn": "mungkin", "mhn": "mohon", "mksd": "maksud", "mls": "malas", "mnt": "minta", "mokat": "mati", "mosok": "masa", "msh": "masih", "mskpn": "meskipun", "msng2": "masing-masing", "muahal": "mahal", "muker": "musyawarah kerja", "mumet": "pusing", "muna": "munafik", "muup": "maaf", "muuv": "maaf", "nal": "kenal", "nangis": "menangis", "naon": "apa", "naq": "anak", "narsis": "bangga pada diri sendiri", "nax": "anak", "ndak": "tidak", "nelfon": "menelepon", "ngabis2in": "menghabiskan", "ngakak": "tertawa", "ngambek": "marah", "ngampus": "pergi ke kampus", "ngantri": "mengantri", "ngapain": "sedang apa", "ngaruh": "berpengaruh", "ngawur": "berbicara sembarangan", "ngeh": "sadar", "ngekos": "tinggal di kos", "ngelamar": "melamar", "ngeliat": "melihat", "ngemeng": "bicara terus-terusan", "ngerti": "mengerti", "nggak": "tidak", "ngikut": "ikut", "nginep": "menginap", "ngisi": "mengisi", "ngmg": "bicara", "ngocol": "lucu", "ngomongin": "membicarakan", "ngumpul": "berkumpul", "ni": "ini", "nyasar": "tersesat", "nyariin": "mencari", "nyiapin": "mempersiapkan", "nyiram": "menyiram", "nyok": "ayo", "o/": "oleh", "ok": "ok", "priksa": "periksa", "pro": "profesional", "psn": "pesan", "psti": "pasti", "puanas": "panas", "qmo": "kamu", "qt": "kita", "rame": "ramai", "raskin": "rakyat miskin", "reg": "register", "rejeki": "rezeki", "sni": "sini", "somse": "sombong sekali", "sorry": "maaf", "sowry": "maaf", "spd": "sepeda", "sprti": "seperti", "spy": "supaya", "stelah": "setelah", "sy": "saya", "syp": "siapa", "tar": "nanti", "taun": "tahun", "tawh": "tahu", "tdi": "tadi", "te2p": "tetap", "tekor": "rugi", "telkom": "telekomunikasi", "telp": "telepon", "temen2": "teman-teman", "tengok": "menjenguk", "terbitin": "terbitkan", "tgl": "tanggal", "thanks": "terima kasih", "thd": "terhadap", "thx": "terima kasih", "tkg": "tukang", "tll": "terlalu", "tlpn": "telepon", "tman": "teman", "tmbh": "tambah", "tmn2": "teman-teman", "tmph": "tumpah", "tnda": "tanda", "tnh": "tanah", "tp": "tapi", "tq": "terima kasih", "trgntg": "tergantung", "trims": "terima kasih", "cb": "coba", "y": "ya", "munfik": "munafik", "sma": "sama", "tren": "trend", "ngehe": "kesal", "mz": "mas", "analisise": "analisis", "sadaar": "sadar", "sept": "september", "nmenarik": "menarik", "zonk": "bodoh", "rights": "benar", "simiskin": "miskin", "ngumpet": "sembunyi", "hardcore": "keras", "akhirx": "akhirnya", "solve": "solusi", "watuk": "batuk", "ngebully": "intimidasi", "masy": "masyarakat", "still": "masih", "tauk": "tahu", "mbual": "bual", "tioghoa": "tionghoa", "ngentotin": "senggama", "kentot": "senggama", "faktakta": "fakta", "sohib": "teman", "rubahnn": "rubah", "trlalu": "terlalu", "nyela": "cela", "heters": "pembenci", "nyembah": "sembah", "most": "paling", "ikon": "lambang", "light": "terang", "pndukung": "pendukung", "setting": "atur", "seting": "akting", "next": "lanjut", "waspadalah": "waspada","nyerang": "serang", "nipu": "tipu", "ktipu": "tipu", "jentelmen": "berani", "buangbuang": "buang", "tsangka": "tersangka", "kurng": "kurang", "ista": "nista", "less": "kurang", "koar": "teriak", "paranoid": "takut", "problem": "masalah", "tahi": "kotoran", "happy": "bahagia", "tak": "tidak", "penertiban": "tertib", "uasai": "kuasa", "mnolak": "tolak", "trending": "trend", "taik": "tahi", "wkwkkw": "tertawa", "istaa": "nista", "benarjujur": "jujur", "mgkin": "mungkin"}
def fix_slangwords(text):
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text

In [11]:
clean_df.head()

Unnamed: 0,content,score
0,"Sangat lama banget ya dalam pengecekan nya, ti...",5
1,"Kok sekarang lewat BCA sih,, otomatis semua tr...",3
2,Sangat kecewa. Dari jam 11 kemaren siang trans...,1
3,Tidak menyarankan membayar kartu kredit memaka...,5
4,Kenapa top up google play ribet sekali sekaran...,4


In [12]:
clean_df['text_clean'] = clean_df['content'].apply(cleaningText)

clean_df['text_casefoldingText'] = clean_df['text_clean'].apply(casefoldingText)

clean_df['text_slangwords'] = clean_df['text_casefoldingText'].apply(fix_slangwords)

clean_df['text_tokenizingText'] = clean_df['text_slangwords'].apply(tokenizingText)

clean_df['text_stopword'] = clean_df['text_tokenizingText'].apply(filteringText)

clean_df['text_akhir'] = clean_df['text_stopword'].apply(toSentence)

In [13]:
clean_df.head()

Unnamed: 0,content,score,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir
0,"Sangat lama banget ya dalam pengecekan nya, ti...",5,Sangat lama banget ya dalam pengecekan nya tid...,sangat lama banget ya dalam pengecekan nya tid...,sangat lama banget iya dalam pengecekan nya ti...,"[sangat, lama, banget, iya, dalam, pengecekan,...","[banget, pengecekan, cepat, mohon, system, pen...",banget pengecekan cepat mohon system pengeceka...
1,"Kok sekarang lewat BCA sih,, otomatis semua tr...",3,Kok sekarang lewat BCA sih otomatis semua tran...,kok sekarang lewat bca sih otomatis semua tran...,kok sekarang lewat bca sih otomatis semua tran...,"[kok, sekarang, lewat, bca, sih, otomatis, sem...","[bca, otomatis, transaksinya, transfer, bank, ...",bca otomatis transaksinya transfer bank biaya ...
2,Sangat kecewa. Dari jam 11 kemaren siang trans...,1,Sangat kecewa Dari jam kemaren siang transaksi...,sangat kecewa dari jam kemaren siang transaksi...,sangat kecewa dari jam kemarin siang transaksi...,"[sangat, kecewa, dari, jam, kemarin, siang, tr...","[kecewa, jam, kemarin, siang, transaksi, pemba...",kecewa jam kemarin siang transaksi pembayaran ...
3,Tidak menyarankan membayar kartu kredit memaka...,5,Tidak menyarankan membayar kartu kredit memaka...,tidak menyarankan membayar kartu kredit memaka...,tidak menyarankan membayar kartu kredit memaka...,"[tidak, menyarankan, membayar, kartu, kredit, ...","[menyarankan, membayar, kartu, kredit, memakai...",menyarankan membayar kartu kredit memakai flip...
4,Kenapa top up google play ribet sekali sekaran...,4,Kenapa top up google play ribet sekali sekaran...,kenapa top up google play ribet sekali sekaran...,kenapa top up google play ribet sekali sekaran...,"[kenapa, top, up, google, play, ribet, sekali,...","[top, google, play, ribet, gogoplay, gk, pemba...",top google play ribet gogoplay gk pembayaranny...


## Pelabelan

In [14]:
import csv
import requests
from io import StringIO

In [15]:


lexicon_positive = dict()
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')

if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_positive[row[0]] = int(row[1])
else:
    print("Failed to fetch positive lexicon data")

lexicon_negative = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')

if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_negative[row[0]] = int(row[1])
else:
    print("Failed to fetch negative lexicon data")


In [16]:

def sentiment_analysis_lexicon_indonesia(text):
    score = 0

    for word in text:
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]

    for word in text:
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]

    polarity=''


    if (score > 0):
        polarity = 'positive'
    elif (score < 0):
        polarity = 'negative'
    else:
        polarity = 'neutral'

    return score, polarity


In [17]:
results = clean_df['text_stopword'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
clean_df['polarity_score'] = results[0]
clean_df['polarity'] = results[1]
print(clean_df['polarity'].value_counts())

polarity
positive    62999
neutral     32007
negative    26494
Name: count, dtype: int64


In [18]:
from imblearn.under_sampling import RandomUnderSampler
X = clean_df['text_akhir']
y = clean_df['polarity']

ros = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(pd.DataFrame(X), y)

resampled_df = pd.DataFrame({'text_akhir': X_resampled['text_akhir'], 'polarity': y_resampled})
resampled_df = resampled_df.dropna(subset=['text_akhir'])
print(resampled_df['polarity'].value_counts())

polarity
negative    26494
neutral     26494
positive    26494
Name: count, dtype: int64


In [19]:
# Encode label menjadi angka
label_encoder = LabelEncoder()
resampled_df['label'] = label_encoder.fit_transform(resampled_df['polarity'])

In [20]:
y = resampled_df['label']

## Feature Extraction

**TF-IDF**

In [21]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(X_resampled['text_akhir'])

In [22]:
train_data_tfidf, temp_data_tfidf, train_labels_tfidf, temp_labels_tfidf = train_test_split(X_tfidf, y, test_size=0.5, random_state=42)
val_data_tfidf, test_data_tfidf, val_labels_tfidf, test_labels_tfidf = train_test_split(temp_data_tfidf, temp_labels_tfidf, test_size=0.4, random_state=42)

**Bag of Words**

In [23]:
from sklearn.feature_extraction.text import CountVectorizer


# Ekstraksi fitur menggunakan Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X_resampled['text_akhir'])  # Menghasilkan sparse matrix

train_data_bow, temp_data_bow, train_labels_bow, temp_labels_bow = train_test_split(X_bow, y, test_size=0.5, random_state=42)
val_data_bow, test_data_bow, val_labels_bow, test_labels_bow = train_test_split(temp_data_bow, temp_labels_bow, test_size=0.2, random_state=42)

**Word Embedding**

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# Tokenisasi untuk X
max_features = 2500
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_resampled['text_akhir'])
X = tokenizer.texts_to_sequences(X_resampled['text_akhir'])
X = pad_sequences(X)

# Label encoding untuk y (pastikan y_resampled adalah array 1D)
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y_resampled)  # y_resampled harus 1D array
y = to_categorical(y)  # Mengubah ke format categorical setelah encoding

# Menentukan jumlah kelas
num_classes = y.shape[1]

# Memisahkan data latih, validasi, dan uji
train_data_GRU, temp_data_GRU, train_labels_GRU, temp_labels_GRU = train_test_split(X, y, test_size=0.5, random_state=42)
val_data_GRU, test_data_GRU, val_labels_GRU, test_labels_GRU = train_test_split(temp_data_GRU, temp_labels_GRU, test_size=0.2, random_state=42)

## Pelatihan Model

**GRU + Word Embedding**

In [25]:
from tensorflow.keras.callbacks import EarlyStopping

# Menggunakan early stopping untuk menghentikan pelatihan jika akurasi validasi dan uji di atas 86%
class CustomEarlyStopping(EarlyStopping):
    def __init__(self, monitor='val_accuracy', value=0.86, verbose=1, **kwargs):
        super(CustomEarlyStopping, self).__init__(monitor=monitor, verbose=verbose, **kwargs)
        self.value = value

    def on_epoch_end(self, epoch, logs=None):
        current = logs.get(self.monitor)
        if current is None:
            return

        if current >= self.value and logs.get('accuracy') >= self.value:
            self.model.stop_training = True
            if self.verbose > 0:
                print(f'\nEpoch {epoch + 1}: early stopping threshold reached.')

early_stopping = CustomEarlyStopping(monitor='val_accuracy', value=0.86, patience=3, verbose=1)

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Dense, Dropout, GRU, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
# Membangun model GRU
model_GRU = Sequential()
model_GRU.add(Embedding(max_features, 512, input_length=X.shape[1]))
model_GRU.add(SpatialDropout1D(0.3))
model_GRU.add(Bidirectional(GRU(64, return_sequences=True)))
model_GRU.add(Bidirectional(GRU(128, return_sequences=False)))
model_GRU.add(Dense(128, activation='relu',kernel_regularizer=l2(0.01)))
model_GRU.add(Dropout(0.5))
model_GRU.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model_GRU.add(Dropout(0.5))
model_GRU.add(Dense(num_classes, activation='softmax'))
model_GRU.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Melatih model dengan early stopping
batch_size = 32
history_GRU = model_GRU.fit(train_data_GRU, train_labels_GRU, epochs=10, batch_size=batch_size, verbose=2, validation_data=(X_val_GRU, y_val_GRU),callbacks=[early_stopping])

Epoch 1/10





Epoch 1: early stopping threshold reached.
994/994 - 107s - 108ms/step - accuracy: 0.8715 - loss: 0.5549 - val_accuracy: 0.9404 - val_loss: 0.2130


In [34]:
# Prediksi sentimen pada data pelatihan dan data uji
y_pred_train_GRU = model_GRU.predict(train_data_GRU)
y_pred_test_GRU = model_GRU.predict(test_data_GRU)

# Konversi prediksi ke label biner
y_pred_train_GRU = (y_pred_train_GRU > 0.5).astype(int)
y_pred_test_GRU = (y_pred_test_GRU > 0.5).astype(int)

# Evaluasi akurasi model GRU
accuracy_train_GRU = accuracy_score(train_labels_GRU, y_pred_train_GRU)
accuracy_test_GRU = accuracy_score(test_labels_GRU, y_pred_test_GRU)

# Menampilkan akurasi
print('GRU - accuracy_train:', accuracy_train_GRU)
print('GRU - accuracy_test:', accuracy_test_GRU)

# Menampilkan classification report
print('Classification Report:')
print(classification_report(test_labels_GRU, y_pred_test_GRU))

[1m1242/1242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step
GRU - accuracy_train: 0.9393070129085831
GRU - accuracy_test: 0.9294250849163417
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      2588
           1       0.98      0.90      0.94      2675
           2       0.96      0.94      0.95      2686

   micro avg       0.96      0.93      0.94      7949
   macro avg       0.96      0.93      0.94      7949
weighted avg       0.96      0.93      0.94      7949
 samples avg       0.93      0.93      0.93      7949



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**BOW + SVM Model**

In [24]:
# Model SVM dengan BOW
bow_svm_model = SVC(C=10,gamma=0.1)
bow_svm_model.fit(train_data_bow, train_labels_bow)
svm_pred = bow_svm_model.predict(test_data_bow)

In [25]:
# Prediksi sentimen pada data pelatihan dan data uji
y_pred_train_svm = bow_svm_model.predict(train_data_bow)
y_pred_test_svm = bow_svm_model.predict(test_data_bow)

# Evaluasi akurasi model SVM
accuracy_train_svm = accuracy_score(y_pred_train_svm, train_labels_bow)
accuracy_test_svm = accuracy_score(y_pred_test_svm, test_labels_bow)

# Menampilkan akurasi
print('SVM - accuracy_train:', accuracy_train_svm)
print('SVM - accuracy_test:', accuracy_test_svm)
# Evaluasi model
print(classification_report(test_labels_bow, y_pred_test_svm))

SVM - accuracy_train: 0.9992954379607961
SVM - accuracy_test: 0.9399924518807398
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      2588
           1       0.96      0.93      0.95      2675
           2       0.97      0.92      0.94      2686

    accuracy                           0.94      7949
   macro avg       0.94      0.94      0.94      7949
weighted avg       0.94      0.94      0.94      7949



**Random Forest Model + TF-IDF**

In [28]:
# Model Random Forest dengan TF-IDF
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_data_tfidf, train_labels_tfidf)
rf_pred = rf_model.predict(test_data_tfidf)

In [29]:
# Prediksi pada data train dan test
y_pred_train_rf = rf_model.predict(train_data_tfidf)
y_pred_test_rf = rf_model.predict(test_data_tfidf)

# Evaluasi akurasi model Random Forest
accuracy_train_rf = accuracy_score(y_pred_train_rf, train_labels_tfidf)
accuracy_test_rf = accuracy_score(y_pred_test_rf, test_labels_tfidf)

# Menampilkan akurasi
print('Random Forest - accuracy_train:', accuracy_train_rf)
print('Random Forest - accuracy_test:', accuracy_test_rf)

# Evaluasi model
print(classification_report(test_labels_tfidf, y_pred_test_rf))

Random Forest - accuracy_train: 0.9912684632998666
Random Forest - accuracy_test: 0.8998553186135749
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      5213
           1       0.94      0.91      0.92      5422
           2       0.91      0.88      0.90      5262

    accuracy                           0.90     15897
   macro avg       0.90      0.90      0.90     15897
weighted avg       0.90      0.90      0.90     15897



## Inference

In [35]:

# Input kalimat baru dari pengguna
kalimat_baru = "Menurut saya hal ini biasa saja."
            # "Aplikasi ini sangat bagus dan membantu!"
            # "Saya kecewa dengan layanan yang diberikan."
            # "Menurut saya hal ini biasa saja."

# Melakukan preprocessing pada kalimat baru
kalimat_baru_cleaned = cleaningText(kalimat_baru)
kalimat_baru_casefolded = casefoldingText(kalimat_baru_cleaned)
kalimat_baru_slangfixed = fix_slangwords(kalimat_baru_casefolded)
kalimat_baru_tokenized = tokenizingText(kalimat_baru_slangfixed)
kalimat_baru_filtered = filteringText(kalimat_baru_tokenized)
kalimat_baru_final = toSentence(kalimat_baru_filtered)


In [39]:
# Melakukan inference menggunakan model GRU
new_test_gru = tokenizer.texts_to_sequences([kalimat_baru_final])
new_test_gru = pad_sequences(new_test_gru, maxlen=1000)


In [31]:

new_test_tfidf = tfidf_vectorizer.transform([kalimat_baru_final])

In [32]:
new_test_bow = vectorizer.transform([kalimat_baru_final])

In [33]:
new_svm_pred = bow_svm_model.predict(new_test_bow)

# Menampilkan hasil prediksi
if new_svm_pred[0] == 0:
    print("Sentimen kalimat baru adalah Positive.")
elif new_svm_pred[0] == 1:
    print("Sentimen kalimat baru adalah Neutral.")
else:
    print("Sentimen kalimat baru adalah Negative.")

Sentimen kalimat baru adalah Neutral.


In [40]:
new_gru_pred = model_GRU.predict(new_test_gru)
predicted_class = new_gru_pred.argmax(axis=-1)[0]

# Menampilkan hasil prediksi
if predicted_class == 0:
    print("Sentimen kalimat baru adalah Negative.")
elif predicted_class == 1:
    print("Sentimen kalimat baru adalah Neutral.")
else:
    print("Sentimen kalimat baru adalah Positive.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
Sentimen kalimat baru adalah Neutral.


In [35]:
new_rf_pred = rf_model.predict(new_test_tfidf)

# Menampilkan hasil prediksi
if new_rf_pred[0] == 0:
    print("Sentimen kalimat baru adalah Positive.")
elif new_rf_pred[0] == 1:
    print("Sentimen kalimat baru adalah Neutral.")
else:
    print("Sentimen kalimat baru adalah Negative.")

Sentimen kalimat baru adalah Neutral.
