In [2]:
!pip install sastrawi



In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
seed = 0
np.random.seed(seed)

from keras.layers import Dense, LeakyReLU, SimpleRNN, LSTM
from keras.models import Sequential
from keras.optimizers import Adam
from keras import models
from keras import layers

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import datetime as dt
import string
import re

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

import csv
import requests
from io import StringIO




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayaw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayaw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jayaw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


# Memuat Dataset

In [4]:
app_reviews_df = pd.read_csv('dataset.csv')

In [5]:
jumlah_ulasan, jumlah_kolom = app_reviews_df.shape

In [6]:
app_reviews_df.head()

Unnamed: 0,Review
0,"Rusak scan,Qris sulit terbaca kadang tidak bis..."
1,"Apk udah bagus tapi masih berasa berat , kalau..."
2,Aplikasinya sangat bagus diawal2 nya terutama ...
3,Animasi promo yang kalian buat membuat aplikas...
4,aplikasi makin kesini makin lemot saja... pada...


# Menghapus Data Kosong dan Data Duplikat

In [7]:
app_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126000 entries, 0 to 125999
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Review  126000 non-null  object
dtypes: object(1)
memory usage: 984.5+ KB


In [8]:
clean_df = app_reviews_df.dropna()

In [9]:
clean_df.drop_duplicates(inplace=True)

In [10]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124209 entries, 0 to 125999
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Review  124209 non-null  object
dtypes: object(1)
memory usage: 1.9+ MB


In [11]:
jumlah_ulasan_bersih, jumlah_kolom_bersih = clean_df.shape

#Preprocessing Text

In [12]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'RT[\s]', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip(' ')

    return text

In [13]:
def casefoldingText(text):
    text = text.lower()
    return text

In [14]:
def tokenizingText(text):
    text = word_tokenize(text)
    return text

In [15]:
def filteringText(text):
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku','di','ga','ya','gaa','loh','kah','woi','woii','woy'])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

In [16]:
def stemmingText(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    words = text.split()

    stemmed_words = [stemmer.stem(word) for word in words]

    stemmed_text = ' '.join(stemmed_words)

    return stemmed_text

In [17]:
def toSentence(list_words):
    sentence = ' '.join(word for word in list_words)
    return sentence

In [18]:
slangwords = {'@': 'di', 'abis': 'habis', 'wtb': 'beli', 'masi': 'masih', 'wts': 'jual', 'wtt': 'tukar', 'bgt': 'banget', 'maks': 'maksimal'}

def fix_slangwords(text):
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text

In [19]:
clean_df['text_clean'] = clean_df['Review'].apply(cleaningText)
clean_df['text_casefoldingText'] = clean_df['text_clean'].apply(casefoldingText)
clean_df['text_slangwords'] = clean_df['text_casefoldingText'].apply(fix_slangwords)
clean_df['text_tokenizingText'] = clean_df['text_slangwords'].apply(tokenizingText)
clean_df['text_stopwords'] = clean_df['text_tokenizingText'].apply(filteringText)
clean_df['final_text'] = clean_df['text_stopwords'].apply(toSentence)

In [20]:
clean_df.head()

Unnamed: 0,Review,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopwords,final_text
0,"Rusak scan,Qris sulit terbaca kadang tidak bis...",Rusak scanQris sulit terbaca kadang tidak bisa...,rusak scanqris sulit terbaca kadang tidak bisa...,rusak scanqris sulit terbaca kadang tidak bisa...,"[rusak, scanqris, sulit, terbaca, kadang, tida...","[rusak, scanqris, sulit, terbaca, kadang, ngeb...",rusak scanqris sulit terbaca kadang ngebaca su...
1,"Apk udah bagus tapi masih berasa berat , kalau...",Apk udah bagus tapi masih berasa berat kalau ...,apk udah bagus tapi masih berasa berat kalau ...,apk udah bagus tapi masih berasa berat kalau b...,"[apk, udah, bagus, tapi, masih, berasa, berat,...","[apk, udah, bagus, berasa, berat, latar, gamba...",apk udah bagus berasa berat latar gambar apk g...
2,Aplikasinya sangat bagus diawal2 nya terutama ...,Aplikasinya sangat bagus diawal nya terutama t...,aplikasinya sangat bagus diawal nya terutama t...,aplikasinya sangat bagus diawal nya terutama t...,"[aplikasinya, sangat, bagus, diawal, nya, teru...","[aplikasinya, bagus, diawal, transaksinya, mem...",aplikasinya bagus diawal transaksinya membantu...
3,Animasi promo yang kalian buat membuat aplikas...,Animasi promo yang kalian buat membuat aplikas...,animasi promo yang kalian buat membuat aplikas...,animasi promo yang kalian buat membuat aplikas...,"[animasi, promo, yang, kalian, buat, membuat, ...","[animasi, promo, aplikasi, patah, patah, mengs...",animasi promo aplikasi patah patah mengscroll ...
4,aplikasi makin kesini makin lemot saja... pada...,aplikasi makin kesini makin lemot saja padahal...,aplikasi makin kesini makin lemot saja padahal...,aplikasi makin kesini makin lemot saja padahal...,"[aplikasi, makin, kesini, makin, lemot, saja, ...","[aplikasi, kesini, lemot, koneksi, bagus, siny...",aplikasi kesini lemot koneksi bagus sinyal man...


# Memberi Label pada Data

In [21]:
lexicon_positive = dict()
lexicon_negative = dict()

In [22]:
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')

In [23]:
if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')

    for row in reader:
        lexicon_positive[row[0]] = int(row[1])

else:
    print('Gagal')

In [24]:
if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')

    for row in reader:
        lexicon_negative[row[0]] = int(row[1])

else:
    print('Gagal')

In [25]:
def sentiment_analysis_lexicon_indonesia(text):
    score = 0

    for word in text:
        if(word in lexicon_positive):
            score = score + lexicon_positive[word]

    for word in text:
        if(word in lexicon_negative):
            score = score + lexicon_negative[word]

    polarity=''

    if(score>0):
        polarity = 'positive'
    elif(score<0):
        polarity = 'negative'
    else:
        polarity = 'neutral'

    return score, polarity

In [26]:
results = clean_df['text_stopwords'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
clean_df['polarity_score'] = results[0]
clean_df['polarity'] = results[1]
print(clean_df['polarity'].value_counts())

polarity
negative    117321
neutral       6888
Name: count, dtype: int64


# Data Splitting dan Ektraksi Fitur

In [27]:
x = clean_df['final_text']
y = clean_df['polarity']

In [28]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [29]:
x_train, x_temp, y_train, y_temp = train_test_split(x, y_encoded, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [30]:
tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8)

In [31]:
x_train_tfidf = tfidf.fit_transform(x_train)
x_val_tfidf = tfidf.transform(x_val)
x_test_tfidf = tfidf.transform(x_test)

In [34]:
features_df = pd.DataFrame(x_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())

In [35]:
features_df

Unnamed: 0,admin,aja,aktif,akun,alasan,aman,aneh,apapun,apk,aplikasi,...,ulang,update,upgrade,verifikasi,versi,via,voucher,wa,wajah,yg
0,0.0,0.000000,0.0,0.0,0.0,0.335198,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.222624
1,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
2,0.0,0.215716,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.318042,...,0.338364,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.207114,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99362,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.233016,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
99363,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.262073,...,0.000000,0.394254,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
99364,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.159598,...,0.000000,0.000000,0.0,0.0,0.0,0.173676,0.0,0.0,0.0,0.000000
99365,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.165260,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000


# Pembuatan Model

In [36]:
model = Sequential([
    LSTM(8, return_sequences=True, input_shape=(1, x_train_tfidf.shape[1])),
    LSTM(16, return_sequences=True),
    LSTM(32),
    Dense(16),
    LeakyReLU(),
    Dense(3, activation='softmax')
])




In [37]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])

In [38]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 8)              6688      
                                                                 
 lstm_1 (LSTM)               (None, 1, 16)             1600      
                                                                 
 lstm_2 (LSTM)               (None, 32)                6272      
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 51        
                                                                 
Total params: 15139 (59.14 KB)
Trainable params: 15139 (

# Pelatihan Model

In [46]:
x_train_reshaped = np.reshape(x_train_tfidf.toarray(), (x_train_tfidf.shape[0], 1, x_train_tfidf.shape[1]))
x_val_reshaped = np.reshape(x_val_tfidf.toarray(), (x_val_tfidf.shape[0], 1, x_val_tfidf.shape[1]))

model.fit(x_train_reshaped, y_train, epochs=10, validation_data=(x_val_reshaped, y_val))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x243520c72e0>

# Prediksi Model

In [47]:
x_test_reshaped = np.reshape(x_test_tfidf.toarray(), (x_test_tfidf.shape[0], 1, x_test_tfidf.shape[1]))

In [48]:
y_pred = model.predict(x_test_reshaped)



# Evaluasi Model

In [49]:
y_pred_classes = np.argmax(y_pred, axis=1)

In [50]:
accuracy = round(accuracy_score(y_pred_classes, y_test), 4)

In [51]:
print('Akurasi Model LSTM :', accuracy)

Akurasi Model LSTM : 0.9609


In [52]:
print(classification_report(y_test, y_pred_classes))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     11747
           1       0.65      0.60      0.63       674

    accuracy                           0.96     12421
   macro avg       0.81      0.79      0.80     12421
weighted avg       0.96      0.96      0.96     12421



In [53]:
confusion_matrix(y_test, y_pred_classes)

array([[11528,   219],
       [  267,   407]], dtype=int64)

# Contoh Input

In [54]:
test_sample = x_test.sample(n=5)
test_sample

5510     nih apk ngeselin coba hp gk root muncul hp roo...
80846    tolong diperbaiki transfer sibuk klo topup uan...
62500    aman aman aja yah saldo kosong mantap lanjutin...
32308                                  membantu bermanfaat
93761    promo tp susah transaksinya aja bohong perbaik...
Name: final_text, dtype: object

In [55]:
test_sample_tfidf = tfidf.transform(test_sample)
test_sample_reshaped = np.reshape(test_sample_tfidf.toarray(), (test_sample_tfidf.shape[0], 1, test_sample_tfidf.shape[1]))

In [56]:
predicted_values = model.predict(test_sample_reshaped)
predicted_classes = np.argmax(predicted_values, axis=1)
sentiment = encoder.inverse_transform(predicted_classes)



In [57]:
pd.set_option('display.max_colwidth', None)

pd.DataFrame({'text': test_sample, 'sentiment': sentiment})

Unnamed: 0,text,sentiment
5510,nih apk ngeselin coba hp gk root muncul hp root memudahkan menyusahkan,negative
80846,tolong diperbaiki transfer sibuk klo topup uang sampe tertahan sehari menganggu,negative
62500,aman aman aja yah saldo kosong mantap lanjutin dana orang orang yg komen upgrade premium yah lemot,negative
32308,membantu bermanfaat,negative
93761,promo tp susah transaksinya aja bohong perbaiki alasannya signal lemah truss promo tp pake alias trouble teruss kecewa,negative
