In [62]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tensorflow.keras.layers import Layer



In [63]:
df = pd.read_csv('shopee.csv')
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,a22da68d-4a31-473a-b6a2-a8a95e0511a6,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Terlalu banyak video wanita berpakaian minim p...,1,6,3.47.40,2025-04-13 23:59:25,"Hai kak Ganggaswara Permana, makasih ya buat b...",2025-04-13 21:09:47,3.47.40
1,8ed6168d-b897-4f81-8307-48926edf8984,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,recommended..,5,0,3.47.40,2025-04-13 23:11:13,"Hai kak Dodi , makasih ya buat bintang 5 nya. ...",2025-04-14 00:28:29,3.47.40
2,ebe6a755-2d6d-4d80-85ca-f89b758183fc,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,semoga tambah lebih bagus dan murah juga ya ba...,5,0,3.47.39,2025-04-13 22:56:44,"Hai kak Firda Zhang, makasih bgt komentar dan ...",2025-04-14 00:36:13,3.47.39
3,694eb065-3998-4198-b0c4-71f4adc660c2,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Kenapa tidak ada fitur blokir chat?,2,0,3.47.40,2025-04-13 22:48:35,"Hi kak Burhani Mutiara Nublah , maaf ya terkai...",2025-04-14 00:38:10,3.47.40
4,31cead75-df3b-442a-a99a-8514bd23714e,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"beli token, gak bisa kepake, kadang bisa kadan...",1,0,3.47.40,2025-04-13 22:20:16,"Hai kak Ryu Sent , maaf yaa udh buat ga nyaman...",2025-04-14 00:23:37,3.47.40


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              10000 non-null  object
 1   userName              10000 non-null  object
 2   userImage             10000 non-null  object
 3   content               9999 non-null   object
 4   score                 10000 non-null  int64 
 5   thumbsUpCount         10000 non-null  int64 
 6   reviewCreatedVersion  8090 non-null   object
 7   at                    10000 non-null  object
 8   replyContent          8849 non-null   object
 9   repliedAt             8849 non-null   object
 10  appVersion            8090 non-null   object
dtypes: int64(2), object(9)
memory usage: 859.5+ KB


In [65]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [66]:
df = df[df['score'] != 3]

# Label: 0 = negatif, 1 = positif
df['sentiment'] = df['score'].apply(lambda x: 0 if x <= 2 else 1)

# Cek distribusi label
df['sentiment'].value_counts()

1    4042
0    2804
Name: sentiment, dtype: int64

In [67]:
# Inisialisasi stopword remover
factory = StopWordRemoverFactory()
stopwords = set(factory.get_stop_words())

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    text = re.sub(r"\s+", ' ', text).strip()
    return ' '.join([word for word in text.split() if word not in stopwords])

df['clean_text'] = df['content'].astype(str).apply(clean_text)


In [68]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded = pad_sequences(sequences, maxlen=150, padding='post')

In [69]:
tokenized_texts = [text.split() for text in df['clean_text']]
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

In [81]:
vocab_size = min(len(tokenizer.word_index) + 1, 10000)
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
    else:
        embedding_matrix[i] = np.random.normal(0, 0.01, embedding_dim)

In [82]:
class Attention(Layer):
    def __init__(self):
        super(Attention, self).__init__()

    def call(self, inputs):
        score = tf.nn.tanh(inputs)
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * inputs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector

In [83]:
X_train, X_test, y_train, y_test = train_test_split(
    padded, df['sentiment'], test_size=0.2, random_state=42, stratify=df['sentiment']
)

In [84]:
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              embeddings_initializer=Constant(embedding_matrix),
              trainable=True),

    Bidirectional(GRU(128, return_sequences=True)),
    Bidirectional(GRU(64, return_sequences=True)),  # ✅ penting!
    Attention(),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [85]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=32,  
    callbacks=[early_stop]
)

Epoch 1/15
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 132ms/step - accuracy: 0.7348 - loss: 0.5437 - val_accuracy: 0.7898 - val_loss: 0.4638
Epoch 2/15
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 128ms/step - accuracy: 0.8197 - loss: 0.4143 - val_accuracy: 0.8241 - val_loss: 0.5136
Epoch 3/15
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 131ms/step - accuracy: 0.9184 - loss: 0.2125 - val_accuracy: 0.9066 - val_loss: 0.2572
Epoch 4/15
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 129ms/step - accuracy: 0.9629 - loss: 0.1147 - val_accuracy: 0.9015 - val_loss: 0.3581
Epoch 5/15
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 136ms/step - accuracy: 0.9825 - loss: 0.0611 - val_accuracy: 0.8993 - val_loss: 0.3869
Epoch 6/15
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 133ms/step - accuracy: 0.9901 - loss: 0.0361 - val_accuracy: 0.8912 - val_loss: 0.3995


In [86]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Akurasi Uji: {acc*100:.2f}%")

[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 72ms/step - accuracy: 0.9094 - loss: 0.2445
Akurasi Uji: 90.66%
