In [1]:
import pandas as pd
import re
import string
import nltk
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('shopee.csv')
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,a22da68d-4a31-473a-b6a2-a8a95e0511a6,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Terlalu banyak video wanita berpakaian minim p...,1,6,3.47.40,2025-04-13 23:59:25,"Hai kak Ganggaswara Permana, makasih ya buat b...",2025-04-13 21:09:47,3.47.40
1,8ed6168d-b897-4f81-8307-48926edf8984,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,recommended..,5,0,3.47.40,2025-04-13 23:11:13,"Hai kak Dodi , makasih ya buat bintang 5 nya. ...",2025-04-14 00:28:29,3.47.40
2,ebe6a755-2d6d-4d80-85ca-f89b758183fc,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,semoga tambah lebih bagus dan murah juga ya ba...,5,0,3.47.39,2025-04-13 22:56:44,"Hai kak Firda Zhang, makasih bgt komentar dan ...",2025-04-14 00:36:13,3.47.39
3,694eb065-3998-4198-b0c4-71f4adc660c2,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Kenapa tidak ada fitur blokir chat?,2,0,3.47.40,2025-04-13 22:48:35,"Hi kak Burhani Mutiara Nublah , maaf ya terkai...",2025-04-14 00:38:10,3.47.40
4,31cead75-df3b-442a-a99a-8514bd23714e,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"beli token, gak bisa kepake, kadang bisa kadan...",1,0,3.47.40,2025-04-13 22:20:16,"Hai kak Ryu Sent , maaf yaa udh buat ga nyaman...",2025-04-14 00:23:37,3.47.40


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              10000 non-null  object
 1   userName              10000 non-null  object
 2   userImage             10000 non-null  object
 3   content               9999 non-null   object
 4   score                 10000 non-null  int64 
 5   thumbsUpCount         10000 non-null  int64 
 6   reviewCreatedVersion  8090 non-null   object
 7   at                    10000 non-null  object
 8   replyContent          8849 non-null   object
 9   repliedAt             8849 non-null   object
 10  appVersion            8090 non-null   object
dtypes: int64(2), object(9)
memory usage: 859.5+ KB


In [4]:
df.dropna(subset=['content'], inplace=True)
df.drop_duplicates(subset=['content'], inplace=True)

In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [6]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return ' '.join(words)

df['cleaned'] = df['content'].apply(clean_text)


In [7]:
def label_sentiment(score):
    if score >= 4:
        return 'positif'
    elif score == 3:
        return 'netral'
    else:
        return 'negatif'

df['sentiment'] = df['score'].apply(label_sentiment)

In [8]:
df['sentiment'].value_counts()

negatif    4224
positif    3959
netral      455
Name: sentiment, dtype: int64

In [9]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])  # 0: negative, 1: neutral, 2: positive
y = to_categorical(df['label'], num_classes=3)

In [10]:
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned'])

sequences = tokenizer.texts_to_sequences(df['cleaned'])
X = pad_sequences(sequences, maxlen=max_len, padding='post')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [15]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # 3 kelas

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [16]:
early_stop = EarlyStopping(monitor='val_loss', patience=2)

history = model.fit(X_train, y_train, epochs=8, batch_size=32,
                    validation_split=0.2, callbacks=[early_stop])


Epoch 1/8
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 103ms/step - accuracy: 0.4612 - loss: 0.9393 - val_accuracy: 0.4783 - val_loss: 0.8867
Epoch 2/8
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 101ms/step - accuracy: 0.4818 - loss: 0.8687 - val_accuracy: 0.4783 - val_loss: 0.8944
Epoch 3/8
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 102ms/step - accuracy: 0.4938 - loss: 0.8561 - val_accuracy: 0.4595 - val_loss: 0.8863
Epoch 4/8
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 101ms/step - accuracy: 0.4713 - loss: 0.8754 - val_accuracy: 0.4783 - val_loss: 0.8837
Epoch 5/8
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 101ms/step - accuracy: 0.4869 - loss: 0.8574 - val_accuracy: 0.4783 - val_loss: 0.8848
Epoch 6/8
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 103ms/step - accuracy: 0.4857 - loss: 0.8528 - val_accuracy: 0.4595 - val_loss: 0.8835
Epoch 7/8
[1m17

In [17]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Akurasi testing set: {accuracy * 100:.2f}%")


[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.4693 - loss: 0.8652
Akurasi testing set: 48.90%
