In [206]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.models import Sequential

In [207]:
df = pd.read_csv(r'dataset\balance.csv')
df.head()

Unnamed: 0,Rating,Description
0,5,Klenteng Ban Hin Kiong merupakan Klenteng tert...
1,5,Airnya sejuk. Tempatnya bebas plastik. Bagi ya...
2,5,Tiap minggu pasti kesini buat foto2 doang😁 kar...
3,5,"Pernah kesana pergi liat bunker jepang, naik p..."
4,5,"Mengikuti Talkshow "" Menyingkap Pesona Wastra ..."


In [208]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 766 entries, 0 to 765
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Rating       766 non-null    int64 
 1   Description  746 non-null    object
dtypes: int64(1), object(1)
memory usage: 12.1+ KB


In [209]:
df['Rating'].unique()

array([5, 4, 3, 2, 1], dtype=int64)

In [210]:
map_label = {1 : -1,
             2 : -1,
             3: 0,
             4 : 1,
             5 : 1}
df['Rating'] = df['Rating'].map(map_label)

In [211]:
df['Rating'].unique()

array([ 1,  0, -1], dtype=int64)

In [212]:
df['Rating'].value_counts()

Rating
 1    320
-1    286
 0    160
Name: count, dtype: int64

In [213]:
df['Description'] = df['Description'].astype(str)
df['Description'] = df['Description'].str.lower()


In [214]:
# memisahkan text dan label

text = df['Description'].tolist()
label = df['Rating'].tolist()

In [215]:
# parameter
vocab_size = 10000
max_length = 200
oov_tok = '<OOV>'
num_classes = 3
embedding_dim = 128
padding_type = 'post'
trunc_type = 'post'

In [216]:
# tokenisasi
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index

In [217]:
# ubah ke sequence
sequence = tokenizer.texts_to_sequences(text)
padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [218]:
# split train dan test nya
X_train, X_test, y_train, y_test = train_test_split(padded, label, test_size=0.1, random_state=42)

y_train = np.array(y_train)
y_test = np.array(y_test)

In [219]:
print(f'X_train shape : {X_train.shape}')
print(f'X_test shape : {X_test.shape}')
print(f'y_train shape : {y_train.shape}')
print(f'y_test shape : {y_test.shape}')

X_train shape : (689, 200)
X_test shape : (77, 200)
y_train shape : (689,)
y_test shape : (77,)


In [220]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(32, return_sequences=True),
    LSTM(32),
    Dense(24, activation='relu'),
    Dense(3, activation='softmax')  # Menggunakan softmax untuk klasifikasi tiga kelas
])



model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), verbose=2)

Epoch 1/10
22/22 - 5s - loss: nan - accuracy: 0.2134 - val_loss: nan - val_accuracy: 0.2338 - 5s/epoch - 223ms/step
Epoch 2/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val_loss: nan - val_accuracy: 0.2338 - 931ms/epoch - 42ms/step
Epoch 3/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val_loss: nan - val_accuracy: 0.2338 - 937ms/epoch - 43ms/step
Epoch 4/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val_loss: nan - val_accuracy: 0.2338 - 886ms/epoch - 40ms/step
Epoch 5/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val_loss: nan - val_accuracy: 0.2338 - 883ms/epoch - 40ms/step
Epoch 6/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val_loss: nan - val_accuracy: 0.2338 - 876ms/epoch - 40ms/step
Epoch 7/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val_loss: nan - val_accuracy: 0.2338 - 886ms/epoch - 40ms/step
Epoch 8/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val_loss: nan - val_accuracy: 0.2338 - 868ms/epoch - 39ms/step
Epoch 9/10
22/22 - 1s - loss: nan - accuracy: 0.2061 - val

<keras.callbacks.History at 0x14a71a577f0>