In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.metrics import classification_report, confusion_matrix

print("Libraries imported successfully!")

Libraries imported successfully!


In [6]:
# Load Data
file_path = r"D:\Projects\tentier-streamlit\data\PRDECT-ID Dataset.csv"
df = pd.read_csv(file_path)
print(df.head())
print(df['Sentiment'].value_counts())

                Category                                       Product Name  \
0  Computers and Laptops  Wireless Keyboard i8 Mini TouchPad Mouse 2.4G ...   
1  Computers and Laptops  PAKET LISENSI WINDOWS 10 PRO DAN OFFICE 2019 O...   
2  Computers and Laptops                SSD Midasforce 128 Gb - Tanpa Caddy   
3  Computers and Laptops  ADAPTOR CHARGER MONITOR LCD LED TV LG merek LG...   
4  Computers and Laptops  ADAPTOR CHARGER MONITOR LCD LED TV LG merek LG...   

                 Location   Price  Overall Rating  Number Sold  Total Review  \
0           Jakarta Utara   53500             4.9         5449          2369   
1  Kota Tangerang Selatan   72000             4.9         2359          1044   
2           Jakarta Barat  213000             5.0        12300          3573   
3           Jakarta Timur   55000             4.7         2030           672   
4           Jakarta Timur   55000             4.7         2030           672   

   Customer Rating                          

In [7]:
# Preprocessing
df = df[['Customer Review', 'Sentiment']].dropna()
X = df['Customer Review'].astype(str)
y = df['Sentiment']

# Encode Labels
le = LabelEncoder()
y = le.fit_transform(y)
print("Classes:", le.classes_)

# Tokenization
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=max_len)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Classes: ['Negative' 'Positive']


In [8]:
# Build LSTM Model
embedding_dim = 128
num_classes = len(np.unique(y))

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

if num_classes == 2:
    model.add(Dense(1, activation='sigmoid'))
    loss = 'binary_crossentropy'
else:
    model.add(Dense(num_classes, activation='softmax'))
    loss = 'sparse_categorical_crossentropy'

model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
print(model.summary())



None


In [9]:
# Train Model
history = model.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_test, y_test), verbose=1)

Epoch 1/15
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 126ms/step - accuracy: 0.6800 - loss: 0.6021 - val_accuracy: 0.9000 - val_loss: 0.2547
Epoch 2/15
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 110ms/step - accuracy: 0.9390 - loss: 0.1739 - val_accuracy: 0.9324 - val_loss: 0.1688
Epoch 3/15
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 110ms/step - accuracy: 0.9770 - loss: 0.0861 - val_accuracy: 0.9380 - val_loss: 0.1528
Epoch 4/15
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 109ms/step - accuracy: 0.9868 - loss: 0.0468 - val_accuracy: 0.9426 - val_loss: 0.1591
Epoch 5/15
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 107ms/step - accuracy: 0.9911 - loss: 0.0325 - val_accuracy: 0.9444 - val_loss: 0.1680
Epoch 6/15
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 115ms/step - accuracy: 0.9947 - loss: 0.0219 - val_accuracy: 0.9472 - val_loss: 0.1736
Epoch 7/15
[1m68/68[0m [

In [10]:
# Evaluate Model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy}')

y_pred = model.predict(X_test)
if num_classes == 2:
    y_pred_classes = (y_pred > 0.5).astype(int)
else:
    y_pred_classes = np.argmax(y_pred, axis=1)

print(classification_report(y_test, y_pred_classes, target_names=le.classes_))

Test Accuracy: 0.9490740895271301
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
              precision    recall  f1-score   support

    Negative       0.95      0.95      0.95       557
    Positive       0.94      0.95      0.95       523

    accuracy                           0.95      1080
   macro avg       0.95      0.95      0.95      1080
weighted avg       0.95      0.95      0.95      1080



In [None]:
# Plot Training History
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss')
plt.show()

In [12]:
# Export Model and Tokenizer
import pickle
import os

models_dir = r'D:\Projects\tentier-streamlit\models'

# Save Model
model.save(os.path.join(models_dir, 'sentiment_model.h5'))
print(f"Model saved to {os.path.join(models_dir, 'sentiment_model.h5')}")

# Save Tokenizer
with open(os.path.join(models_dir, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"Tokenizer saved to {os.path.join(models_dir, 'tokenizer.pickle')}")



Model saved to D:\Projects\tentier-streamlit\models\sentiment_model.h5
Tokenizer saved to D:\Projects\tentier-streamlit\models\tokenizer.pickle
