In [1]:
import pandas as pd
from gensim.models import HdpModel
from gensim import corpora
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

In [3]:
# Baca data
df = pd.read_csv('../data/dataHasilPreprocessing/hasilPreprocessing1.csv')
df['Sentimen'] = df['Sentimen'].map({'Positif': 1, 'Negatif': 0})
df

Unnamed: 0,Ulasan,Sentimen
0,ulas,0
1,,0
2,bahan aja warna navy nya beda,0
3,kasih catat order warna kuning navy baca kirim...,0
4,kecil,0
...,...,...
990,barang selamat terima kasih bahan celana nya s...,0
991,kecil,0
992,pesan cuna barang rusak coba konfirmasi tanggap,0
993,karet pinggang nya kencang,0


In [4]:
# Hapus duplikat dan nilai yang hilang
df = df.drop_duplicates(subset=['Ulasan'])
df = df.dropna()

In [5]:
tokenized_reviews = [review.lower().split() for review in df['Ulasan']]
dictionary = corpora.Dictionary(tokenized_reviews)
corpus = [dictionary.doc2bow(review) for review in tokenized_reviews]

In [6]:
hdp_model = HdpModel(corpus, dictionary)
num_topics = 5  # Misalnya, kita akan menggunakan 5 topik

In [7]:
# Mendapatkan distribusi topik untuk setiap ulasan
topic_distributions = [hdp_model[doc] for doc in corpus]

# Mendapatkan bobot topik untuk setiap ulasan
aspect_list = []
for dist in topic_distributions:
    aspect = max(dist, key=lambda x: x[1])[0]  # Ambil topik dengan bobot terbesar
    aspect_list.append(aspect)

In [8]:
# Tambahkan atribut aspek ke dalam dataframe
df['Aspect'] = aspect_list

In [9]:
# Model BERT
model_name = 'indobenchmark/indobert-base-p1'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Pra-pemrosesan data untuk BERT
reviews = df['Ulasan'].tolist()
labels = df['Sentimen'].tolist()

In [11]:
max_length = 128
input_ids = []
attention_masks = []

# Tokenisasi data
for review in reviews:
    encoded_dict = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_length,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='tf'
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
input_ids = tf.concat(input_ids, axis=0)
attention_masks = tf.concat(attention_masks, axis=0)
labels = tf.convert_to_tensor(labels)

In [13]:
# Split data menjadi train dan test sets
train_indices, test_indices = train_test_split(range(len(input_ids)), test_size=0.2, random_state=42)
train_indices = tf.convert_to_tensor(train_indices, dtype=tf.int32)
test_indices = tf.convert_to_tensor(test_indices, dtype=tf.int32)

train_input_ids = tf.gather(input_ids, train_indices)
train_attention_masks = tf.gather(attention_masks, train_indices)
train_labels = tf.gather(labels, train_indices)

test_input_ids = tf.gather(input_ids, test_indices)
test_attention_masks = tf.gather(attention_masks, test_indices)
test_labels = tf.gather(labels, test_indices)

In [14]:
# Konfigurasi pelatihan
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [15]:
# Latih model
batch_size = 16
epochs = 15

history = model.fit(
    [train_input_ids, train_attention_masks],
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([test_input_ids, test_attention_masks], test_labels)
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [16]:
# Evaluasi model
model.evaluate([test_input_ids, test_attention_masks], test_labels)



[0.6652217507362366, 0.878947377204895]

In [17]:
# Buat prediksi
test_predictions = model.predict([test_input_ids, test_attention_masks])
predicted_labels = tf.argmax(test_predictions.logits, axis=1)



In [20]:
# Tampilkan aspek dalam dataframe
df_aspect = pd.DataFrame({'Aspect': aspect_list, 'Sentimen': df['Sentimen'], 'Ulasan': df['Ulasan']})
df_aspect

Unnamed: 0,Aspect,Sentimen,Ulasan
0,76,0,ulas
2,147,0,bahan aja warna navy nya beda
3,135,0,kasih catat order warna kuning navy baca kirim...
4,104,0,kecil
5,41,0,tau layan
...,...,...,...
989,61,0,cocok
990,58,0,barang selamat terima kasih bahan celana nya s...
992,99,0,pesan cuna barang rusak coba konfirmasi tanggap
993,81,0,karet pinggang nya kencang


In [21]:
# Tampilkan laporan klasifikasi
classification_rep = classification_report(test_labels, predicted_labels)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.50      0.35      0.41        23
           1       0.91      0.95      0.93       167

    accuracy                           0.88       190
   macro avg       0.71      0.65      0.67       190
weighted avg       0.86      0.88      0.87       190

