In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from gensim.models import HdpModel, CoherenceModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Muhammad Ade
[nltk_data]     Aulia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Muhammad Ade
[nltk_data]     Aulia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load data from CSV file
df = pd.read_csv('../data/dataHasilPreprocessing/dataPreprocessing.csv')  # Ganti dengan path file Anda
df = df.drop_duplicates(subset=['Ulasan'])
df = df.dropna()
df

Unnamed: 0,Ulasan,Sentimen
0,ulas,Negatif
2,bahan aja warna navy nya beda,Negatif
3,kasih catat order warna kuning navy baca kirim...,Negatif
4,kecil,Negatif
5,tau layan,Negatif
...,...,...
989,cocok,Negatif
990,barang selamat terima kasih bahan celana nya s...,Negatif
992,pesan cuna barang rusak coba konfirmasi tanggap,Negatif
993,karet pinggang nya kencang,Negatif


In [3]:
# 2. Text Vectorization
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Ulasan'])

In [4]:
# Label Encoding
le = LabelEncoder()
y = le.fit_transform(df['Sentimen'])

In [5]:
# 3. Ekstraksi Fitur (Hierarchical Dirichlet Process)
texts = [ulasan.split() for ulasan in df['Ulasan']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
hdp_model = HdpModel(corpus, id2word=dictionary)

In [6]:
# Menampilkan topik HDP
topik_hdp = hdp_model.show_topics(formatted=False)
print("Topik HDP:")
for topik in topik_hdp:
    print(topik)

Topik HDP:
(0, [('trmks', 0.004900698466158639), ('dadak', 0.004596919021991252), ('nyerep', 0.004489045659941597), ('order', 0.004190719515538947), ('you', 0.004027588570059528), ('utknini', 0.0039443486837279105), ('wajib', 0.003864449749293915), ('kait', 0.0038470701339477467), ('ademmm', 0.003555823090265283), ('menrt', 0.0035422246729427775), ('gt', 0.0035242686330645926), ('bulu', 0.0035034617762166277), ('keduaaa', 0.003495812359586424), ('hrga', 0.003400973852302613), ('rapihh', 0.0033920581781943463), ('confidence', 0.0033664558973947436), ('dapet', 0.0033239449931626707), ('video', 0.003299944776740896), ('maroom', 0.0032539436613667483), ('boarding', 0.0032293967351226948)])
(1, [('kuat', 0.006161014085006933), ('ta', 0.004816858441857924), ('sen', 0.004636780074283628), ('depan', 0.004373089111185836), ('coklat', 0.0041441683662584205), ('excellent', 0.004053029319523436), ('bet', 0.00402148327950938), ('wkwkwk', 0.00395143577876592), ('dikit', 0.003730888905785321), ('bala

In [7]:
# Menghitung skor koherensi
coherence_model_hdp = CoherenceModel(model=hdp_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_scores = coherence_model_hdp.get_coherence_per_topic()

# Membuat DataFrame untuk skor koherensi
num_topics = list(range(1, len(coherence_scores) + 1))
coherence_df = pd.DataFrame({'Jumlah Topik': num_topics, 'Skor Coherence': coherence_scores})

print("\nSkor Koherensi untuk tiap topik:")
coherence_df.head(20)


Skor Koherensi untuk tiap topik:


Unnamed: 0,Jumlah Topik,Skor Coherence
0,1,0.750271
1,2,0.789225
2,3,0.78651
3,4,0.757792
4,5,0.766993
5,6,0.783159
6,7,0.774918
7,8,0.772999
8,9,0.783844
9,10,0.796146


In [8]:
# Mengonversi HDP menjadi fitur
X_hdp = np.zeros((len(corpus), hdp_model.m_T))
for i, doc in enumerate(corpus):
    for topic_id, weight in hdp_model[doc]:
        if topic_id < hdp_model.m_T:
            X_hdp[i, topic_id] = weight

In [9]:
# 4. Klasifikasi menggunakan SVM
X_train, X_test, y_train, y_test = train_test_split(X_hdp, y, test_size=0.2, random_state=42)
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [10]:
# 5. Evaluasi classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Classification Report:
              precision    recall  f1-score   support

     Negatif       0.00      0.00      0.00        29
     Positif       0.85      1.00      0.92       161

    accuracy                           0.85       190
   macro avg       0.42      0.50      0.46       190
weighted avg       0.72      0.85      0.78       190



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
