In [4]:
import re
import string
import unicodedata

import joblib
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [5]:
# Read the raw, unlabeled competition data (semicolon-separated file).
data = pd.read_csv("data/dataset_unlabeled_penyisihan_bdc_2024.csv", sep=";")

In [6]:
# Print a summary of the raw frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   IDText  1000 non-null   object
 1   Text    1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [7]:

# Text-cleaning function (no stemming/lemmatisation is done here).
def clean_text(text):
    """Normalise one raw document into lowercase, ASCII-only, punctuation-free text.

    Steps, in order: fold Unicode compatibility forms and diacritics to
    ASCII, lowercase, drop a leading "rt"/"via"/"cc" marker, remove
    @mentions, URLs, hashtags, [bracketed] segments and tokens containing
    "=", replace any remaining non-ASCII run with a space, turn "&" into
    the word "dan", strip ASCII punctuation and collapse whitespace.

    Args:
        text: Raw document. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty document.

    Returns:
        The cleaned string (possibly empty).

    NOTE(review): earlier revisions replaced non-ASCII characters *before*
    the Unicode normalisation and removed "&" *before* mapping it to "dan",
    so neither step had any effect. Output therefore differs from those
    revisions for inputs containing "&" or accented/styled characters; any
    vectorizer or model fitted on the old output should be refit with this
    function so training and inference preprocessing stay aligned.
    """
    # Missing values would otherwise be stringified into the tokens
    # "none" / "nan" and survive as fake words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # 1. Fold compatibility forms (NFKD) and drop combining marks so that
    #    accented letters keep their base letter ("é" -> "e"). This has to
    #    happen before non-ASCII characters are blanked out below.
    text = unicodedata.normalize('NFKD', str(text))
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))
    text = text.lower()

    # 2. Remove a leading rt / via / cc marker
    text = re.sub(r'^(?:rt|via|cc)\b', '', text).strip()

    # 3. Remove @username mentions
    text = re.sub(r'@\w+', '', text)

    # 4. Remove URLs
    text = re.sub(r'http\S+', '', text)

    # 5. Remove hashtags
    text = re.sub(r'#\S+', '', text)

    # 6. Remove bracketed segments such as [RE ...]
    text = re.sub(r'\[.*?\]', '', text)

    # 7. Remove tokens containing "=" (stray encoded fragments such as +ECNv...=)
    text = re.sub(r'\S*=\S*', '', text)

    # 8. Whatever is still non-ASCII (emoji, symbols) becomes a space so the
    #    neighbouring words stay separated
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 9. Replace "&" with "dan"; must run before punctuation stripping,
    #    because "&" is itself part of string.punctuation
    text = text.replace("&", " dan ")

    # 10. Remove ASCII punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # 11. Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every raw document and store the result in a new column.
data['clean_text'] = data['Text'].map(clean_text)


In [8]:
# Drop rows whose 'clean_text' is empty or whitespace-only.
# .copy() turns the filtered selection into an independent DataFrame, so
# assigning new columns to `data` afterwards does not trigger pandas'
# SettingWithCopyWarning.
data = data[data['clean_text'].astype(str).str.strip() != ""].copy()

In [9]:
# Build a single stemmer instance up front; stem_text reuses it for every row.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stem_text(text):
    """Return `text` after running it through the module-level `stemmer`."""
    stemmed = stemmer.stem(text)
    return stemmed


# Stem each cleaned document and keep the result in its own column.
data['stemmed_text'] = data['clean_text'].map(stem_text)

In [10]:
# Preview the first rows, including the derived clean_text and stemmed_text columns.
data.head()

Unnamed: 0,IDText,Text,clean_text,stemmed_text
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...,lu mau org2 prodemokrasi di negara ini bisa pu...,lu mau org2 prodemokrasi di negara ini bisa pu...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo tanya soal hutang luar negeri dia jawa...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itu beliau sosok yan...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa laku itu semua demi s...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yg gak nyambung junjung elu aomkmkm...


In [11]:
# Persist the processed frame as CSV; index=False keeps the row index out of the file.
data.to_csv("data/testData.csv", index=False)

In [12]:
# Reload the processed data from disk.
# keep_default_na=False disables pandas' default NA-string parsing: without
# it, a document whose whole text is e.g. "nan" or "null" (or an empty
# field) is read back as float NaN instead of a string, which a text
# vectorizer cannot consume.
tes_data = pd.read_csv("data/testData.csv", delimiter=",", keep_default_na=False)

In [13]:


# Restore the previously fitted vectorizer from disk.
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Convert the stemmed test documents into the vectorizer's feature space.
X_test_new = vectorizer.transform(tes_data.loc[:, 'stemmed_text'])

In [14]:
# Load both trained classifiers from disk (XGBoost first, then LightGBM).
xgb_model, lgb_model = (
    joblib.load(model_path) for model_path in ("xgb_model.pkl", "lgb_model.pkl")
)

In [15]:
# Encoder used to map predicted class indices back to label names.
le = joblib.load("label_encoder.pkl")

print("=== XGBOOST ===")

# Per-class probabilities for every test document
probs = xgb_model.predict_proba(X_test_new)

# Predicted classes, decoded to their label names
predictions = xgb_model.predict(X_test_new)
predicted_labels = le.inverse_transform(predictions)

# Confidence of a sample = its highest class probability
confidence_scores = probs.max(axis=1)

# Temporary frame pairing each predicted label with its confidence
test_pred_df = pd.DataFrame({
    "predicted_label": predicted_labels,
    "confidence": confidence_scores
})

# Mean confidence per predicted class
avg_conf_per_class = test_pred_df.groupby("predicted_label")["confidence"].mean()

# Mean confidence over all samples
avg_conf_overall = confidence_scores.mean()

# Report
print("=== Average Confidence per Class ===")
print(avg_conf_per_class)
print("\n=== Overall Average Confidence ===")
print(round(avg_conf_overall, 4))


=== XGBOOST ===
=== Average Confidence per Class ===
predicted_label
Demografi                  0.616803
Ekonomi                    0.863528
Geografi                   0.784952
Ideologi                   0.534091
Pertahanan dan Keamanan    0.678387
Politik                    0.739568
Sosial Budaya              0.513265
Sumber Daya Alam           0.535077
Name: confidence, dtype: float32

=== Overall Average Confidence ===
0.7235


In [16]:
# Encoder used to map predicted class indices back to label names.
le = joblib.load("label_encoder.pkl")

print("=== LGBM ===")

# Per-class probabilities for every test document
probs_lgb = lgb_model.predict_proba(X_test_new)

# Predicted classes, decoded to their label names
predictions_lgb = lgb_model.predict(X_test_new)
predicted_labels_lgb = le.inverse_transform(predictions_lgb)

# Confidence of a sample = its highest class probability
confidence_scores_lgb = probs_lgb.max(axis=1)

# Temporary frame pairing each predicted label with its confidence
test_pred_df_lgb = pd.DataFrame({
    "predicted_label": predicted_labels_lgb,
    "confidence": confidence_scores_lgb
})

# Mean confidence per predicted class
avg_conf_per_class_lgb = test_pred_df_lgb.groupby("predicted_label")["confidence"].mean()

# Mean confidence over all samples
avg_conf_overall_lgb = confidence_scores_lgb.mean()

# Report
print("=== Average Confidence per Class ===")
print(avg_conf_per_class_lgb)
print("\n=== Overall Average Confidence ===")
print(round(avg_conf_overall_lgb, 4))


=== LGBM ===
=== Average Confidence per Class ===
predicted_label
Demografi                  0.674272
Ekonomi                    0.880885
Geografi                   0.900583
Ideologi                   0.607178
Pertahanan dan Keamanan    0.736355
Politik                    0.811460
Sosial Budaya              0.615820
Sumber Daya Alam           0.636672
Name: confidence, dtype: float64

=== Overall Average Confidence ===
0.7908
