In [48]:
import pandas as pd
import numpy as np
from collections import Counter
from math import log
import random


In [None]:
# Konfigurasi
DATA_PATH   = "dataset/tes.csv"
TEST_RATIO  = 0.2
MIN_FREQ    = 2
MAX_FEATURES = 30000
C           = 1.0
EPOCHS      = 15
LR          = 0.1
SEED        = 42


In [50]:
def tokenize(s: str):
    # Tokenisasi sederhana (teks sudah bersih)
    return str(s).split()


In [51]:
def build_vocab(texts, min_freq=MIN_FREQ, max_features=MAX_FEATURES):
    freq = Counter()
    for t in texts:
        freq.update(tokenize(t))

    # Ambil token dengan frekuensi minimum
    items = [(tok, c) for tok, c in freq.items() if c >= min_freq]
    items.sort(key=lambda x: x[1], reverse=True)

    vocab = {tok: i for i, (tok, _) in enumerate(items[:max_features])}
    return vocab


In [52]:
def compute_idf(texts, vocab):
    N = len(texts)
    df = Counter()

    for t in texts:
        seen = set()
        for tok in tokenize(t):
            if tok in vocab and tok not in seen:
                df[tok] += 1
                seen.add(tok)

    idf = {
        tok: log((N + 1) / (df[tok] + 1)) + 1.0
        for tok in vocab.keys()
    }
    return idf


In [53]:
def vectorize(text, vocab, idf):
    cnt = Counter(tokenize(text))
    x = {}
    for tok, tf in cnt.items():
        idx = vocab.get(tok)
        if idx is not None:
            x[idx] = tf * idf.get(tok, 1.0)
    return x


In [54]:
def dot_sparse(w, x):
    return sum(w[j] * v for j, v in x.items())


In [55]:
def train_svm_sparse(X, y, dim, C=1.0, lr=0.1, epochs=10, seed=SEED):
    random.seed(seed)
    w = np.zeros(dim, dtype=np.float32)
    b = 0.0

    for ep in range(epochs):
        idxs = list(range(len(X)))
        random.shuffle(idxs)

        for i in idxs:
            xi, yi = X[i], y[i]

            # Regularisasi L2 sederhana
            w *= (1.0 - lr)

            margin = yi * (dot_sparse(w, xi) + b)

            if margin < 1.0:
                for j, v in xi.items():
                    w[j] += lr * C * yi * v
                b += lr * C * yi

        lr *= 0.9  # decay learning rate

    return w, b


In [56]:
def predict_sparse(w, b, xi):
    score = dot_sparse(w, xi) + b
    return 1 if score >= 0.0 else 0


In [57]:
df = pd.read_csv(DATA_PATH)

texts  = df["komentar_clean"].astype(str).tolist()
labels = df["label"].astype(int).tolist()


In [58]:
n = len(texts)
idxs = list(range(n))

random.seed(SEED)
random.shuffle(idxs)

split = int(n * (1.0 - TEST_RATIO))
train_idx = idxs[:split]
test_idx  = idxs[split:]


In [59]:
train_texts = [texts[i] for i in train_idx]
test_texts  = [texts[i] for i in test_idx]

y_train = np.array([1 if labels[i] == 1 else -1 for i in train_idx], dtype=np.int32)
y_test  = np.array([labels[i] for i in test_idx], dtype=np.int32)


In [60]:
vocab = build_vocab(train_texts, MIN_FREQ, MAX_FEATURES)
idf   = compute_idf(train_texts, vocab)

dim = len(vocab)
print("Jumlah fitur:", dim)


Jumlah fitur: 3799


In [61]:
X_train = [vectorize(t, vocab, idf) for t in train_texts]
X_test  = [vectorize(t, vocab, idf) for t in test_texts]


In [62]:
w, b = train_svm_sparse(
    X_train,
    y_train,
    dim,
    C=C,
    lr=LR,
    epochs=EPOCHS,
    seed=SEED
)


In [63]:
preds = np.array([predict_sparse(w, b, x) for x in X_test], dtype=np.int32)

acc = (preds == y_test).mean()

tp = int(((preds == 1) & (y_test == 1)).sum())
tn = int(((preds == 0) & (y_test == 0)).sum())
fp = int(((preds == 1) & (y_test == 0)).sum())
fn = int(((preds == 0) & (y_test == 1)).sum())

print(f"Akurasi: {acc:.4f}")
print(f"Confusion Matrix → TP={tp}, TN={tn}, FP={fp}, FN={fn}")


Akurasi: 0.7664
Confusion Matrix → TP=449, TN=1119, FP=100, FN=378


In [64]:
# Prediksi data test
preds = np.array([predict_sparse(w, b, x) for x in X_test], dtype=np.int32)

# Buat DataFrame hasil test
df_test = df.iloc[test_idx].copy()

# Tambahkan kolom predicted_label
df_test["predicted_label"] = preds


In [65]:
df_test[["komentar_clean", "label", "predicted_label"]].head(50)


Unnamed: 0,komentar_clean,label,predicted_label
2331,mending ditabung buat beli tanah atau sawah te...,0,0
2744,aamiin,0,0
8052,udah ga ada harapan lagi sama si wowo dan fufu...,0,0
7605,wowkwowkok si beler lagi di singapur,0,0
879,org mendang mending mikir 100x,0,0
4937,busett cerahh banget SGI88 encess koinya ngasi...,1,1
4652,DOrA 77 menawarkan platform stabil andal,1,0
4888,emang gak ada yang se terpercaya alexis bintan...,1,1
151,uang rakyat gk nih wajah_gembira_berurai_air_mata,0,0
613,emang gak se hits alexis17 sih,1,0


In [66]:
acc = (df_test["label"] == df_test["predicted_label"]).mean()

tp = ((df_test["predicted_label"] == 1) & (df_test["label"] == 1)).sum()
tn = ((df_test["predicted_label"] == 0) & (df_test["label"] == 0)).sum()
fp = ((df_test["predicted_label"] == 1) & (df_test["label"] == 0)).sum()
fn = ((df_test["predicted_label"] == 0) & (df_test["label"] == 1)).sum()

print(f"Akurasi : {acc:.4f}")
print(f"TP={tp}, TN={tn}, FP={fp}, FN={fn}")


Akurasi : 0.7664
TP=449, TN=1119, FP=100, FN=378


In [67]:
recall_judol = tp / (tp + fn + 1e-9)
precision_judol = tp / (tp + fp + 1e-9)

print("Recall Judol   :", recall_judol)
print("Precision Judol:", precision_judol)


Recall Judol   : 0.5429262394189324
Precision Judol: 0.817850637521279


In [68]:
df_test[
    (df_test["label"] == 0) & (df_test["predicted_label"] == 1)
][["komentar_clean", "label", "predicted_label"]].head(10)


Unnamed: 0,komentar_clean,label,predicted_label
3267,gua berasa nonton yt yang 2 jam ngga ngapa2in,0,1
9475,genius banget cok makin gak nyesel gua gak mil...,0,1
7739,baru kelar lgsg crash ya mas sayang bener,0,1
9311,mana ada yang benar broh wkwk kocak,0,1
8956,dihina terlalu hina dipuji ternyata namanya bu...,0,1
6220,kek lebih nyambung ngobrol sama toddler,0,1
6173,gila byd bagus banget jadinya,0,1
7726,yakin ntar dinasehatin sama yang expert dijawa...,0,1
10104,yang kesini cuma untuk dislike dan liat komena...,0,1
637,13 59 us banget ya ternyata di us pakek sni ju...,0,1


In [69]:
df_test[
    (df_test["label"] == 1) & (df_test["predicted_label"] == 0)
][["komentar_clean", "label", "predicted_label"]].head(10)


Unnamed: 0,komentar_clean,label,predicted_label
4652,DOrA 77 menawarkan platform stabil andal,1,0
613,emang gak se hits alexis17 sih,1,0
8116,a l e x i s 1 7 bikin ketagihan pengen main pa...,1,0
595,maen swert bonanja sketer terus terusan,1,0
7793,semangat bosqu terbaik WETON88,1,0
6666,modal alexis17 bermanfaat,1,0
2252,kakak bos untung вOꮤO77 perfect langsung,1,0
1641,modal kecil hasil melimpah pastinya hanya di M...,1,0
9667,gak skill marketing sukses alexis17,1,0
4758,bang gamain diga ru da ho ki꧂,1,0


In [70]:
df_test[
    (df_test["label"] == 1) & (df_test["predicted_label"] == 1)
][["komentar_clean", "label", "predicted_label"]].head(10)


Unnamed: 0,komentar_clean,label,predicted_label
4937,busett cerahh banget SGI88 encess koinya ngasi...,1,1
4888,emang gak ada yang se terpercaya alexis bintan...,1,1
3052,jepe terus tiap hari situs terbaik,1,1
6846,bener bener situs terpercaya recommended,1,1
611,layanannya di AxL777juga oke banget gak ada ma...,1,1
84,jepey gmpng banget SGI88,1,1
6771,temen temen gue pindah PULAUWIN gue ngerti ken...,1,1
2249,buset videonya sih gokil bintang_medium_putih ...,1,1
4450,gua sih udah anggep alexis 17 yin_yang kaya at...,1,1
7849,mantap bermain SGI88 rezeki mengalir deras,1,1
