jgn rerun all

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from math import log
import random


In [3]:
# Konfigurasi
DATA_PATH   = "dataset/train_clean_label.csv"
TEST_RATIO  = 0.2
MIN_FREQ    = 2
MAX_FEATURES = 30000
C           = 1.0
EPOCHS      = 30
LR          = 0.1
SEED        = 42


In [4]:
def tokenize(s: str):
    return str(s).split()


In [5]:
def build_vocab(texts, min_freq=MIN_FREQ, max_features=MAX_FEATURES):
    freq = Counter()
    for t in texts:
        freq.update(tokenize(t))

    items = [(tok, c) for tok, c in freq.items() if c >= min_freq]
    items.sort(key=lambda x: x[1], reverse=True)

    vocab = {tok: i for i, (tok, _) in enumerate(items[:max_features])}
    return vocab


In [6]:
def compute_idf(texts, vocab):
    N = len(texts)
    df = Counter()

    for t in texts:
        seen = set()
        for tok in tokenize(t):
            if tok in vocab and tok not in seen:
                df[tok] += 1
                seen.add(tok)

    idf = {
        tok: log((N + 1) / (df[tok] + 1)) + 1.0
        for tok in vocab.keys()
    }
    return idf


In [7]:
def vectorize(text, vocab, idf):
    cnt = Counter(tokenize(text))
    x = {}
    for tok, tf in cnt.items():
        idx = vocab.get(tok)
        if idx is not None:
            x[idx] = tf * idf.get(tok, 1.0)
    return x


In [8]:
def dot_sparse(w, x):
    return sum(w[j] * v for j, v in x.items())


In [9]:
def train_svm_sparse(X, y, dim, C=1.0, lr=0.1, epochs=10, seed=SEED):
    random.seed(seed)
    w = np.zeros(dim, dtype=np.float32)
    b = 0.0

    for ep in range(epochs):
        idxs = list(range(len(X)))
        random.shuffle(idxs)

        for i in idxs:
            xi, yi = X[i], y[i]

            # Regularisasi L2 sederhana
            w *= (1.0 - lr)

            margin = yi * (dot_sparse(w, xi) + b)

            if margin < 1.0:
                for j, v in xi.items():
                    w[j] += lr * C * yi * v
                b += lr * C * yi

        lr *= 0.9

    return w, b


In [10]:
def predict_sparse(w, b, xi):
    score = dot_sparse(w, xi) + b
    return 1 if score >= 0.0 else 0


In [11]:
df = pd.read_csv(DATA_PATH)

texts  = df["komentar_clean"].astype(str).tolist()
labels = df["label"].astype(int).tolist()


In [12]:
n = len(texts)
idxs = list(range(n))

random.seed(SEED)
random.shuffle(idxs)

split = int(n * (1.0 - TEST_RATIO))
train_idx = idxs[:split]
test_idx  = idxs[split:]


In [13]:
train_texts = [texts[i] for i in train_idx]
test_texts  = [texts[i] for i in test_idx]

y_train = np.array([1 if labels[i] == 1 else -1 for i in train_idx], dtype=np.int32)
y_test  = np.array([labels[i] for i in test_idx], dtype=np.int32)


In [14]:
vocab = build_vocab(train_texts, MIN_FREQ, MAX_FEATURES)
idf   = compute_idf(train_texts, vocab)

dim = len(vocab)
print("Jumlah fitur:", dim)


Jumlah fitur: 3799


In [15]:
X_train = [vectorize(t, vocab, idf) for t in train_texts]
X_test  = [vectorize(t, vocab, idf) for t in test_texts]


In [16]:
w, b = train_svm_sparse(
    X_train,
    y_train,
    dim,
    C=C,
    lr=LR,
    epochs=EPOCHS,
    seed=SEED
)


In [17]:
preds = np.array([predict_sparse(w, b, x) for x in X_test], dtype=np.int32)

acc = (preds == y_test).mean()

tp = int(((preds == 1) & (y_test == 1)).sum())
tn = int(((preds == 0) & (y_test == 0)).sum())
fp = int(((preds == 1) & (y_test == 0)).sum())
fn = int(((preds == 0) & (y_test == 1)).sum())

print(f"Akurasi: {acc:.4f}")
print(f"Confusion Matrix → TP={tp}, TN={tn}, FP={fp}, FN={fn}")


Akurasi: 0.8358
Confusion Matrix → TP=499, TN=1211, FP=8, FN=328


In [18]:
# Prediksi data test
preds = np.array([predict_sparse(w, b, x) for x in X_test], dtype=np.int32)

# Buat DataFrame hasil test
df_test = df.iloc[test_idx].copy()

# Tambahkan kolom predicted_label
df_test["predicted_label"] = preds


In [37]:
df_test[["komentar_clean", "label", "predicted_label"]].head(20)


Unnamed: 0,komentar_clean,label,predicted_label
2331,mending ditabung buat beli tanah atau sawah te...,0,0
2744,aamiin,0,0
8052,udah ga ada harapan lagi sama si wowo dan fufu...,0,0
7605,wowkwowkok si beler lagi di singapur,0,0
879,org mendang mending mikir 100x,0,0
4937,busett cerahh banget SGI88 encess koinya ngasi...,1,1
4652,DOrA 77 menawarkan platform stabil andal,1,0
4888,emang gak ada yang se terpercaya alexis bintan...,1,1
151,uang rakyat gk nih wajah_gembira_berurai_air_mata,0,0
613,emang gak se hits alexis17 sih,1,1


In [20]:
acc = (df_test["label"] == df_test["predicted_label"]).mean()

tp = ((df_test["predicted_label"] == 1) & (df_test["label"] == 1)).sum()
tn = ((df_test["predicted_label"] == 0) & (df_test["label"] == 0)).sum()
fp = ((df_test["predicted_label"] == 1) & (df_test["label"] == 0)).sum()
fn = ((df_test["predicted_label"] == 0) & (df_test["label"] == 1)).sum()

print(f"Akurasi : {acc:.4f}")
print(f"TP={tp}, TN={tn}, FP={fp}, FN={fn}")


Akurasi : 0.8358
TP=499, TN=1211, FP=8, FN=328


In [21]:
recall_judol = tp / (tp + fn + 1e-9)
precision_judol = tp / (tp + fp + 1e-9)

print("Recall Judol   :", recall_judol)
print("Precision Judol:", precision_judol)


Recall Judol   : 0.6033857315591253
Precision Judol: 0.9842209072958892


In [22]:
df_test[
    (df_test["label"] == 0) & (df_test["predicted_label"] == 1)
][["komentar_clean", "label", "predicted_label"]].head(10)


Unnamed: 0,komentar_clean,label,predicted_label
9475,genius banget cok makin gak nyesel gua gak mil...,0,1
9304,tidak pernah merasa jenuh bermain d O я A 7 7 ...,0,1
4183,pengen banget nih dapat hadiah wajah_menangis,0,1
6180,`` nggak belajar ramah sombong bener boxer boy...,0,1
6882,bagus banget emang kaca filem wincos hati_mera...,0,1
9433,denishenritowoliu5367 gk usah kasian sama w em...,0,1
2267,wuiiiih mantap banget nich aku suka banget dec...,0,1
4092,hasilnya bagus banget sumpah gak pernah gagal ...,0,1


In [23]:
df_test[
    (df_test["label"] == 1) & (df_test["predicted_label"] == 0)
][["komentar_clean", "label", "predicted_label"]].head(10)


Unnamed: 0,komentar_clean,label,predicted_label
4652,DOrA 77 menawarkan platform stabil andal,1,0
595,maen swert bonanja sketer terus terusan,1,0
611,layanannya di AxL777juga oke banget gak ada ma...,1,0
6666,modal alexis17 bermanfaat,1,0
2252,kakak bos untung вOꮤO77 perfect langsung,1,0
1641,modal kecil hasil melimpah pastinya hanya di M...,1,0
4758,bang gamain diga ru da ho ki꧂,1,0
3396,petir merah muncul auto makswin,1,0
7837,gass ✦Manut88✦,1,0
2481,gak nyangka bisa menang sebanyak ini 1 16 di m...,1,0


In [24]:
df_test[
    (df_test["label"] == 1) & (df_test["predicted_label"] == 1)
][["komentar_clean", "label", "predicted_label"]].head(10)


Unnamed: 0,komentar_clean,label,predicted_label
4937,busett cerahh banget SGI88 encess koinya ngasi...,1,1
4888,emang gak ada yang se terpercaya alexis bintan...,1,1
613,emang gak se hits alexis17 sih,1,1
3052,jepe terus tiap hari situs terbaik,1,1
8116,a l e x i s 1 7 bikin ketagihan pengen main pa...,1,1
6846,bener bener situs terpercaya recommended,1,1
7793,semangat bosqu terbaik WETON88,1,1
84,jepey gmpng banget SGI88,1,1
6771,temen temen gue pindah PULAUWIN gue ngerti ken...,1,1
2249,buset videonya sih gokil bintang_medium_putih ...,1,1


In [28]:
df_yt = pd.read_csv("dataset/dataset_youtube_clean.csv")
# df_yt.head()
texts_yt = df_yt["komentar_clean"].astype(str).tolist()

X_yt = [vectorize(t, vocab, idf) for t in texts_yt]
df_yt["predicted_label"] = [
    predict_sparse(w, b, x) for x in X_yt
]
# df_yt.head(20)

In [29]:
df_yt["predicted_label"].value_counts()


predicted_label
0    2566
1     110
Name: count, dtype: int64

In [32]:
# df_yt[df_yt["predicted_label"] == 1][["komentar_clean"]].head(20)
df_yt[df_yt["predicted_label"] == 1][["komentar_clean"]].sample(20)



Unnamed: 0,komentar_clean
2192,ga ru da ho ki꧂lisensi web terbaik
1225,"main santai menang nyata cuma di"" ikan_tropis ..."
2385,"ga berisik ga koar"" salut h a u s w i n 138 be..."
1981,wuihhh udah lama nih gak makan di sei sapi lam...
2377,kak daftar disni bukan janji manis doang g a r...
2035,alhamdulillah mukbang yg sangat nikmat banget ...
1742,cuman mo bilang makasih udh ngasih tauga ru da...
2216,ga ru da ho ki꧂member baru pasti jepee
652,udah bang udah ngiler banget ngiler ngiler
2193,ga ru da ho ki꧂member baru pasti jepee


In [35]:
SEED = 42

df_judol_100 = df_yt[df_yt["predicted_label"] == 1].sample(
    n=100,
    random_state=SEED
)
df_nonjudol_100 = df_yt[df_yt["predicted_label"] == 0].sample(
    n=100,
    random_state=SEED
)

df_sample_200 = pd.concat(
    [df_judol_100, df_nonjudol_100],
    ignore_index=True
)
df_sample_200 = df_sample_200.sample(
    frac=1.0,
    random_state=SEED
).reset_index(drop=True)


In [36]:
OUTPUT_SAMPLE = "dataset/datasetYT_predicted.csv"

df_sample_200.to_csv(
    OUTPUT_SAMPLE,
    index=False,
    encoding="utf-8"
)

