In [57]:
import pandas as pd
import numpy as np
from collections import Counter
from math import log
import random

# Configuration: every tunable value used by the cells below lives here.
TEST_RATIO = 0.2      # fraction of rows held out as the test split
MIN_FREQ = 2          # tokens seen fewer times than this are left out of the vocabulary
MAX_FEATURES = 30000  # upper bound on the vocabulary size
C = 1.0               # multiplier on the hinge-loss update in the SVM trainer
EPOCHS = 30           # number of passes over the training split
LR = 0.1              # initial learning rate (the trainer scales it by 0.9 per epoch)
SEED = 42             # seed shared by the DataFrame shuffle, the split and the trainer


In [58]:

# Existing labelled training set (read with pandas' default comma separator).
df_train_old = pd.read_csv("dataset/train_clean_label.csv")

# Second labelled set; this file is semicolon-delimited.
# NOTE(review): the file name suggests manually labelled YouTube data - confirm.
df_yt_200 = pd.read_csv("dataset/datasetYT_ManualLabel.csv", sep=";")

# Show both shapes as the cell output.
df_train_old.shape, df_yt_200.shape


((10230, 2), (200, 2))

In [59]:
# Stack both labelled sets, then shuffle the rows reproducibly and renumber
# the index so it runs 0..n-1 again.
df_train_all = (
    pd.concat([df_train_old, df_yt_200], ignore_index=True)
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
)

# Class balance and overall shape as the cell output.
df_train_all["label"].value_counts(), df_train_all.shape


(label
 0    6136
 1    4294
 Name: count, dtype: int64,
 (10430, 2))

In [60]:
# df_train_all.to_csv(
#     "dataset/train_combined_kaggle_style.csv",
#     index=False,
#     encoding="utf-8"
# )


In [61]:
def tokenize(s: str):
    """Split a value into whitespace-separated tokens.

    The input is coerced with ``str()`` first, so non-string values
    (numbers, ``None``, NaN) are tokenized by their string form.
    """
    as_text = str(s)
    return as_text.split()

def build_vocab(texts, min_freq=MIN_FREQ, max_features=MAX_FEATURES):
    """Build a token -> feature-index mapping from a corpus.

    Args:
        texts: Iterable of documents; each one is split with ``tokenize``.
        min_freq: Tokens whose total count is below this are dropped.
        max_features: Keep at most this many tokens.

    Returns:
        dict mapping token -> index. Indices follow descending total count;
        tokens with equal counts keep their first-seen order because the
        sort is stable.
    """
    counts = Counter(tok for text in texts for tok in tokenize(text))
    frequent = sorted(
        ((tok, c) for tok, c in counts.items() if c >= min_freq),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return {tok: idx for idx, (tok, _) in enumerate(frequent[:max_features])}

def compute_idf(texts, vocab):
    """Compute a smoothed inverse document frequency for every vocabulary token.

    Args:
        texts: Sized collection of documents (``len(texts)`` is used as N).
        vocab: Mapping whose keys are the tokens to score.

    Returns:
        dict mapping token -> ``log((N + 1) / (df + 1)) + 1.0``, where ``df``
        is the number of documents that contain the token at least once.
    """
    n_docs = len(texts)
    doc_freq = Counter()
    for text in texts:
        # set() collapses repeats, so a document counts each token only once.
        doc_freq.update(tok for tok in set(tokenize(text)) if tok in vocab)
    return {
        tok: log((n_docs + 1) / (doc_freq[tok] + 1)) + 1.0
        for tok in vocab
    }

def vectorize(text, vocab, idf):
    """Turn one document into a sparse TF-IDF vector.

    Args:
        text: Document to encode; split with ``tokenize``.
        vocab: Mapping token -> feature index. Tokens missing from it are ignored.
        idf: Mapping token -> IDF weight; a token missing here is weighted 1.0.

    Returns:
        dict mapping feature index -> raw term count * IDF weight.
    """
    term_counts = Counter(tokenize(text))
    return {
        vocab[tok]: tf * idf.get(tok, 1.0)
        for tok, tf in term_counts.items()
        if vocab.get(tok) is not None
    }

def dot_sparse(w, x):
    """Dot product of an indexable weight vector ``w`` with a sparse vector ``x``.

    ``x`` is a dict mapping index -> value; only those indices of ``w`` are
    read. An empty ``x`` yields 0.
    """
    total = 0
    for j, v in x.items():
        total += w[j] * v
    return total

def train_svm_sparse(X, y, dim, C=1.0, lr=0.1, epochs=10, seed=SEED):
    """Train a linear hinge-loss classifier with per-sample SGD on sparse inputs.

    Args:
        X: Sequence of sparse feature vectors, each a dict mapping
            feature index -> value. Indices are used to index ``w`` directly.
        y: Sequence of labels aligned with ``X``. The margin test
            ``yi * score < 1`` expects labels encoded as +1 / -1.
        dim: Length of the dense weight vector.
        C: Multiplier applied to every hinge-loss update.
        lr: Initial learning rate; it is multiplied by 0.9 after each epoch.
        epochs: Number of passes over ``X``.
        seed: Seed for the per-epoch shuffling of the sample order.

    Returns:
        Tuple ``(w, b)``: ``w`` is a float32 numpy array of length ``dim``
        and ``b`` is the scalar bias.
    """
    # Use a private generator instead of random.seed(): it yields the same
    # permutations for a given seed but leaves the process-wide RNG untouched,
    # so calling this function cannot silently change later random draws.
    rng = random.Random(seed)
    w = np.zeros(dim, dtype=np.float32)
    b = 0.0
    for _ in range(epochs):
        # Rebuild the identity order every epoch before shuffling; shuffling
        # the previous epoch's order instead would change the sample sequence.
        order = list(range(len(X)))
        rng.shuffle(order)
        for i in order:
            xi, yi = X[i], y[i]
            w *= (1.0 - lr)  # simple weight decay, applied on every sample
            margin = yi * (dot_sparse(w, xi) + b)
            if margin < 1.0:
                # Sample is inside the margin or misclassified: push towards yi.
                for j, v in xi.items():
                    w[j] += lr * C * yi * v
                b += lr * C * yi
        lr *= 0.9  # geometric learning-rate decay per epoch
    return w, b

def predict_sparse(w, b, xi):
    """Classify one sparse vector with the linear model ``(w, b)``.

    Returns 1 when the decision score ``w . xi + b`` is >= 0, otherwise 0.
    """
    if dot_sparse(w, xi) + b >= 0.0:
        return 1
    return 0


In [62]:
# Pull the cleaned comment text and the integer labels out as plain lists.
texts = df_train_all["komentar_clean"].astype(str).tolist()
labels = df_train_all["label"].astype(int).tolist()

# The SVM trainer works on +1 / -1 targets: label 1 -> +1, anything else -> -1.
y = np.where(np.asarray(labels) == 1, 1, -1).astype(np.int32)


In [63]:
n = len(texts)
split = int(n * (1.0 - TEST_RATIO))

# Seeded shuffle of the row positions so the split is reproducible.
random.seed(SEED)
idxs = list(range(n))
random.shuffle(idxs)

# The first `split` shuffled positions train the model, the rest evaluate it.
train_idx, test_idx = idxs[:split], idxs[split:]

train_texts = [texts[i] for i in train_idx]
test_texts = [texts[i] for i in test_idx]

# Training targets use the +1 / -1 encoding from `y`; test targets keep the
# original 0 / 1 labels because predict_sparse returns 0 / 1.
y_train = y[train_idx]
y_test = np.array([labels[i] for i in test_idx], dtype=np.int32)


In [64]:
# Vocabulary and IDF weights are fitted on the training split only, so no
# statistics from the test split reach the features.
vocab = build_vocab(train_texts, MIN_FREQ, MAX_FEATURES)
idf = compute_idf(train_texts, vocab)
dim = len(vocab)

print("Jumlah fitur:", dim)

# Encode both splits with the same vocabulary and IDF weights.
X_train, X_test = (
    [vectorize(doc, vocab, idf) for doc in split_texts]
    for split_texts in (train_texts, test_texts)
)


Jumlah fitur: 3895


In [65]:
# Fit the final linear model on the training split with the configured hyper-parameters.
svm_params = dict(C=C, lr=LR, epochs=EPOCHS, seed=SEED)
w_final, b_final = train_svm_sparse(X_train, y_train, dim, **svm_params)


In [66]:
# Predicted 0 / 1 labels for every test vector.
preds = np.array([predict_sparse(w_final, b_final, x) for x in X_test], dtype=np.int32)

acc = (preds == y_test).mean()

# Confusion-matrix cells, each given as a (predicted, actual) pair:
# TP=(1,1), TN=(0,0), FP=(1,0), FN=(0,1).
tp, tn, fp, fn = (
    int(((preds == predicted) & (y_test == actual)).sum())
    for predicted, actual in ((1, 1), (0, 0), (1, 0), (0, 1))
)

# The small epsilon keeps the ratios defined when a denominator would be zero.
recall = tp / (tp + fn + 1e-9)
precision = tp / (tp + fp + 1e-9)

print(f"Akurasi : {acc:.4f}")
print(f"TP={tp}, TN={tn}, FP={fp}, FN={fn}")

print("Recall Judol   :", recall)
print("Precision Judol:", precision)


Akurasi : 0.8864
TP=639, TN=1210, FP=14, FN=223
Recall Judol   : 0.7412993039434556
Precision Judol: 0.9785604900444432


In [68]:
# Persist the combined, shuffled training frame (index column omitted).
OUTPUT_TRAIN = "dataset/train_final.csv"
df_train_all.to_csv(OUTPUT_TRAIN, index=False, encoding="utf-8")
