In [None]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

# -------------------------------
# 1. Load the TF-IDF matrix and the labels
# -------------------------------

# TF-IDF features produced by preprocessing the Hadith text.
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open("../data/tfidf/tfidf_training.pkl", "rb") as tfidf_file:
    X_train_tfidf = pickle.load(tfidf_file)

# Multi-label targets: the three label columns of preprocessed_training.csv.
label_columns = ['anjuran', 'larangan', 'informasi']
df_train = pd.read_csv("../data/processed/preprocessed_training.csv", delimiter=";")
y_train = df_train[label_columns]

# -------------------------------
# 2. Initialise the SGD model
# -------------------------------

# SGD model without regularisation (matches the manual calculation).
sgd_settings = {
    "loss": 'log_loss',           # logistic regression
    "penalty": None,              # no regularisation
    "learning_rate": 'constant',  # fixed step size ...
    "eta0": 0.1,                  # ... of 0.1 (large)
    "max_iter": 1000,             # number of iterations
    "tol": 1e-3,                  # tolerance used by the stopping criterion
    "random_state": 42,
    "n_iter_no_change": 5,        # stop once the loss no longer improves
}
base_model = SGDClassifier(**sgd_settings)

# Multi-label classification: wrap the base estimator.
model = MultiOutputClassifier(base_model)

# -------------------------------
# 3. Train the model
# -------------------------------
model.fit(X_train_tfidf, y_train)

# -------------------------------
# 4. Save the model
# -------------------------------
with open("../models/sgd_logistic_regression_manual.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("✅ Model Multi-Label Logistic Regression dengan SGD berhasil dilatih dan disimpan!")

# -------------------------------
# 5. Show coefficients and bias
# -------------------------------

labels = ['anjuran', 'larangan', 'informasi']

# model.estimators_ is indexed in the same order as `labels`.
for i, label in enumerate(labels):
    estimator = model.estimators_[i]

    print(f"\n🧠 Koefisien (w) untuk label '{label}':")
    print(estimator.coef_)

    print(f"📍 Bias (intercept) untuk label '{label}':")
    print(estimator.intercept_)

# -------------------------------
# 6. Show the top positive features
# -------------------------------

# Reload the TF-IDF vectorizer to recover the feature names.
with open("../data/tfidf/tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

feature_names = np.array(vectorizer.get_feature_names_out())

for i, label in enumerate(labels):
    coefs = model.estimators_[i].coef_[0]
    # argsort is ascending, so the last 10 indices carry the largest weights.
    top_features = np.argsort(coefs)[-10:]
    descending = top_features[::-1]

    print(f"\n🔥 Top fitur dengan bobot tertinggi untuk label '{label}':")
    for name, weight in zip(feature_names[descending], coefs[descending]):
        print(f"{name}: {weight:.4f}")


✅ Model Multi-Label Logistic Regression dengan SGD berhasil dilatih dan disimpan!

🧠 Koefisien (w) untuk label 'anjuran':
[[-0.21507665 -0.21872163 -0.08740297 ...  0.80266836 -0.24250613
   0.40998159]]
📍 Bias (intercept) untuk label 'anjuran':
[-2.60313394]

🧠 Koefisien (w) untuk label 'larangan':
[[-0.08787105 -0.03728191 -0.01921851 ... -0.29628719 -0.15003491
   0.75121565]]
📍 Bias (intercept) untuk label 'larangan':
[-2.52777102]

🧠 Koefisien (w) untuk label 'informasi':
[[0.03352161 0.02720881 0.00293403 ... 0.12829215 0.0865273  0.00899189]]
📍 Bias (intercept) untuk label 'informasi':
[1.91864695]

🔥 Top fitur dengan bobot tertinggi untuk label 'anjuran':
hendak: 11.9441
perintah: 9.1038
shalatlah: 8.4555
gerhana: 7.1419
mudah: 6.8787
itikaf: 6.4911
sembelih: 6.4654
thawaf: 6.2592
sabda: 6.1141
ilah: 5.8281

🔥 Top fitur dengan bobot tertinggi untuk label 'larangan':
jangan: 24.8525
larang: 15.0944
laknat: 7.3474
khamr: 4.5333
matta: 4.2838
mahram: 4.2652
haid: 4.1622
barangsiap

In [None]:
# NOTE(review): `clf`, `X` and `your_threshold` are not defined in any cell
# visible here, so unless they are bound elsewhere this cell raises NameError
# on a fresh kernel. It looks like a template snippet -- confirm, then bind
# real objects or remove the cell.
# Raw decision scores from the classifier.
decision_scores = clf.decision_function(X)
# Then apply the threshold of your choice: 1 where the score exceeds it, else 0.
predictions = (decision_scores > your_threshold).astype(int)

In [5]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

# -------------------------------
# 1. Load the TF-IDF matrix and the labels
# -------------------------------

# TF-IDF features produced by preprocessing the Hadith text.
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open("../data/tfidf/tfidf_training.pkl", "rb") as tfidf_file:
    X_train_tfidf = pickle.load(tfidf_file)

# Multi-label targets: the three label columns of preprocessed_training.csv.
label_columns = ['anjuran', 'larangan', 'informasi']
df_train = pd.read_csv("../data/processed/preprocessed_training.csv", delimiter=";")
y_train = df_train[label_columns]

# -------------------------------
# 2. Initialise the SGD model
# -------------------------------

# Logistic regression trained with Stochastic Gradient Descent (SGD).
sgd_settings = {
    "loss": 'log_loss',          # logistic-regression loss, suited to binary / multi-label targets
    "penalty": 'l2',             # L2 (ridge) regularisation: penalises large weights to curb overfitting
    "alpha": 0.0001,             # regularisation strength (larger value = stronger regularisation)
    "learning_rate": 'optimal',  # step size is adjusted automatically by the library's schedule
    "max_iter": 1000,            # maximum number of training iterations (epochs)
    "tol": 1e-3,                 # minimum loss improvement before training may stop early
    "random_state": 42,          # seed for reproducible results
    "n_iter_no_change": 5,       # consecutive non-improving iterations tolerated before stopping
}
base_model = SGDClassifier(**sgd_settings)

# Multi-label classification: wrap the base estimator.
model = MultiOutputClassifier(base_model)

# -------------------------------
# 3. Train the model
# -------------------------------
model.fit(X_train_tfidf, y_train)

# -------------------------------
# 4. Save the model
# -------------------------------
with open("../models/sgd_logistic_regression.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("✅ Model Multi-Label Logistic Regression dengan SGD berhasil dilatih dan disimpan!")

# -------------------------------
# 5. Show coefficients and bias
# -------------------------------

labels = ['anjuran', 'larangan', 'informasi']

# model.estimators_ is indexed in the same order as `labels`.
for i, label in enumerate(labels):
    estimator = model.estimators_[i]

    print(f"\n🧠 Koefisien (w) untuk label '{label}':")
    print(estimator.coef_)

    print(f"📍 Bias (intercept) untuk label '{label}':")
    print(estimator.intercept_)

# -------------------------------
# 6. Show the top positive features
# -------------------------------

# Reload the TF-IDF vectorizer to recover the feature names.
with open("../data/tfidf/tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

feature_names = np.array(vectorizer.get_feature_names_out())

for i, label in enumerate(labels):
    coefs = model.estimators_[i].coef_[0]
    # argsort is ascending, so the last 10 indices carry the largest weights.
    top_features = np.argsort(coefs)[-10:]
    descending = top_features[::-1]

    print(f"\n🔥 Top fitur dengan bobot tertinggi untuk label '{label}':")
    for name, weight in zip(feature_names[descending], coefs[descending]):
        print(f"{name}: {weight:.4f}")


✅ Model Multi-Label Logistic Regression dengan SGD berhasil dilatih dan disimpan!

🧠 Koefisien (w) untuk label 'anjuran':
[[-0.06431844 -0.05552256 -0.02818914 ...  0.25512517 -0.09415079
   0.10959668]]
📍 Bias (intercept) untuk label 'anjuran':
[-2.14814773]

🧠 Koefisien (w) untuk label 'larangan':
[[-0.03390337 -0.01387787 -0.01445578 ... -0.1206214  -0.06509765
   0.32282746]]
📍 Bias (intercept) untuk label 'larangan':
[-2.76516524]

🧠 Koefisien (w) untuk label 'informasi':
[[0.01289376 0.01237488 0.00323303 ... 0.05227713 0.03112194 0.00508185]]
📍 Bias (intercept) untuk label 'informasi':
[2.91240051]

🔥 Top fitur dengan bobot tertinggi untuk label 'anjuran':
hendak: 7.2544
perintah: 5.2885
kalian: 4.0530
shalatlah: 4.0402
sembelih: 3.6208
gerhana: 3.5681
sabda: 3.2851
ilah: 2.9966
itikaf: 2.8546
mudah: 2.8257

🔥 Top fitur dengan bobot tertinggi untuk label 'larangan':
jangan: 14.5552
larang: 9.5180
laknat: 4.1748
kalian: 3.1521
barangsiapa: 2.8233
khamr: 2.6651
haram: 2.5699
jual:

In [3]:
import pickle
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# -------------------------------
# 1. Load the TF-IDF data and the labels
# -------------------------------
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open("../data/tfidf/tfidf_training.pkl", "rb") as tfidf_file:
    X_train_tfidf = pickle.load(tfidf_file)

label_columns = ['anjuran', 'larangan', 'informasi']
df_train = pd.read_csv("../data/processed/preprocessed_training.csv", delimiter=";")
y_train = df_train[label_columns]

# Convert to float32 tensors. The TF-IDF matrix is densified first via
# .toarray(), so the whole feature matrix is held in memory as a dense array.
dense_features = X_train_tfidf.toarray()
label_matrix = y_train.values
X = torch.tensor(dense_features, dtype=torch.float32)
Y = torch.tensor(label_matrix, dtype=torch.float32)

# -------------------------------
# 2. Model Logistic Regression (Multi-label)
# -------------------------------
class MultiLabelLogisticRegression(nn.Module):
    """Multi-label logistic regression: one linear layer followed by a sigmoid.

    Each of the ``output_dim`` outputs is an independent probability in
    (0, 1), so one sample may be positive for several labels at once.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layers.

        Args:
            input_dim: number of input features per sample.
            output_dim: number of labels predicted per sample.
        """
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-label probabilities for the batch ``x``."""
        logits = self.linear(x)
        return self.sigmoid(logits)

# Seed torch before the model is built: nn.Linear draws random initial
# weights, so without a seed every run trains a different model. 42 mirrors
# the random_state used by the SGDClassifier cells.
torch.manual_seed(42)

input_dim = X.shape[1]
output_dim = Y.shape[1]
model = MultiLabelLogisticRegression(input_dim, output_dim)

# -------------------------------
# 3. Loss & optimizer with pos_weight
# -------------------------------
# pos_weight = negatives / positives, computed per label, so that the rare
# positive class contributes as much to the loss as the negatives.
pos_counts = Y.sum(dim=0)
neg_counts = Y.shape[0] - pos_counts
pos_weight = neg_counts / (pos_counts + 1e-5)  # avoid division by zero

# BCEWithLogitsLoss implements the same weighted binary cross-entropy that was
# previously hand-written as -(pos_weight * y * log(p) + (1 - y) * log(1 - p)),
# but works on raw logits and is numerically stable without an epsilon.
# It replaces the former nn.BCELoss object, which was created but never used.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=0.1)

# -------------------------------
# 4. Training
# -------------------------------
# NOTE(review): the output recorded for this cell shows the loss moving only
# from 0.7964 to 0.7557 over 1000 epochs and nearly every sample predicted
# positive for every label. Full-batch SGD at lr=0.1 looks under-trained here;
# consider more epochs, a larger step size or optim.Adam.
epochs = 1000
for epoch in range(epochs):
    # Feed logits (pre-sigmoid) to the loss; model(X) still returns
    # probabilities for inference.
    logits = model.linear(X)
    loss = criterion(logits, Y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"Epoch {epoch} - Loss: {loss.item():.4f}")

# -------------------------------
# 5. Evaluation
# -------------------------------
# NOTE: the metrics below are computed on X / Y, the same tensors the model
# was trained on in this cell, not on a held-out split.
threshold = 0.4  # threshold adjusted so that minority labels can be caught
with torch.no_grad():
    y_pred_probs = model(X).numpy()
y_pred_labels = (y_pred_probs >= threshold).astype(int)
y_true = Y.numpy().astype(int)

labels = ['anjuran', 'larangan', 'informasi']

for i, label in enumerate(labels):
    # Column i holds the ground truth / predictions for this label.
    truth = y_true[:, i]
    guess = y_pred_labels[:, i]

    acc = accuracy_score(truth, guess)
    prec = precision_score(truth, guess, zero_division=0)
    rec = recall_score(truth, guess, zero_division=0)
    f1 = f1_score(truth, guess, zero_division=0)
    cm = confusion_matrix(truth, guess)

    print(f"\n📊 Evaluasi - Label: {label}")
    print(f"  Accuracy : {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall   : {rec:.4f}")
    print(f"  F1-Score : {f1:.4f}")
    print(f"🧩 Confusion Matrix untuk label '{label}':\n{cm}")

# -------------------------------
# 6. Coefficients and bias
# -------------------------------
# Row i of the weight matrix and entry i of the bias vector belong to labels[i].
weights = model.linear.weight.detach().numpy()
biases = model.linear.bias.detach().numpy()

for label, weight_row, bias in zip(labels, weights, biases):
    print(f"\n🧠 Koefisien (W) untuk label '{label}':")
    print(weight_row)
    print(f"📍 Bias (b) untuk label '{label}': {bias:.4f}")


Epoch 0 - Loss: 0.7964
Epoch 100 - Loss: 0.7914
Epoch 200 - Loss: 0.7865
Epoch 300 - Loss: 0.7818
Epoch 400 - Loss: 0.7772
Epoch 500 - Loss: 0.7727
Epoch 600 - Loss: 0.7683
Epoch 700 - Loss: 0.7640
Epoch 800 - Loss: 0.7598
Epoch 900 - Loss: 0.7557

📊 Evaluasi - Label: anjuran
  Accuracy : 0.1985
  Precision: 0.1985
  Recall   : 1.0000
  F1-Score : 0.3313
🧩 Confusion Matrix untuk label 'anjuran':
[[   0 4489]
 [   0 1112]]

📊 Evaluasi - Label: larangan
  Accuracy : 0.1200
  Precision: 0.1186
  Recall   : 1.0000
  F1-Score : 0.2120
🧩 Confusion Matrix untuk label 'larangan':
[[   9 4929]
 [   0  663]]

📊 Evaluasi - Label: informasi
  Accuracy : 0.9591
  Precision: 0.9591
  Recall   : 1.0000
  F1-Score : 0.9791
🧩 Confusion Matrix untuk label 'informasi':
[[   0  229]
 [   0 5372]]

🧠 Koefisien (W) untuk label 'anjuran':
[-0.00210054 -0.00228587  0.00463191 ...  0.00442542 -0.00565291
 -0.00100977]
📍 Bias (b) untuk label 'anjuran': -0.0338

🧠 Koefisien (W) untuk label 'larangan':
[ 0.002832