In [None]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

# -------------------------------
# 1. Load TF-IDF features and labels
# -------------------------------

# TF-IDF features produced by preprocessing the Hadith texts.
# NOTE: unpickling can execute arbitrary code; only load files this project wrote.
with open("../data/tfidf/tfidf_training.pkl", "rb") as tfidf_file:
    X_train_tfidf = pickle.load(tfidf_file)

# Multi-label targets taken from preprocessed_training.csv (semicolon-separated),
# one column per label.
label_columns = ['anjuran', 'larangan', 'informasi']
df_train = pd.read_csv("../data/processed/preprocessed_training.csv", sep=";")
y_train = df_train[label_columns]

# -------------------------------
# 2. Initialise the SGD model
# -------------------------------

# Logistic regression trained with SGD and no regularisation.
sgd_settings = {
    "loss": 'log_loss',            # logistic regression loss
    "penalty": None,               # no regularisation (mirrors the manual formula)
    "learning_rate": 'constant',   # step size stays fixed at eta0
    "eta0": 0.1,                   # learning rate (α)
    "max_iter": 1000,
    "tol": 1e-3,
    "random_state": 42,
}
base_model = SGDClassifier(**sgd_settings)

# Wrap the base estimator for multi-label classification.
model = MultiOutputClassifier(base_model)

# -------------------------------
# 3. Train the model
# -------------------------------
# Fit the multi-output wrapper on the TF-IDF features and the label columns.
model.fit(X_train_tfidf, y_train)

# -------------------------------
# 4. Save the model
# -------------------------------
model_path = Path("../models/sgd_logistic_regression.pkl")
# Make sure the output directory exists before writing; otherwise a missing
# ../models folder raises FileNotFoundError and the trained model is lost.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as f:
    pickle.dump(model, f)

print("✅ Model Multi-Label Logistic Regression dengan SGD berhasil dilatih dan disimpan!")

# -------------------------------
# 5. Show coefficients and bias
# -------------------------------

labels = ['anjuran', 'larangan', 'informasi']

# model.estimators_ is indexed in the same order as the entries of `labels`.
for position, label in enumerate(labels):
    print(f"\n🧠 Koefisien (w) untuk label '{label}':")
    fitted = model.estimators_[position]
    print(fitted.coef_)

    print(f"📍 Bias (intercept) untuk label '{label}':")
    print(fitted.intercept_)

# -------------------------------
# 6. Show the top positive features
# -------------------------------

# Reload the TF-IDF vectorizer so feature indices can be mapped back to names.
with open("../data/tfidf/tfidf_vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

feature_names = np.array(vectorizer.get_feature_names_out())
top_k = 10  # number of highest-weighted features listed per label

for position, label in enumerate(labels):
    coefs = model.estimators_[position].coef_[0]
    # argsort is ascending, so the last top_k indices hold the largest weights.
    top_features = coefs.argsort()[-top_k:]

    print(f"\n🔥 Top fitur dengan bobot tertinggi untuk label '{label}':")
    # Walk the indices backwards so the largest weight is printed first.
    for j in top_features[::-1]:
        print(f"{feature_names[j]}: {coefs[j]:.4f}")


✅ Model Multi-Label Logistic Regression dengan SGD berhasil dilatih dan disimpan!

🧠 Koefisien (w) untuk label 'anjuran':
[[-0.21507665 -0.21872163 -0.08740297 ...  0.80266836 -0.24250613
   0.40998159]]
📍 Bias (intercept) untuk label 'anjuran':
[-2.60313394]

🧠 Koefisien (w) untuk label 'larangan':
[[-0.08787105 -0.03728191 -0.01921851 ... -0.29628719 -0.15003491
   0.75121565]]
📍 Bias (intercept) untuk label 'larangan':
[-2.52777102]

🧠 Koefisien (w) untuk label 'informasi':
[[0.03352161 0.02720881 0.00293403 ... 0.12829215 0.0865273  0.00899189]]
📍 Bias (intercept) untuk label 'informasi':
[1.91864695]

🔥 Top fitur dengan bobot tertinggi untuk label 'anjuran':
hendak: 11.9441
perintah: 9.1038
shalatlah: 8.4555
gerhana: 7.1419
mudah: 6.8787
itikaf: 6.4911
sembelih: 6.4654
thawaf: 6.2592
sabda: 6.1141
ilah: 5.8281

🔥 Top fitur dengan bobot tertinggi untuk label 'larangan':
jangan: 24.8525
larang: 15.0944
laknat: 7.3474
khamr: 4.5333
matta: 4.2838
mahram: 4.2652
haid: 4.1622
barangsiap