# Multi-Label Classification with Logistic Regression

This notebook demonstrates a manual implementation of multi-label classification using Logistic Regression, following the approach in the referenced journal. We will use TF-IDF features extracted from Hadith text, and manually train a binary classifier for each label (anjuran, larangan, informasi).

In [18]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

# ============================
# 1. Menyiapkan Data Aktual (X, Y)
# ============================
with open("../data/tfidf/tfidf_training.pkl", "rb") as f:
    X_train_tfidf = pickle.load(f).toarray()  # X = data fitur (hasil TF-IDF)

df_train = pd.read_csv("../data/processed/preprocessed_training.csv", delimiter=";")
y_train = df_train[['anjuran', 'larangan', 'informasi']].values  # Y = label multi-label

# ============================
# 2. Inisialisasi Bobot dan Bias
# ============================
class ManualLogisticRegression:
    def __init__(self, learning_rate=0.1, n_iter=1000):
        self.lr = learning_rate        # α = learning rate
        self.n_iter = n_iter           # jumlah iterasi (epoch)
        self.weights = None            # bobot awal w = 0
        self.bias = None               # bias awal b = 0

    # ============================
    # 4. Menerapkan Fungsi Sigmoid
    # Rumus: sigmoid(z) = 1 / (1 + e^(-z))
    # ============================
    @staticmethod
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Inisialisasi bobot dan bias
        self.weights = np.zeros(n_features)
        self.bias = 0

        # ============================
        # Proses iterasi Gradient Descent
        # ============================
        for _ in range(self.n_iter):

            # ============================
            # 3. Menghitung Prediksi Awal (z = Xw + b)
            # ============================
            linear_pred = np.dot(X, self.weights) + self.bias

            # ============================
            # 4. Menerapkan Sigmoid → Y_pred
            # ============================
            y_pred = self.sigmoid(linear_pred)

            # ============================
            # 5. Menghitung Fungsi Biaya (Opsional, bisa tambahkan loss log loss jika ingin)
            # 6. Menghitung Turunan Parsial (gradien)
            # dw = (1/n) * X.T · (Y_pred - Y)
            # db = (1/n) * jumlah(Y_pred - Y)
            # ============================
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)

            # ============================
            # 7. Memperbarui Bobot dan Bias
            # w = w - α * dw
            # b = b - α * db
            # ============================
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    # ============================
    # 8. Menghitung Prediksi Akhir
    # Jika sigmoid ≥ 0.5 → 1, jika < 0.5 → 0
    # ============================
    def predict(self, X, threshold=0.5):
        linear_pred = np.dot(X, self.weights) + self.bias
        y_pred = self.sigmoid(linear_pred)
        return (y_pred >= threshold).astype(int)

# ============================
# Melatih model untuk setiap label: anjuran, larangan, informasi
# ============================
models = {}
labels = ['anjuran', 'larangan', 'informasi']

for i, label in enumerate(labels):
    print(f"\n=== Melatih model untuk label: {label} ===")

    # Inisialisasi model
    lr = ManualLogisticRegression(learning_rate=0.01, n_iter=2000)
    lr.fit(X_train_tfidf, y_train[:, i])

    # Simpan model per label
    models[label] = lr

    # Prediksi pada data training
    y_pred = lr.predict(X_train_tfidf)

    # ============================
    # 9. Menghitung Confusion Matrix
    # 10. Menghitung Ukuran Statistik (Presisi, Recall, F1)
    # Menggunakan classification_report dari sklearn
    # ============================
    print(classification_report(
        y_train[:, i],
        y_pred,
        target_names=[f'Tidak {label}', label]
    ))

# ============================
# Menyimpan semua model dalam satu file pickle
# ============================
with open("../models/manual_logreg_models.pkl", "wb") as f:
    pickle.dump(models, f)

# ============================
# Fungsi untuk memprediksi semua label sekaligus
# ============================
def predict_all_labels(X, models, threshold=0.5):
    y_pred = np.zeros((X.shape[0], len(models)))  # [jumlah_data x 3 label]
    for i, label in enumerate(models.keys()):
        y_pred[:, i] = models[label].predict(X, threshold)
    return y_pred

# ============================
# Contoh penggunaan prediksi multi-label
# ============================
y_train_pred = predict_all_labels(X_train_tfidf, models)
print("\nHasil Prediksi untuk semua label:")
print(pd.DataFrame(y_train_pred, columns=labels).head())



=== Melatih model untuk label: anjuran ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

Tidak anjuran       0.80      1.00      0.89      4489
      anjuran       0.00      0.00      0.00      1112

     accuracy                           0.80      5601
    macro avg       0.40      0.50      0.44      5601
 weighted avg       0.64      0.80      0.71      5601


=== Melatih model untuk label: larangan ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

Tidak larangan       0.88      1.00      0.94      4938
      larangan       0.00      0.00      0.00       663

      accuracy                           0.88      5601
     macro avg       0.44      0.50      0.47      5601
  weighted avg       0.78      0.88      0.83      5601


=== Melatih model untuk label: informasi ===
                 precision    recall  f1-score   support

Tidak informasi       0.00      0.00      0.00       229
      informasi       0.96      1.00      0.98      5372

       accuracy                           0.96      5601
      macro avg       0.48      0.50      0.49      5601
   weighted avg       0.92      0.96      0.94      5601


Hasil Prediksi untuk semua label:
   anjuran  larangan  informasi
0      0.0       0.0        1.0
1      0.0       0.0        1.0
2      0.0       0.0        1.0
3      0.0       0.0        1.0
4      0.0       0.0        1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
