### **1. Import Library**
# Mengimpor pustaka yang diperlukan

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

### **2. Load Data**
# Memuat data yang diperlukan


In [2]:
# Load TF-IDF training data
with open("../data/tfidf/tfidf_training.pkl", "rb") as f:
    X_train_tfidf = pickle.load(f)

### **3. Data Preprocessing**
# mengambil data label setiap dokumen

In [4]:
# Load labels
df_train = pd.read_csv("../data/processed/preprocessed_training.csv", delimiter=";")
y_train = df_train[['anjuran', 'larangan', 'informasi']]

def display_table(df, title):
    styled_df = df.head(3).style.set_properties(**{"text-align": "left"}).set_caption(title)
    display(styled_df)

display_table(y_train, "Training Data")


Unnamed: 0,anjuran,larangan,informasi
0,1,1,1
1,0,0,1
2,0,1,1


### 4. **Logistic Regression dalam MultiOutputClassifier**
# Membuat model Logistic Regression dalam MultiOutputClassifier

In [5]:
# Inisialisasi dan latih model Logistic Regression dalam MultiOutputClassifier
base_model = LogisticRegression(max_iter=1000)
model = MultiOutputClassifier(base_model)
model.fit(X_train_tfidf, y_train)

### **5. Simpan Model**
# Menyimpan model yang telah dibuat

In [11]:
# Simpan model
with open("../models/logistic_regression.pkl", "wb") as f:
    pickle.dump(model, f)

print("âœ… Model Multi-Label Logistic Regression berhasil dilatih dan disimpan!")

âœ… Model Multi-Label Logistic Regression berhasil dilatih dan disimpan!


In [26]:
import math
import pickle
import pandas as pd
import numpy as np

# --- STEP 1: Load TF-IDF and labels ---
with open("../data/tfidf/tfidf_training.pkl", "rb") as f:
    X = pickle.load(f).toarray()  # Ubah ke array biasa

df_train = pd.read_csv("../data/processed/preprocessed_training.csv", delimiter=";")
Y_multi = df_train[['anjuran', 'larangan', 'informasi']].values

m, n = X.shape
k = Y_multi.shape[1]  # Jumlah label

# --- STEP 2: Sigmoid Function ---
def sigmoid(z):
    return 1 / (1 + math.exp(-z))

# --- STEP 3: Cost Function ---
def compute_cost(Y, Yp):
    total = 0
    for i in range(len(Y)):
        total += Y[i]*math.log(Yp[i] + 1e-15) + (1 - Y[i])*math.log(1 - Yp[i] + 1e-15)
    return (-1/len(Y)) * total

# --- STEP 4: Transpose & Helpers ---
def transpose(X):
    return list(map(list, zip(*X)))

def dot(a, b):
    return sum(i * j for i, j in zip(a, b))

def subtract(a, b):
    return [i - j for i, j in zip(a, b)]

def multiply(a, b):  # Scalar * vector
    return [a * i for i in b]

def sum_vector(vec):
    return sum(vec)

# --- STEP 5: Gradients ---
def compute_dW(X, Y, Yp):
    Xt = transpose(X)
    delta = subtract(Yp, Y)
    dW = [(-1/len(Y)) * dot(Xt[j], delta) for j in range(len(Xt))]
    return dW

def compute_db(Y, Yp):
    delta = subtract(Yp, Y)
    return (-1/len(Y)) * sum(delta)

# --- STEP 6: Training Per Label ---
def train_logistic_regression(X, Y, alpha=0.1, epochs=100):
    m, n = len(X), len(X[0])
    W = [0.0 for _ in range(n)]
    b = 0.0

    for epoch in range(epochs):
        Yp = []
        for i in range(m):
            z = dot(X[i], W) + b
            Yp.append(sigmoid(z))
        
        dW = compute_dW(X, Y, Yp)
        db = compute_db(Y, Yp)

        W = subtract(W, multiply(alpha, dW))
        b -= alpha * db

    return W, b

# --- STEP 7: Predict with Threshold ---
def predict(X, W, b):
    preds = []
    for i in range(len(X)):
        z = dot(X[i], W) + b
        p = sigmoid(z)
        preds.append(1 if p > 0.5 else 0)
    return preds

# --- STEP 8: Confusion Matrix and Metrics ---
def confusion(Y, Yp):
    tp = tn = fp = fn = 0
    for i in range(len(Y)):
        if Y[i] == 1 and Yp[i] == 1:
            tp += 1
        elif Y[i] == 0 and Yp[i] == 0:
            tn += 1
        elif Y[i] == 0 and Yp[i] == 1:
            fp += 1
        elif Y[i] == 1 and Yp[i] == 0:
            fn += 1
    return tp, tn, fp, fn

def calculate_metrics(tp, tn, fp, fn):
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    return accuracy, precision, recall, f1

# --- STEP 9: Training Multi-label ---
label_names = ['anjuran', 'larangan', 'informasi']
models = []

for i in range(k):
    print(f"\nTraining model for label: {label_names[i]}")
    Y = Y_multi[:, i].tolist()
    W, b = train_logistic_regression(X, Y, alpha=0.1, epochs=100)
    Y_pred = predict(X, W, b)
    tp, tn, fp, fn = confusion(Y, Y_pred)
    acc, prec, rec, f1 = calculate_metrics(tp, tn, fp, fn)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")

    models.append((W, b))  # Simpan model

print("\nâœ… Training selesai untuk semua label.")



Training model for label: anjuran
Accuracy:  0.1985
Precision: 0.1985
Recall:    1.0000
F1-Score:  0.3313

Training model for label: larangan
Accuracy:  0.1184
Precision: 0.1184
Recall:    1.0000
F1-Score:  0.2117

Training model for label: informasi
Accuracy:  0.0409
Precision: 0.0000
Recall:    0.0000
F1-Score:  0.0000

âœ… Training selesai untuk semua label.


In [28]:
# Simpan model per label ke file
for i, (W, b) in enumerate(models):
    with open(f'../models/logreg_model_{label_names[i]}.pkl', 'wb') as f:
        pickle.dump({'W': W, 'b': b}, f)

print("\nâœ… Model berhasil disimpan ke folder '../model/'")


âœ… Model berhasil disimpan ke folder '../model/'


In [30]:
# Load TF-IDF untuk data testing
with open("../data/tfidf/tfidf_training.pkl", "rb") as f:
    X_test = pickle.load(f).toarray()

# Load label sebenarnya
df_test = pd.read_csv("../data/processed/preprocessed_testing.csv", delimiter=";")
Y_test_multi = df_test[['anjuran', 'larangan', 'informasi']].values


In [32]:
print("\nðŸ”Ž Evaluasi Model di Data Testing")
for i in range(k):
    label = label_names[i]
    with open(f'../models/logreg_model_{label}.pkl', 'rb') as f:
        model = pickle.load(f)
    W, b = model['W'], model['b']
    
    Y_test = Y_test_multi[:, i].tolist()
    Y_pred = predict(X_test, W, b)
    
    tp, tn, fp, fn = confusion(Y_test, Y_pred)
    acc, prec, rec, f1 = calculate_metrics(tp, tn, fp, fn)

    print(f"\nLabel: {label}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")



ðŸ”Ž Evaluasi Model di Data Testing

Label: anjuran
Accuracy:  0.1593
Precision: 0.1593
Recall:    1.0000
F1-Score:  0.2748

Label: larangan
Accuracy:  0.1300
Precision: 0.1300
Recall:    1.0000
F1-Score:  0.2301

Label: informasi
Accuracy:  0.0986
Precision: 0.0000
Recall:    0.0000
F1-Score:  0.0000
