In [1]:
# ===== tugas6_fixed.py — Random Forest untuk Klasifikasi (robust untuk dataset kecil) =====
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings untuk clean output

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_curve
)
import joblib
import os

# ---------- Step 1: Load data (falls back to synthetic data if the CSV is missing) ----------
csv_file = "processed_kelulusan.csv"
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
    print(f"✅ Loaded {csv_file} ({len(df)} rows).")
else:
    print(f"⚠ File {csv_file} tidak ditemukan. Menggunakan dummy data untuk testing.")
    np.random.seed(42)
    n = 200  # size of the small synthetic dataset
    # Draw the base features first and derive the engineered columns from
    # them, so every synthetic row is internally consistent with the way the
    # inference sample at the end of this script builds the same columns
    # (absences / 14 and IPK * study hours).
    ipk = np.random.uniform(2.0, 4.0, n)
    absences = np.random.randint(0, 15, n)
    study_hours = np.random.uniform(0, 10, n)
    df = pd.DataFrame({
        "IPK": ipk,
        "Jumlah_Absensi": absences,
        "Waktu_Belajar_Jam": study_hours,
        "Rasio_Absensi": absences / 14,
        "IPK_x_Study": ipk * study_hours,
        "Lulus": np.random.binomial(1, 0.6, n),  # ~60% positive, mild imbalance
    })
    df.to_csv(csv_file, index=False)  # persist the dummy data for the next run
    print(f"💾 Dummy data disimpan ke {csv_file}.")

# Fail early with a clear message when the target column is absent, instead
# of the generic "not found in axis" error raised by DataFrame.drop.
if "Lulus" not in df.columns:
    raise KeyError(f"Kolom target 'Lulus' tidak ditemukan di {csv_file}.")

X = df.drop("Lulus", axis=1)
y = df["Lulus"]

# Split 70/15/15. The first split is stratified on the label; the remaining
# 30% is then halved into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Search for a random_state for the second (unstratified) split.
# Requirement: the test half contains both classes.
# Preference: the validation half contains both classes as well, so that the
# validation F1 is not computed on a single class. When the validation half
# is too small to ever hold two classes, the first seed that satisfies the
# test requirement is accepted immediately.
seed_found = None
fallback_seed = None  # first seed whose test half has both classes
for rs in range(500):
    _, _, y_val_try, y_test_try = train_test_split(
        X_temp, y_temp, test_size=0.50, random_state=rs
    )
    if len(set(y_test_try)) != 2:
        continue
    if fallback_seed is None:
        fallback_seed = rs
    if len(y_val_try) < 2 or len(set(y_val_try)) == 2:
        seed_found = rs
        break

if seed_found is None:
    # No seed gave a mixed validation half; settle for the test-only match.
    seed_found = fallback_seed

if seed_found is None:
    print("⚠ Tidak menemukan seed yang bikin test 2 kelas; pakai rs=42.")
    seed_found = 42
else:
    print(f"✅ random_state kedua = {seed_found} → test set mengandung 0 & 1.")

# Perform the actual split once, with the chosen seed.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=seed_found
)
if len(set(y_val)) < 2:
    print("⚠ Validation set hanya berisi 1 kelas → F1(val) kurang bisa diandalkan.")

# Store the seed for reproducibility.
with open("seed.txt", "w") as f:
    f.write(str(seed_found))
print("💾 Seed disimpan ke seed.txt.")

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Label count — train:\n", y_train.value_counts())
print("Label count — val:\n",   y_val.value_counts())
print("Label count — test:\n",  y_test.value_counts())

# ---------- Step 2: Preprocessing pipeline & baseline Random Forest ----------
num_cols = X_train.select_dtypes(include="number").columns
# Treat pandas "category" columns as categorical as well: with
# remainder="passthrough" below they would otherwise be handed to the forest
# without any encoding.
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns

# Numeric columns: median imputation followed by standard scaling.
transformers = [
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc",  StandardScaler())
    ]), num_cols),
]
# Categorical columns (only when present): constant imputation + one-hot.
if len(cat_cols) > 0:
    transformers.append(
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="constant", fill_value="missing")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols)
    )

# Columns matched by neither selector are passed through unchanged.
pre = ColumnTransformer(transformers, remainder="passthrough")

rf = RandomForestClassifier(
    n_estimators=300, max_features="sqrt",
    class_weight="balanced", random_state=42
)

pipe = Pipeline([("pre", pre), ("clf", rf)])
pipe.fit(X_train, y_train)

# Baseline score on the validation split.
y_val_pred = pipe.predict(X_val)
baseline_f1 = f1_score(y_val, y_val_pred, average="macro")
print("\nBaseline RF — F1(val):", baseline_f1)
print(classification_report(y_val, y_val_pred, digits=3))

# ---------- Step 3: Cross-validation (2 folds, to stay safe on small data) ----------
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
# Summarise the per-fold macro-F1 scores as mean ± standard deviation.
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print("CV F1-macro (train):", cv_mean, "±", cv_std)

# ---------- Step 4: Grid-search tuning (compact grid, aimed at limiting overfitting) ----------
# Keys use the "clf__" prefix so they address the classifier step of the pipeline.
param = dict(
    clf__max_depth=[None, 12, 20, 30],
    clf__min_samples_split=[2, 5, 10],
    # Added to curb overfitting on small datasets.
    clf__min_samples_leaf=[1, 2, 4],
)
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)
print("Best params:", gs.best_params_)
best_model = gs.best_estimator_

# Score the tuned model on the validation split.
y_val_best = best_model.predict(X_val)
best_f1 = f1_score(y_true=y_val, y_pred=y_val_best, average="macro")
print("Best RF — F1(val):", best_f1)

# ---------- Step 5: Final evaluation on TEST ----------
# Choose the final model from the validation scores already computed: keep
# the tuned model unless the baseline pipeline scored strictly better on the
# validation split (ties go to the tuned model).
final_model = best_model if best_f1 >= baseline_f1 else pipe
print("Model final:", "tuned (GridSearch)" if final_model is best_model else "baseline")

y_test_pred = final_model.predict(X_test)
test_f1 = f1_score(y_test, y_test_pred, average="macro")
print("\n=== TEST EVALUATION ===")
print("F1(test):", test_f1)
print(classification_report(y_test, y_test_pred, digits=3))

# Overfitting check: compare macro-F1 on the training data with the test score.
y_train_pred = final_model.predict(X_train)
train_f1 = f1_score(y_train, y_train_pred, average="macro")
print("F1(train):", train_f1)
if train_f1 - test_f1 > 0.1:
    print("⚠ Potensi overfitting: Gap F1 train-test > 0.1. Pertimbangkan tuning lebih ketat.")

# Pin the label order so the matrix is always 2x2 (rows/cols = class 0, 1),
# even if a class is absent from the test split or from the predictions;
# the plot below is labelled with exactly these two classes.
cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
print("Confusion Matrix (test):\n", cm)

# Save the confusion matrix as an image.
def plot_cm(cm, classes, title, filename):
    """Draw a confusion matrix as an annotated heat map and save it to a file.

    Args:
        cm: 2-D array of integer counts; each cell is formatted with ``'d'``.
        classes: Tick labels applied to both the x and y axes.
        title: Title shown above the plot.
        filename: Output path passed to ``plt.savefig``.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    heatmap = ax.imshow(cm, interpolation='nearest', cmap='Blues')
    fig.colorbar(heatmap, ax=ax)

    tick_positions = np.arange(len(classes))
    ax.set(
        xticks=tick_positions, yticks=tick_positions,
        xticklabels=classes, yticklabels=classes,
        xlabel='Predicted', ylabel='True', title=title,
    )

    # Cells above half of the largest count get white text, for contrast
    # against the darker background colour.
    peak = cm.max()
    threshold = peak / 2 if peak > 0 else 0.5
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        count = cm[row, col]
        if count > threshold:
            text_color = 'white'
        else:
            text_color = 'black'
        ax.text(col, row, format(count, 'd'),
                ha='center', va='center', color=text_color)

    plt.tight_layout()
    plt.savefig(filename, dpi=120, bbox_inches='tight')
    plt.close()
    print(f"💾 Saved: {filename}")

plot_cm(cm, classes=["Tidak Lulus (0)", "Lulus (1)"], title="Confusion Matrix (test)", filename="cm_test.png")

# ROC + PR curves on the test split.
# Only the predict_proba call is guarded against AttributeError, so unrelated
# attribute errors in the plotting code are not silently swallowed.
try:
    y_test_proba = final_model.predict_proba(X_test)[:, 1]
except AttributeError:
    print("⚠ Model tidak punya predict_proba → ROC/PR di-skip.")
else:
    if len(set(y_test)) < 2:
        # The fallback split may leave a single class in the test set; ROC AUC
        # is undefined in that case, so skip the curves instead of failing.
        print("⚠ Test set hanya berisi 1 kelas → ROC/PR di-skip.")
    else:
        auc = roc_auc_score(y_test, y_test_proba)
        print("ROC-AUC(test):", auc)

        fpr, tpr, _ = roc_curve(y_test, y_test_proba)
        plt.figure(figsize=(6, 5))
        plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
        plt.plot([0, 1], [0, 1], '--', label="Random")  # chance diagonal
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve (test)")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig("roc_test.png", dpi=120, bbox_inches='tight')
        plt.close()
        print("💾 Saved: roc_test.png")

        prec, rec, _ = precision_recall_curve(y_test, y_test_proba)
        plt.figure(figsize=(6, 5))
        plt.plot(rec, prec, label="PR Curve")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title("Precision-Recall Curve (test)")
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig("pr_test.png", dpi=120, bbox_inches='tight')
        plt.close()
        print("💾 Saved: pr_test.png")

# ---------- Step 6: Feature importance ----------
def clean_feature_name(name, prefixes=("num__", "cat__", "remainder__")):
    """Return *name* without its leading transformer prefix, if it has one.

    Only a prefix at the very start of the name is removed; the rest of the
    name, including any double underscores inside it, is left untouched.

    Args:
        name: Feature name as produced by ``get_feature_names_out``.
        prefixes: Prefixes to strip; the first one that matches is removed.

    Returns:
        The name with the matching prefix removed, or *name* unchanged.
    """
    for prefix in prefixes:
        if name.startswith(prefix):
            return name[len(prefix):]
    return name


# Best-effort report: any failure here is printed and the script continues,
# so the broad ``except Exception`` is kept deliberately.
try:
    importances = final_model.named_steps["clf"].feature_importances_
    feat_names = final_model.named_steps["pre"].get_feature_names_out()
    feat_names_clean = [clean_feature_name(name) for name in feat_names]

    # Rank features from most to least important.
    top = sorted(zip(feat_names_clean, importances), key=lambda x: x[1], reverse=True)
    print("\nTop 10 Feature Importance:")
    for name, val in top[:10]:
        print(f"  {name}: {val:.4f}")
except Exception as e:
    print("⚠ Feature importance tidak tersedia:", str(e))

# ---------- Step 7: Save the model ----------
# Persist the final fitted pipeline (preprocessing + classifier) with joblib.
model_path = "rf_model.pkl"
joblib.dump(final_model, model_path)
print(f"\n💾 Model disimpan sebagai {model_path}")

# ---------- Step 8: Local inference sanity check ----------
sample_data = {
    "IPK": 3.4,
    "Jumlah_Absensi": 4,
    "Waktu_Belajar_Jam": 7,
    "Rasio_Absensi": 4/14,
    "IPK_x_Study": 3.4*7
}
sample = pd.DataFrame([sample_data])

# Compare the sample's columns with the training columns and report exactly
# which ones differ, so a schema drift is visible rather than silently patched.
expected_cols = X_train.columns.tolist()
sample_cols = sample.columns.tolist()
missing_cols = [col for col in expected_cols if col not in sample_cols]
extra_cols = [col for col in sample_cols if col not in expected_cols]
if missing_cols or extra_cols:
    print("⚠ Warning: Sample kolom tidak match dengan train. Menyesuaikan...")
    print(f"   Kolom hilang (diisi 0): {missing_cols} | kolom ekstra (dibuang): {extra_cols}")

# Always align to the training column order: missing columns are filled with
# 0, extra ones are dropped, and a pure ordering difference is normalised so
# the prediction does not depend on how the pipeline treats reordered input.
sample = sample.reindex(columns=expected_cols, fill_value=0)

pred = int(final_model.predict(sample)[0])
verdict = 'Ya' if pred == 1 else 'Tidak'
try:
    proba = float(final_model.predict_proba(sample)[:, 1][0])
except AttributeError:
    proba = None

if proba is None:
    print(f"Contoh prediksi sample → {pred} (Lulus: {verdict}) | proba: Tidak tersedia")
else:
    print(f"Contoh prediksi sample → {pred} (Lulus: {verdict}) | proba lulus: {proba:.3f}")

# Optional: show the plots when running in Jupyter/an IDE (keep commented for CLI runs)
# plt.show()

print("\n✅ Kode selesai! Jalankan dengan data real untuk hasil akurat.")


✅ Loaded processed_kelulusan.csv (10 rows).
✅ random_state kedua = 0 → test set mengandung 0 & 1.
💾 Seed disimpan ke seed.txt.
Shapes: (7, 5) (1, 5) (2, 5)
Label count — train:
 Lulus
1    4
0    3
Name: count, dtype: int64
Label count — val:
 Lulus
0    1
Name: count, dtype: int64
Label count — test:
 Lulus
1    1
0    1
Name: count, dtype: int64

Baseline RF — F1(val): 1.0
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         1

    accuracy                          1.000         1
   macro avg      1.000     1.000     1.000         1
weighted avg      1.000     1.000     1.000         1

CV F1-macro (train): 1.0 ± 0.0
Fitting 2 folds for each of 36 candidates, totalling 72 fits
Best params: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}
Best RF — F1(val): 1.0

=== TEST EVALUATION ===
F1(test): 1.0
              precision    recall  f1-score   support

           0      1.000     1.000     1.000  