In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report, confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA

In [2]:
# Load data
X_train = pd.read_csv("../output/X_train.csv")
X_test = pd.read_csv("../output/X_test.csv")
y_train = pd.read_csv("../output/y_train.csv").squeeze()
y_test = pd.read_csv("../output/y_test.csv").squeeze()

print("Data loaded successfully")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("Distribusi y_train:")
print(y_train.value_counts(normalize=True))

Data loaded successfully
Train shape: (3066, 29), Test shape: (767, 29)
Distribusi y_train:
Churn
0    0.804305
1    0.195695
Name: proportion, dtype: float64


In [3]:
# Definisikan pipeline tiap model
logreg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=5000, random_state=42))
])

svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC(probability=True, class_weight='balanced', random_state=42))
])

knn_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA()),          # PCA valid
    ("model", KNeighborsClassifier())
])

rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
lgbm_model = LGBMClassifier(random_state=42)

In [4]:
# Param grids 
param_grids = {
    "Logistic Regression": {
        "model__C": [0.5, 1, 2],
        "model__solver": ["lbfgs"],
        "model__penalty": ["l2"]
    },

    "SVM": {
        "model__C": [0.1, 0.5, 1],
        "model__kernel": ["rbf", "linear"],
        "model__gamma": ["scale"]
    },

    "KNN": {
        "model__n_neighbors": [7, 11, 15, 21],
        "model__weights": ["uniform"],
        "pca__n_components": [5, 7, 10]
    },

    "Random Forest": {
        "n_estimators": [200, 400],
        "max_depth": [4, 6, 8],
        "min_samples_split": [20, 40, 60],
        "min_samples_leaf": [8, 12, 16],   # PENTING
        "max_features": ["sqrt", "log2"],
        "bootstrap": [True],
        "max_samples": [0.6, 0.8]          # NEW: bagging lebih kecil → lebih general
    },

    "XGBoost": {
        "n_estimators": [100],
        "max_depth": [3, 4],
        "learning_rate": [0.05],
        "subsample": [0.8],
        "colsample_bytree": [0.8]
    },

    "LightGBM": {
        "max_depth": [4, 6],        # shallow saja – mencegah overfit
        "num_leaves": [8, 16],      # selalu ≤ 2^(max_depth)
        "min_child_samples": [20, 40],   # regularisasi paling kuat
        "learning_rate": [0.03],    # tetap stabil
        "n_estimators": [200],      # cukup, tidak berlebihan
        "subsample": [0.7],         # bagging moderate
        "colsample_bytree": [0.7],  # feature sampling moderate
        "reg_lambda": [1],          # L2 penalty untuk stabilisasi
    }


}

In [5]:
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def tune_model_once(model, name, X_train, y_train):
    if name not in param_grids:
        print(f"Tidak ada param_grid untuk {name}, pakai default model.")
        return model

    print(f"\n Tuning {name} ...")
    param_grid = param_grids[name]

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1',
        cv=3,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)
    print(f"✅ Best params for {name}: {grid_search.best_params_}")

    return grid_search.best_estimator_


In [6]:
# Evaluasi dengan CV 5-Fold
def evaluate_model_cv(model, name, X, y):
    acc_scores, prec_scores, rec_scores, f1_scores, roc_scores = [], [], [], [], []

    for train_idx, test_idx in cv_outer.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
        y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_test_fold)

        # ROC AUC
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test_fold)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test_fold)
        else:
            y_score = y_pred

        acc_scores.append(accuracy_score(y_test_fold, y_pred))
        prec_scores.append(precision_score(y_test_fold, y_pred))
        rec_scores.append(recall_score(y_test_fold, y_pred))
        f1_scores.append(f1_score(y_test_fold, y_pred))
        roc_scores.append(roc_auc_score(y_test_fold, y_score))

    metrics = {
        "Model": name,
        "Accuracy": np.mean(acc_scores),
        "Precision": np.mean(prec_scores),
        "Recall": np.mean(rec_scores),
        "F1 Score": np.mean(f1_scores),
        "ROC AUC": np.mean(roc_scores)
    }

    print(f"\n=== {name} (CV 5-Fold) ===")
    for k, v in metrics.items():
        if k != "Model":
            print(f"{k}: {v:.4f}")

    return metrics

In [7]:
# Jalankan tuning
models = {
    "Logistic Regression": logreg_pipeline,
    "Random Forest": rf_model,
    "KNN": knn_pipeline,
    "SVM": svm_pipeline,
    "XGBoost": xgb_model,
    "LightGBM": lgbm_model
}
tuned_models = []

for name, model in models.items():
    best_model = tune_model_once(model, name, X_train, y_train)
    tuned_models.append((best_model, name))


 Tuning Logistic Regression ...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
✅ Best params for Logistic Regression: {'model__C': 1, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}

 Tuning Random Forest ...
Fitting 3 folds for each of 216 candidates, totalling 648 fits
✅ Best params for Random Forest: {'bootstrap': True, 'max_depth': 8, 'max_features': 'sqrt', 'max_samples': 0.8, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 400}

 Tuning KNN ...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
✅ Best params for KNN: {'model__n_neighbors': 7, 'model__weights': 'uniform', 'pca__n_components': 10}

 Tuning SVM ...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
✅ Best params for SVM: {'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}

 Tuning XGBoost ...
Fitting 3 folds for each of 2 candidates, totalling 6 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Best params for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8}

 Tuning LightGBM ...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 600, number of negative: 2466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 3066, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195695 -> initscore=-1.413423
[LightGBM] [Info] Start training from score -1.413423
✅ Best params for LightGBM: {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 16, 'reg_lambda': 1, 'subsample': 0.7}


In [8]:
# Evaluasi hasil CV
results_cv = []
for model, name in tuned_models:
    result = evaluate_model_cv(model, name, X_train, y_train)
    results_cv.append(result)

results_cv_df = pd.DataFrame(results_cv).sort_values(by="F1 Score", ascending=False).reset_index(drop=True)
print("\n Hasil Akhir Setelah Hyperparameter Tuning:")
print(results_cv_df)

joblib.dump(tuned_models[0][0], "best_model.pkl")
print("\n Model terbaik berhasil disimpan")


=== Logistic Regression (CV 5-Fold) ===
Accuracy: 0.8764
Precision: 0.7510
Recall: 0.5550
F1 Score: 0.6361
ROC AUC: 0.8928

=== Random Forest (CV 5-Fold) ===
Accuracy: 0.8780
Precision: 0.8536
Recall: 0.4550
F1 Score: 0.5926
ROC AUC: 0.9323

=== KNN (CV 5-Fold) ===
Accuracy: 0.8288
Precision: 0.6020
Recall: 0.3633
F1 Score: 0.4530
ROC AUC: 0.7987

=== SVM (CV 5-Fold) ===
Accuracy: 0.8832
Precision: 0.6521
Recall: 0.8700
F1 Score: 0.7448
ROC AUC: 0.9441


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost (CV 5-Fold) ===
Accuracy: 0.9041
Precision: 0.8185
Recall: 0.6550
F1 Score: 0.7273
ROC AUC: 0.9418
[LightGBM] [Info] Number of positive: 480, number of negative: 1972
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 2452, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195759 -> initscore=-1.413017
[LightGBM] [Info] Start training from score -1.413017
[LightGBM] [Info] Number of positive: 480, number of negative: 1973
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 2453, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: p

In [9]:
# Check overfitting
# =============================
def check_overfitting(model, X_train, X_test, y_train, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    train_f1 = f1_score(y_train, y_pred_train)
    test_f1 = f1_score(y_test, y_pred_test)

    # Hitung ukuran data
    n_train = len(X_train)
    n_test = len(X_test)
    print(f"Jumlah data: train={n_train}, test={n_test}")

    accuracy_threshold = 0.07
    f1_threshold = 0.12

    # Tampilkan metrik
    print(f"Train Accuracy: {train_acc:.4f} | F1: {train_f1:.4f}")
    print(f"Test Accuracy:  {test_acc:.4f} | F1: {test_f1:.4f}")
    print(f"Selisih Accuracy: {abs(train_acc - test_acc):.4f}")
    print(f"Selisih F1:       {abs(train_f1 - test_f1):.4f}")

    # Evaluasi overfitting
    if abs(train_acc - test_acc) > accuracy_threshold or abs(train_f1 - test_f1) > f1_threshold:
        print("⚠️  Model kemungkinan overfitting (di atas ambang toleransi)!")
    else:
        print("Tidak terdeteksi overfitting signifikan.")


for model, name in tuned_models:
    print(f"\n=== Check Overfitting: {name} ===")
    model.fit(X_train, y_train)
    check_overfitting(model, X_train, X_test, y_train, y_test)



=== Check Overfitting: Logistic Regression ===
Jumlah data: train=3066, test=767
Train Accuracy: 0.8819 | F1: 0.6499
Test Accuracy:  0.8774 | F1: 0.6179
Selisih Accuracy: 0.0045
Selisih F1:       0.0320
Tidak terdeteksi overfitting signifikan.

=== Check Overfitting: Random Forest ===
Jumlah data: train=3066, test=767
Train Accuracy: 0.9136 | F1: 0.7282
Test Accuracy:  0.8944 | F1: 0.6368
Selisih Accuracy: 0.0192
Selisih F1:       0.0914
Tidak terdeteksi overfitting signifikan.

=== Check Overfitting: KNN ===
Jumlah data: train=3066, test=767
Train Accuracy: 0.8751 | F1: 0.6112
Test Accuracy:  0.8422 | F1: 0.4762
Selisih Accuracy: 0.0328
Selisih F1:       0.1350
⚠️  Model kemungkinan overfitting (di atas ambang toleransi)!

=== Check Overfitting: SVM ===
Jumlah data: train=3066, test=767
Train Accuracy: 0.9220 | F1: 0.8299
Test Accuracy:  0.8918 | F1: 0.7566
Selisih Accuracy: 0.0303
Selisih F1:       0.0733
Tidak terdeteksi overfitting signifikan.

=== Check Overfitting: XGBoost ===
J

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
