In [None]:
#HEPATITIS DATA ANALYSIS & CLASSIFICATION PIPELINE
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, classification_report,
    roc_auc_score, roc_curve, accuracy_score,
    precision_score, recall_score, f1_score
)
from joblib import dump

Import seluruh library yang digunakan untuk analisis, preprocessing, dan evaluasi model.

In [None]:
# BASIC CONFIGURATION
# Seed reused by the train/test split, the CV splitter and every estimator.
RANDOM_STATE = 42
# Input CSV. Defaults to the Colab upload location; set the HEPATITIS_DATA_PATH
# environment variable to run the notebook elsewhere without editing this cell.
DATA_PATH = Path(os.environ.get("HEPATITIS_DATA_PATH", "/content/hepatitis.csv"))
# Directory that receives the generated plots and CSV summaries (created if absent).
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
TEST_SIZE = 0.2   # fraction of rows held out as the test split
SELECT_K = 8      # upper bound on the number of features kept by SelectKBest
CV_FOLDS = 5      # number of StratifiedKFold splits
PLOT_DPI = 150    # resolution used when saving figures

Menentukan parameter dasar seperti path data, proporsi test, jumlah fitur yang dipilih (SelectKBest), jumlah fold CV, dan resolusi plot.

In [None]:
#FUNGSI BANTUAN
def safe_read_csv(path):
    """Read a CSV file into a DataFrame, failing fast when the file is absent.

    Args:
        path: Location of the CSV file (string or path-like).

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    csv_exists = Path(path).exists()
    if csv_exists:
        return pd.read_csv(path)
    raise FileNotFoundError(f"File not found: {path}")

def ensure_numeric_boolean_columns(df):
    """Clean text columns: mark "?" as missing and convert to numbers where possible.

    Every object/string-typed column has the literal "?" placeholder replaced
    by NaN. The column is then converted with ``pd.to_numeric``; when it holds
    values that cannot be parsed as numbers it stays textual, but still with
    "?" replaced by NaN so that later imputation treats it as missing.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    for col in df.columns:
        series = df[col]
        # Cover both the classic object dtype and pandas' dedicated string dtypes.
        is_textual = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_textual:
            continue
        cleaned = series.replace("?", np.nan)
        try:
            df[col] = pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            # Genuinely non-numeric column: keep it as text, but do not leave
            # the "?" marker behind as if it were a real category.
            df[col] = cleaned
    return df

def plot_and_save_confusion_matrix(cm, labels, title, fname):
    """Draw a confusion matrix as an annotated heatmap and save it to a file.

    Args:
        cm: Confusion-matrix counts to display (annotated as integers).
        labels: Tick labels used on both the x (predicted) and y (actual) axes.
        title: Figure title.
        fname: Destination passed to ``savefig``.

    The figure is closed after saving, so nothing is left open or displayed.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_ylabel("Actual")
    ax.set_xlabel("Predicted")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(fname, dpi=PLOT_DPI)
    plt.close(fig)

def plot_and_save_roc(y_true, y_score, title, fname):
    """Plot the ROC curve for a set of scores, save it, and return the AUC.

    Args:
        y_true: True labels, as accepted by ``roc_curve``.
        y_score: Scores for the positive class, as accepted by ``roc_curve``.
        title: Figure title.
        fname: Destination passed to ``savefig``.

    Returns:
        The value of ``roc_auc_score(y_true, y_score)``; it is also shown in
        the plot legend.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    auc = roc_auc_score(y_true, y_score)

    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(false_pos_rate, true_pos_rate, label=f"AUC = {auc:.3f}")
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(title)
    ax.legend(loc="lower right")
    fig.tight_layout()
    fig.savefig(fname, dpi=PLOT_DPI)
    plt.close(fig)
    return auc

def plot_and_save_learning_curve(estimator, X, y, title, fname, cv=3):
    """Compute an accuracy learning curve for ``estimator`` and save the plot.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        X: Feature data.
        y: Target values.
        title: Figure title.
        fname: Destination passed to ``savefig``.
        cv: Cross-validation setting forwarded to ``learning_curve`` (default 3).

    Five training sizes spaced evenly between 10% and 100% are evaluated. The
    plotted "Train" and "CV" lines are the means over axis 1 of the returned
    score arrays, i.e. one point per training size.
    """
    sizes, fit_scores, val_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        train_sizes=np.linspace(0.1, 1.0, 5),
        scoring="accuracy",
    )
    mean_fit = fit_scores.mean(axis=1)
    mean_val = val_scores.mean(axis=1)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(sizes, mean_fit, label="Train")
    ax.plot(sizes, mean_val, label="CV")
    ax.set_xlabel("Training Size")
    ax.set_ylabel("Accuracy")
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(fname, dpi=PLOT_DPI)
    plt.close(fig)


Fungsi-fungsi pendukung untuk membaca data, mengonversi kolom teks yang berisi angka menjadi tipe numerik (tanda "?" diperlakukan sebagai nilai hilang), serta membuat dan menyimpan plot Confusion Matrix, ROC Curve, dan Learning Curve.

In [None]:
# EDA (Exploratory Data Analysis)
# Load the raw CSV and normalise text columns before inspecting the data.
df = safe_read_csv(DATA_PATH)
df = ensure_numeric_boolean_columns(df)

print("Dataset Loaded:", DATA_PATH)
print("Shape:", df.shape)
print("\n--- Info Dataset ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line).
df.info()
print("\n--- Contoh Data ---")
print(df.head())
print("\n--- Nilai Hilang ---")
print(df.isnull().sum())
print("\n--- Statistik Deskriptif ---")
print(df.describe())

Melakukan analisis awal terhadap data: melihat ukuran dataset, tipe data, jumlah nilai hilang, dan statistik dasar (min, max, mean, std).

In [None]:
# DETERMINE THE TARGET COLUMN
# Prefer a column whose lower-cased name is a common label name; if none
# matches, fall back to the last column. str() guards against non-string
# column labels, which have no .lower().
target_candidates = [
    c for c in df.columns
    if str(c).lower() in ("class", "target", "status", "result", "outcome", "death", "label", "y")
]
target_col = target_candidates[0] if target_candidates else df.columns[-1]
print("\n Kolom Target:", target_col)

X = df.drop(columns=[target_col])
y = df[target_col]

# Encode any non-numeric target (object, category or a dedicated string dtype)
# to integer codes; numeric targets are passed through untouched.
# NOTE(review): numeric targets are assumed to already be coded 0/1 - confirm,
# since the evaluation cell relies on the metrics' default positive label.
label_encoder = None
if not pd.api.types.is_numeric_dtype(y):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y.astype(str))

print("Distribusi Target:\n", pd.Series(y).value_counts())

Menentukan kolom mana yang menjadi variabel target (label kelas). Jika target
bertipe string, diubah menjadi numerik menggunakan LabelEncoder.

In [None]:
# TRAIN / TEST SPLIT
# Stratify on y so both partitions follow the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)
print(f"\n Train: {X_train.shape}, Test: {X_test.shape}")


Membagi dataset menjadi data latih dan uji dengan rasio 80:20.
Stratifikasi digunakan agar proporsi kelas target pada data latih dan data uji tetap sama dengan proporsi pada dataset asli.

In [None]:
# PREPROCESSING & FEATURE ENGINEERING
# Columns with a numeric dtype go through the numeric branch; every other
# column is treated as categorical.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in numeric_cols]

# Numeric branch: median imputation followed by standardisation.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the current name
# first and fall back only for installations that predate the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical branch: most-frequent imputation followed by dense one-hot encoding.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder)
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, numeric_cols),
    ("cat", cat_transformer, cat_cols)
])


Mempersiapkan data sebelum modeling:

Imputasi nilai hilang (median untuk numerik, modus untuk kategorikal).

Scaling fitur numerik dengan StandardScaler.

Encoding fitur kategorikal dengan OneHotEncoder.
Semua digabung menggunakan ColumnTransformer.

In [None]:
# CLASSIFICATION MODELS USED
# Candidate estimators keyed by the name used in reports and output file names.
# Every estimator is seeded with RANDOM_STATE.
models = dict(
    LogisticRegression=LogisticRegression(
        max_iter=2000,
        random_state=RANDOM_STATE,
    ),
    RandomForest=RandomForestClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
    ),
    SVC=SVC(
        probability=True,
        gamma="scale",
        random_state=RANDOM_STATE,
    ),
    GradientBoosting=GradientBoostingClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
    ),
)


Menentukan empat algoritma klasifikasi yang akan digunakan untuk perbandingan performa:
Logistic Regression, Random Forest, Support Vector Machine (SVC), dan Gradient Boosting.

In [None]:
# MODEL TRAINING & EVALUATION
# Fit every candidate inside the same preprocessing -> feature-selection ->
# classifier pipeline, score it on the held-out test split, save its plots and
# collect one metrics record per model in `results`.
results = []

for name, clf in models.items():
    print(f"\n===Training {name} ===")

    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("selector", SelectKBest(score_func=f_classif, k=min(SELECT_K, X.shape[1]))),
        ("clf", clf)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Only a missing probability API is tolerated. The previous bare `except:`
    # also hid genuine failures (and KeyboardInterrupt) behind a silent NaN AUC.
    if hasattr(pipeline, "predict_proba"):
        y_score = pipeline.predict_proba(X_test)[:, 1]
    else:
        y_score = None

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 is applied to all three ratio metrics for consistency.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_score) if y_score is not None else np.nan
    cm = confusion_matrix(y_test, y_pred)

    print(f"Akurasi: {acc:.3f}, Presisi: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}, ROC-AUC: {roc:.3f}")
    print("\nConfusion Matrix:\n", cm)

    plot_and_save_confusion_matrix(cm, ["Negatif","Positif"], f"{name} - CM", OUTPUT_DIR / f"cm_{name}.png")
    if y_score is not None:
        plot_and_save_roc(y_test, y_score, f"{name} - ROC", OUTPUT_DIR / f"roc_{name}.png")
    # The learning curve is computed on the training split only.
    plot_and_save_learning_curve(pipeline, X_train, y_train, f"{name} - Learning Curve", OUTPUT_DIR / f"lc_{name}.png")

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "ROC-AUC": roc
    })


Melatih tiap model dalam pipeline penuh (preprocessing → seleksi fitur → model).
Setelah dilatih, dilakukan evaluasi menggunakan: Accuracy, Precision, Recall, F1-score, dan ROC-AUC.
Hasilnya divisualisasikan dalam bentuk Confusion Matrix, ROC Curve, dan Learning Curve.

In [None]:
# MODEL RESULT COMPARISON
# Build the per-model metrics table and rank it by F1, best first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1", ascending=False)
print("\n Ringkasan Hasil Model:")
print(results_df)

# Persist the ranked table next to the other outputs.
summary_csv = OUTPUT_DIR / "model_summary.csv"
results_df.to_csv(summary_csv, index=False)
print("\n Disimpan ke:", summary_csv)


Membuat tabel perbandingan performa keempat model dan menyimpannya ke file model_summary.csv.

In [None]:
# CROSS-VALIDATION TO CHECK MODEL STABILITY
# Shuffled, seeded stratified folds; each model is scored on accuracy over the
# full dataset (X, y) with the same three-step pipeline used for training.
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
cv_scores = []

for name, clf in models.items():
    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("selector", SelectKBest(score_func=f_classif, k=min(SELECT_K, X.shape[1]))),
            ("clf", clf),
        ]
    )
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring="accuracy")
    # One summary record per model: mean and spread of the fold accuracies.
    cv_scores.append(
        {
            "Model": name,
            "CV Mean": scores.mean(),
            "CV Std": scores.std(),
        }
    )

cv_df = pd.DataFrame(cv_scores)
cv_df.to_csv(OUTPUT_DIR / "cv_summary.csv", index=False)
print("\n Cross Validation Summary:\n", cv_df)


Melakukan validasi silang (StratifiedKFold) untuk menilai kestabilan akurasi model di berbagai subset data.
Hasilnya disimpan di cv_summary.csv.