In [3]:
# 0. Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [7]:
# 1. Load dataset
df = pd.read_csv('C:\Users\user\Documents\GitHub\Klasifikasi-Risiko-Penyakit-Jantung-Berdasarkan-Faktor-Kesehatan-Pasien\data\heart.xls')
display(df.shape)
display(df.head())


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1032284168.py, line 2)

In [None]:
# 2. Quick info
display(df.info())
display(df.describe(include='all'))

In [None]:
# 3. Cek missing values
print("Missing values per column:\n", df.isnull().sum())

In [None]:
# 4. Visualisasi distribusi target
plt.figure(figsize=(6,4))
sns.countplot(x='target', data=df)
plt.title('Distribusi Variabel Target (0 = Tidak, 1 = Berisiko)')
plt.xlabel('target')
plt.ylabel('count')
plt.show()

In [None]:
# 5. Exploratory plots: korelasi, beberapa fitur vs target
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation matrix')
plt.show()

# Contoh hubungan fitur numerik dengan target
num_features = ['age','trestbps','chol','thalach']
for f in num_features:
    plt.figure(figsize=(6,3))
    sns.boxplot(x='target', y=f, data=df)
    plt.title(f'{f} vs target')
    plt.show()

In [None]:
# 6. Pra-pemrosesan
# a) Tentukan fitur numerik dan kategorikal (sesuaikan dataset)
# contoh umum pada dataset heart:
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak'] if 'oldpeak' in df.columns else ['age','trestbps','chol','thalach']
cat_cols = [c for c in df.columns if c not in num_cols + ['target']]

print("Numerical:", num_cols)
print("Categorical:", cat_cols)

# b) Tangani missing (jika ada)
# Strategie sederhana: numeric -> median, categorical -> mode
for c in num_cols:
    if df[c].isnull().sum()>0:
        df[c].fillna(df[c].median(), inplace=True)
for c in cat_cols:
    if df[c].isnull().sum()>0:
        df[c].fillna(df[c].mode()[0], inplace=True)

# c) Encoding: gunakan One-Hot untuk kategorikal nominal
X = df.drop(columns=['target'])
y = df['target']

# Simpel pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_cols)
])

# d) Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fit preprocessor
preprocessor.fit(X_train)
X_train_p = pd.DataFrame(preprocessor.transform(X_train))
X_test_p = pd.DataFrame(preprocessor.transform(X_test))

print("Transformed shapes:", X_train_p.shape, X_test_p.shape)

In [None]:
# 7. Model A: Decision Tree tanpa batas (overfit possible)
modelA = DecisionTreeClassifier(random_state=42)
modelA.fit(X_train_p, y_train)

# Evaluasi Model A
y_train_predA = modelA.predict(X_train_p)
y_test_predA = modelA.predict(X_test_p)

print("=== Model A (no restrictions) ===")
print("Train accuracy:", accuracy_score(y_train, y_train_predA))
print("Test accuracy:", accuracy_score(y_test, y_test_predA))
print("\nClassification report (test):\n", classification_report(y_test, y_test_predA))
print("Confusion matrix (test):\n", confusion_matrix(y_test, y_test_predA))

In [None]:
# 8. Model B: pruned model (batasi depth)
modelB = DecisionTreeClassifier(max_depth=4, min_samples_leaf=8, random_state=42)
modelB.fit(X_train_p, y_train)

# Evaluasi Model B
y_train_predB = modelB.predict(X_train_p)
y_test_predB = modelB.predict(X_test_p)

print("\n=== Model B (pruned) ===")
print("Train accuracy:", accuracy_score(y_train, y_train_predB))
print("Test accuracy:", accuracy_score(y_test, y_test_predB))
print("\nClassification report (test):\n", classification_report(y_test, y_test_predB))
print("Confusion matrix (test):\n", confusion_matrix(y_test, y_test_predB))


In [None]:
# 9. Visualisasi pohon Model B
plt.figure(figsize=(20,10))
plot_tree(modelB, filled=True, feature_names=list(preprocessor.get_feature_names_out()), class_names=['No','Yes'], fontsize=10)
plt.show()

In [None]:
# 10. Feature importances (Model B)
importances = modelB.feature_importances_
feat_names = list(preprocessor.get_feature_names_out())
feat_df = pd.DataFrame({'feature': feat_names, 'importance': importances})
feat_df = feat_df.sort_values('importance', ascending=False).head(15)

plt.figure(figsize=(8,6))
sns.barplot(x='importance', y='feature', data=feat_df)
plt.title('Top 15 Feature Importances (Model B)')
plt.show()

In [None]:
# 11. Simpan model terbaik (ambil Model B)
joblib.dump({
    'preprocessor': preprocessor,
    'model': modelB
}, 'dt_heart_pipeline.joblib')

print("Model saved as dt_heart_pipeline.joblib")