In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import graphviz
import os


In [None]:
# Fetch the Heart Disease dataset from the UCI Machine Learning Repository
# (id=45 is the Heart Disease dataset).
heart_disease = fetch_ucirepo(id=45)

# Feature matrix and raw target column.
# NOTE(review): the feature frame may contain missing values — confirm that the
# installed scikit-learn version accepts NaN in DecisionTreeClassifier, or
# impute/drop before fitting.
X = heart_disease.data.features
y = heart_disease.data.targets['num']  # "num": 0 = no disease, >0 = disease

# Collapse to a binary label: 0 (no disease) vs 1 (disease).
# A vectorized comparison replaces the per-element Python lambda.
y = (y > 0).astype(int)

# Sanity check: feature matrix shape and class balance
print(X.shape)
print(y.value_counts())

In [None]:
# (train fraction, test fraction) pairs to evaluate
ratios = [(0.4, 0.6), (0.6, 0.4), (0.8, 0.2), (0.9, 0.1)]

# One entry per ratio: (train_ratio, X_train, X_test, y_train, y_test)
split_datasets = []

for train_ratio, test_ratio in ratios:
    # Stratified, shuffled split with a fixed seed so every run is reproducible
    parts = train_test_split(
        X, y, test_size=test_ratio, shuffle=True, stratify=y, random_state=42
    )
    X_train, X_test, y_train, y_test = parts
    split_datasets.append((train_ratio, *parts))


In [None]:
# Class ids and their display names, passed explicitly so report rows and
# confusion-matrix axes always refer to the same classes in the same order.
class_ids = [0, 1]
class_names = ['No Disease', 'Disease']

for ratio, X_train, X_test, y_train, y_test in split_datasets:
    # Percentage labels reused by every title and header below
    train_pct = round(ratio * 100)
    test_pct = round((1 - ratio) * 100)

    # Plot the class distribution of the train and test parts side by side
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    sns.countplot(x=y_train, ax=axes[0])
    axes[0].set_title(f"Train set ({train_pct}%)")

    sns.countplot(x=y_test, ax=axes[1])
    axes[1].set_title(f"Test set ({test_pct}%)")

    fig.suptitle(f"Class distribution for {train_pct}/{test_pct} split", fontsize=14)
    fig.tight_layout()
    plt.show()

    # Fit an entropy-based tree (fixed seed) and evaluate it on the test part
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(f"\n=== Split ratio {train_pct}/{test_pct} ===")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

    # Confusion matrix for the same predictions
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

    fig, ax = plt.subplots(figsize=(5, 4))
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'Confusion Matrix (Depth={clf.get_depth()}, {train_pct}/{test_pct} Split)')
    plt.show()

In [None]:
def draw_decision_tree(clf, feature_names, filename, class_names=("No Disease", "Has Disease")):
    """Export a fitted decision tree to Graphviz and render it as a PNG file.

    Parameters
    ----------
    clf : fitted tree estimator accepted by ``export_graphviz``.
    feature_names : sequence of str or None
        Names shown on the split nodes; any iterable (e.g. a pandas Index)
        is accepted.
    filename : str
        Output path passed to ``graphviz.Source.render`` together with
        ``format='png'``.
    class_names : sequence of str, optional
        Names shown for the predicted classes; defaults to the labels this
        notebook has always used.

    Returns
    -------
    graphviz.Source
        The rendered graph object.
    """
    # Normalize to plain lists: a pandas Index such as `X.columns` is not a
    # list. NOTE(review): some scikit-learn releases may validate these
    # arguments strictly as lists — confirm against the installed version.
    if feature_names is not None:
        feature_names = list(feature_names)
    if class_names is not None:
        class_names = list(class_names)

    dot_data = export_graphviz(
        clf,
        out_file=None,  # return the DOT source as a string instead of writing it
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        rounded=True,
        special_characters=True
    )
    graph = graphviz.Source(dot_data)
    # cleanup=True is passed so only the rendered image is meant to remain
    graph.render(filename, format='png', cleanup=True)
    return graph

# Train one tree per split ratio, render its diagram, then report metrics
for ratio, X_train, X_test, y_train, y_test in split_datasets:
    train_pct, test_pct = round(ratio*100), round((1-ratio)*100)

    # Entropy-based tree with a fixed seed
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42).fit(X_train, y_train)

    # Draw the fitted tree
    filename = f"tree_{train_pct}_{test_pct}"
    draw_decision_tree(clf, X.columns, filename)

    # Evaluate on the test part of this split
    y_pred = clf.predict(X_test)
    print(f"\n=== Split {train_pct}/{test_pct} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


In [None]:
max_depths = [None, 2, 3, 4, 5, 6, 7]

# Pick the 80/20 split from split_datasets. Fail loudly if it is missing:
# a silent fallback to another split would make the plot title below wrong.
split_80_20 = next(
    (split for split in split_datasets if round(split[0], 1) == 0.8),
    None,
)
if split_80_20 is None:
    raise ValueError("No 80/20 split found in split_datasets")
ratio, X_train, X_test, y_train, y_test = split_80_20

# Test accuracy for each depth limit (None = no limit)
accuracies = []
for depth in max_depths:
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=depth, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

# Plot accuracy against depth; depths are shown as categorical labels so that
# 'None' can sit on the same axis as the numeric limits.
depth_labels = ['None' if d is None else str(d) for d in max_depths]
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(depth_labels, accuracies, marker='o')
ax.set_xlabel("Max Depth")
ax.set_ylabel("Accuracy")
ax.set_title("Decision Tree Accuracy vs Max Depth (80/20 Split)")
ax.grid(True)
plt.show()