In [1]:
!pip install matplotlib pandas scikit-learn seaborn graphviz

Collecting graphviz
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.20.3


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image, display
from graphviz import Source


In [None]:
# Load the dataset
# The column names are supplied explicitly via `names=`, and "?" entries in
# the file are parsed as missing values (NaN).
url = "data/processed.cleveland.data"
columns = [
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "target",
]

df = pd.read_csv(url, names=columns, na_values="?")

# Drop rows with missing values.
# dropna() returns a new frame instead of modifying df in place, so the result
# has to be assigned back -- otherwise the incomplete rows stay in the data.
# The explicit copy keeps the column assignments below on an independent frame.
df = df.dropna().copy()
display(df)

# Binarize the target: every value greater than 0 becomes 1, everything else 0,
# then attach a human-readable label column for plotting.
df["target"] = df["target"].gt(0).astype(int)
df["target_label"] = df["target"].map({0: "No Heart Disease", 1: "Heart Disease"})

In [None]:
# Separate the prediction target from the model inputs.
# `label` is the binary target column; `feature` is every remaining column
# except the two target-derived ones.
label = df["target"]
feature = df.drop(columns=["target", "target_label"])

In [None]:
# Train-test split
# Display order of the classes on the x-axis of every count plot.
order = ["Heart Disease", "No Heart Disease"]
# Readable names for the numeric target values (used for plotting only).
label_names = {0: "No Heart Disease", 1: "Heart Disease"}


def _plot_label_distribution(labels, title, order):
    """Show a count plot of the class labels.

    Parameters
    ----------
    labels : pandas.Series
        Readable class labels, one entry per sample.
    title : str
        Title drawn above the plot.
    order : list of str
        Order in which the classes appear on the x-axis.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=labels, order=order, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Target")
    ax.set_ylabel("Count")
    plt.show()
    # Release the figure explicitly; several figures are created in the loop below.
    plt.close(fig)


_plot_label_distribution(df["target_label"], "Original dataset distribution", order)

splits = [0.4, 0.6, 0.8, 0.9]
# Maps each train ratio to its train/test features and labels so that later
# cells can reuse exactly the same partitions.
split_results = {}

for train_ratio in splits:
    # Stratify on the label so both partitions keep the class proportions.
    feature_train, feature_test, label_train, label_test = train_test_split(
        feature, label, train_size=train_ratio, random_state=42, stratify=label, shuffle=True
    )

    split_results[train_ratio] = {
        "feature_train": feature_train,
        "feature_test": feature_test,
        "label_train": label_train,
        "label_test": label_test,
    }

    # Convert labels to readable format for visualization
    label_train_named = label_train.map(label_names)
    label_test_named = label_test.map(label_names)

    # round() rather than int(): int() truncates, so a product that lands just
    # below a whole number because of float error would be reported 1% too low.
    train_pct = round(train_ratio * 100)

    # Visualize training and testing set distribution
    _plot_label_distribution(
        label_train_named,
        f"Training set distribution (Train ratio = {train_pct}%)",
        order,
    )
    _plot_label_distribution(
        label_test_named,
        f"Testing set distribution (Test ratio = {100 - train_pct}%)",
        order,
    )

In [None]:
# Decision Tree Classifier
# For every stored train/test split: fit an entropy-based decision tree,
# report accuracy / classification report / confusion matrix, and render the
# fitted tree to a PNG file.

# The output directory is the same for every split, so create it once up front
# instead of on each loop iteration.
output_dir = "DecisionTree/HeartDisease"
os.makedirs(output_dir, exist_ok=True)

for train_ratio, data in split_results.items():
    feature_train = data["feature_train"]
    label_train = data["label_train"]
    feature_test = data["feature_test"]
    label_test = data["label_test"]

    # round() rather than int(): int() truncates, so a product that lands just
    # below a whole number because of float error would yield the wrong percent.
    train_pct = round(train_ratio * 100)

    # Create and train the model
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
    clf.fit(feature_train, label_train)

    # Make predictions
    label_pred = clf.predict(feature_test)

    # Evaluation
    ## Accuracy
    accuracy = accuracy_score(label_test, label_pred)
    print(f"Accuracy (train_ratio={train_ratio}): {accuracy:.2f}")

    ## Classification report
    report = classification_report(label_test, label_pred)
    print(report)

    ## Confusion matrix
    cm = confusion_matrix(label_test, label_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix (train_ratio={train_ratio})")
    plt.show()

    # Visualize the decision tree
    ## Save the decision tree as a PNG file
    file_name = f"dtree_{train_pct}"
    file_path = os.path.join(output_dir, file_name)

    ## Title for the graph
    title = f"Heart Disease (Train ratio = {train_pct}%)"

    ## Create the decision tree graph
    dot_data = export_graphviz(
        clf,
        out_file=None,
        # Pass a plain list taken from the frame the model was fitted on.
        # NOTE(review): a pandas Index appears to be rejected by parameter
        # validation in some scikit-learn releases -- a list is the safe form.
        feature_names=list(feature_train.columns),
        # Listed in ascending order of the numeric classes (0, then 1).
        class_names=["No Heart Disease", "Heart Disease"],
        filled=True,
        rounded=True,
        special_characters=True,
    )

    ## Add title to the graph
    # Only the opening "digraph Tree {" header is rewritten (count=1).
    dot_data_with_title = dot_data.replace(
        "digraph Tree {",
        f'digraph Tree {{\ngraph [label="{title}", labelloc=top, fontsize=20];',
        1,
    )

    ## Render the graph
    # cleanup=True removes the intermediate DOT source once the PNG is written.
    graph = Source(dot_data_with_title)
    graph.render(file_path, format="png", cleanup=True)
    display(Image(filename=f"{file_path}.png"))

In [None]:
# Analyze accuracy vs max_depth
# Reuse the stored 80% training split so every depth is evaluated on the
# same train/test partition.
split_80 = split_results[0.8]
feature_train, feature_test = split_80["feature_train"], split_80["feature_test"]
label_train, label_test = split_80["label_train"], split_80["label_test"]

# None leaves the tree depth unrestricted.
max_depths = [None, 2, 3, 4, 5, 6, 7]
depth_results = []

for max_depth in max_depths:
    model = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=42)
    model.fit(feature_train, label_train)

    prediction = model.predict(feature_test)
    accuracy = accuracy_score(label_test, prediction)
    # Depth is stored as text so that None and the integers share one column type.
    depth_results.append(dict(max_depth=str(max_depth), accuracy=accuracy))

# Plot accuracy vs max_depth
depth_df = pd.DataFrame(depth_results)
depth_df["max_depth_str"] = depth_df["max_depth"].astype(str)

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=depth_df, x="max_depth_str", y="accuracy", marker="o", ax=ax)
ax.set_title("Decision Tree Accuracy vs Max Depth")
ax.set_xlabel("Max Depth")
ax.set_ylabel("Accuracy")
ax.grid()
plt.show()