<div align="center">

# 🌳 Project 2: Decision Tree Classifier


## 👥 Thành viên nhóm

| 🆔 MSSV    | 👨‍🎓 Họ và Tên           |
|-----------|--------------------------|
| 22120194  | Nguyễn Nhật Long         |
| 22120197  | Nguyễn Vĩnh Lương        |
| 22120238  | Nguyễn Minh Nguyên       |
| 22120252  | Giang Đức Nhật           |

## ♥️ Dataset 01: Heart Disease
</div>

## **0. Chuẩn bị thư viện**

### **0.1 Cài các packages cần thiết**

In [None]:
%pip install -q matplotlib
%pip install -q scikit-learn
%pip install -q pandas
%pip install -q numpy
%pip install -q graphviz
%pip install -q ucimlrepo
%pip install -q seaborn
%pip install -q category_encoders

### **0.2 Import các thư viện cần thiết**

In [52]:
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from ucimlrepo import fetch_ucirepo
from graphviz import Source
from IPython.display import display, Image
import seaborn as sns
import os
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

### **0.3 Hàm tiền xử lý**

In [53]:
def preprocess_data(dataset, nominal_cols=None, ordinal_cols=None, numerical_cols=None):
    """Deduplicate, impute and encode the full table of a fetched dataset.

    Args:
        dataset: Object exposing ``data.original`` (a DataFrame holding the
            features and the target) and ``variables`` (a DataFrame with at
            least the columns ``name`` and ``missing_values``).
        nominal_cols: Column names to impute with the most frequent value and
            then one-hot encode. ``None`` or empty skips this step.
        ordinal_cols: Column names to impute with the most frequent value and
            then ordinal-encode. ``None`` or empty skips this step.
        numerical_cols: Column names to impute with the median.

    Returns:
        A new DataFrame. Columns not listed in any group (e.g. the target) are
        passed through unchanged; one-hot columns are appended at the end.
        The DataFrame stored in ``dataset`` is not modified.
    """
    # Work on a de-duplicated copy so the caller's dataset is left untouched
    # (the original row index is kept, not reset).
    df = dataset.data.original.drop_duplicates().copy()
    var_info = dataset.variables  # name, role, type, demographic, description, units, missing_values

    nominal_cols = list(nominal_cols) if nominal_cols else []
    ordinal_cols = list(ordinal_cols) if ordinal_cols else []
    # Nominal and ordinal columns share the same imputation rule (most frequent value).
    categorical = set(nominal_cols) | set(ordinal_cols)
    numerical = set(numerical_cols or ())

    for col in df.columns:
        # Only impute columns that the variable table flags as having missing values
        var_row = var_info[var_info['name'] == col]
        if var_row.empty or var_row.iloc[0]['missing_values'] != 'yes':
            continue

        if col in categorical:
            modes = df[col].mode(dropna=True)
            if modes.empty:
                # Column is entirely missing: there is no value to impute with
                continue
            fill_value = modes.iloc[0]
        elif col in numerical:
            fill_value = df[col].median(skipna=True)
            if pd.isna(fill_value):
                continue
        else:
            continue

        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary under pandas Copy-on-Write.
        df[col] = df[col].fillna(fill_value)

    # One-hot encoding for nominal columns
    if nominal_cols:
        ohe = OneHotEncoder(handle_unknown='ignore')
        ohe_arr = ohe.fit_transform(df[nominal_cols]).toarray()
        ohe_df = pd.DataFrame(
            ohe_arr,
            columns=ohe.get_feature_names_out(nominal_cols),
            index=df.index,  # keep rows aligned after the duplicate drop
        )
        df = pd.concat([df.drop(columns=nominal_cols), ohe_df], axis=1)

    # Ordinal encoding for ordinal columns
    if ordinal_cols:
        ord_enc = OrdinalEncoder()
        df[ordinal_cols] = ord_enc.fit_transform(df[ordinal_cols])

    return df


## **1. Binary class dataset: The UCI Heart Disease dataset**

### **1.1. Chuẩn bị Dataset**

#### *1.1.1 Tiền xử lý dữ liệu*

In [None]:
# Download the UCI dataset with repository id 45
heart_disease = fetch_ucirepo(id=45)

# The metadata may give the target as a one-element list/tuple; unwrap it to a plain name
target_col_name = heart_disease.metadata.target_col
if isinstance(target_col_name, (list, tuple)) and len(target_col_name) == 1:
    (target_col_name,) = target_col_name

# Column groups that drive imputation and encoding inside preprocess_data
nominal_cols = ['sex', 'fbs', 'exang', 'thal']
ordinal_cols = ['cp', 'restecg', 'slope']
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

heart_disease_processed = preprocess_data(
    heart_disease,
    nominal_cols=nominal_cols,
    ordinal_cols=ordinal_cols,
    numerical_cols=numerical_cols,
)

# Everything except the target column is a feature
features = heart_disease_processed.drop(columns=target_col_name)

# Collapse the target to binary: 0 stays 0, every other value becomes 1
labels = (heart_disease_processed[target_col_name] != 0).astype('int64')

print(features)

print(labels)



#### *1.1.2 Chuẩn bị train và test dataset*

In [55]:
# (train fraction, test fraction) pairs to evaluate
split_ratios = [(0.4, 0.6), (0.6, 0.4), (0.8, 0.2), (0.9, 0.1)]

# One dict of train/test features and labels per ratio, in the same order as split_ratios
subsets = []

for _train_fraction, test_fraction in split_ratios:
    # Stratified split with a fixed seed so every run yields the same partitions
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_fraction,
        random_state=45,
        stratify=labels,
    )
    subsets.append(
        dict(
            feature_train=x_train,
            label_train=y_train,
            feature_test=x_test,
            label_test=y_test,
        )
    )


#### *1.1.3 Visualization*

##### a. Phân phối của dataset gốc

In [None]:
# Make sure the output folder for distribution plots exists
os.makedirs("./Dataset 01/Distribution", exist_ok=True)

# One bar per class: bin edges sit at -0.5, 0.5, ... so bars are centred on 0, 1, ...
unique_classes = np.unique(labels)
tick_positions = np.arange(len(unique_classes))
bins = np.arange(len(unique_classes) + 1) - 0.5

# Histogram of the class labels over the whole dataset
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(labels, bins=bins, color="green", alpha=0.7, edgecolor="black")
ax.set_title("Class Distribution for Original Dataset")
ax.set_xlabel("Classes")
ax.set_ylabel("Frequency")
ax.set_xticks(tick_positions)
ax.set_xticklabels(unique_classes)

# Write the figure to disk before showing it
fig.savefig("./Dataset 01/Distribution/Original Distribution.png", format='png', dpi=300)
plt.show()

##### b. Phân phối của tập train/test theo mỗi tỉ lệ chia

In [None]:
def _plot_class_histogram(ax, split_labels, bins, class_labels, title, color):
    """Draw one class-frequency histogram on `ax` and print each bar's count above it."""
    ax.hist(split_labels, bins=bins, color=color, alpha=0.7, edgecolor="black")
    ax.set_title(title)
    ax.set_xticks(class_labels)
    ax.set_xlabel("Classes")
    ax.set_ylabel("Frequency")

    counts, _ = np.histogram(split_labels, bins=bins)
    tallest = counts.max() if counts.size else 0

    # Leave 15% headroom for the count labels; skip when there is nothing to
    # plot, because set_ylim(0, 0) would produce a degenerate axis.
    if tallest > 0:
        ax.set_ylim(0, tallest * 1.15)

    for x_position, count in zip(class_labels, counts):
        if count > 0:
            ax.text(x_position, count + tallest * 0.02,
                    str(count), ha='center', va='bottom', fontsize=10)


# One row per train/test split, train on the left and test on the right.
# squeeze=False keeps `axes` two-dimensional even when there is a single split.
n_splits = len(subsets)
fig, axes = plt.subplots(nrows=n_splits, ncols=2, figsize=(14, 4 * n_splits), squeeze=False)
fig.suptitle("Class Distribution for Each Train/Test Split", fontsize=16)

# Bin edges at -0.5, 0.5, ... so that each bar is centred on its class label
class_labels = np.unique(labels)
bins = np.arange(len(class_labels) + 1) - 0.5

for row, (subset, (train_fraction, test_fraction)) in enumerate(zip(subsets, split_ratios)):
    # round() rather than int(): a product such as 0.29 * 100 is slightly below
    # the intended integer and int() would truncate it.
    train_ratio = round(train_fraction * 100)
    test_ratio = round(test_fraction * 100)

    _plot_class_histogram(axes[row, 0], subset['label_train'], bins, class_labels,
                          f"Train {train_ratio}%", "red")
    _plot_class_histogram(axes[row, 1], subset['label_test'], bins, class_labels,
                          f"Test {test_ratio}%", "orange")

# Reserve the top 4% of the figure for the suptitle so it does not overlap the panels
plt.tight_layout(rect=[0, 0, 1, 0.96])

# Save the combined plot
plt.savefig("./Dataset 01/Distribution/Test and Train Distribution.png", format='png', dpi=300, bbox_inches='tight')
plt.show()

### **1.2. Xây dựng Decision Tree**

#### 1.2.1 Train model

In [58]:
# Fit one entropy-criterion (information gain) decision tree per train/test split.
# `models[k]` is trained on `subsets[k]`; the seed is fixed for reproducible trees.
models = [
    DecisionTreeClassifier(criterion='entropy', random_state=45).fit(
        subset['feature_train'], subset['label_train']
    )
    for subset in subsets
]

#### 1.2.2 Visualization

In [None]:
# Make sure the output folder for the rendered trees exists
os.makedirs("./Dataset 01/Decision Tree", exist_ok=True)

# The feature names are the same for every model
feature_names = features.columns.tolist()

# Render, save and display the tree fitted on each split
for i, (model, (train_fraction, test_fraction)) in enumerate(zip(models, split_ratios)):
    ratio_name = f"{int(train_fraction*100)}_{int(test_fraction*100)}"

    print(f"Decision tree trained with a train/test split of {ratio_name}")

    # Export the fitted tree as Graphviz DOT source
    dot_data = export_graphviz(
        model,
        out_file=None,
        feature_names=feature_names,
        class_names=['No Disease', 'Has Disease'],
        filled=True,
        rounded=True,
        special_characters=True
    )

    # Render to PNG (cleanup=True removes the intermediate DOT file) and show it inline
    file_path = f"./Dataset 01/Decision Tree/Tree_{ratio_name}"
    graph = Source(dot_data)
    graph.render(file_path, format='png', cleanup=True)
    display(Image(file_path + ".png"))

    # Separator line between trees, but not after the last one
    if i != len(subsets) - 1:
        print('-' * 100)

### **1.3. Đánh giá Decision Tree**

In [None]:
# Make sure the output folder for reports and confusion matrices exists
os.makedirs("./Dataset 01/Evaluation", exist_ok=True)

# Pin the label order explicitly so the report rows and the confusion-matrix
# axes always line up with `class_names`, even if a test split or the
# predictions happen to contain only one of the two classes.
class_ids = [0, 1]
class_names = ['No Disease', 'Has Disease']

# For each model and its split: predict, write a classification report, plot a confusion matrix
for i, (model, subset, (train_fraction, test_fraction)) in enumerate(
        zip(models, subsets, split_ratios)):
    label_test = subset['label_test']

    # Predict on the held-out part of this split
    label_pred = model.predict(subset['feature_test'])

    ratio_name = f"{int(train_fraction*100)}_{int(test_fraction*100)}"

    print(f"\nClassification Report for {ratio_name} Split:")
    report_text = classification_report(
        label_test, label_pred, labels=class_ids, target_names=class_names
    )
    print(report_text)

    # Save the report next to the plots; explicit encoding keeps the file platform-independent
    with open(f"./Dataset 01/Evaluation/Report_{ratio_name}.txt", "w", encoding="utf-8") as f:
        f.write(report_text)

    # Rows are true labels, columns are predicted labels
    cm = confusion_matrix(label_test, label_pred, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f"Confusion Matrix for {ratio_name} Split")
    plt.xlabel('Predicted label')
    plt.ylabel('True label')

    # Save the confusion matrix plot before showing it
    plt.savefig(f"./Dataset 01/Evaluation/Confusion_{ratio_name}.png", format='png', dpi=300, bbox_inches='tight')
    plt.show()

    # Separator line between splits, but not after the last one
    if i != len(subsets) - 1:
        print('-' * 100)

### **1.4. Độ sâu (depth) và độ chính xác (accuracy) của Decision Tree**

In [None]:
# Make sure the output folder for the depth-limited trees and the accuracy chart exists
os.makedirs("./Dataset 01/Depth and Accuracy", exist_ok=True)

# Third entry of `subsets` corresponds to the (0.8, 0.2) ratio in `split_ratios`
subset_80_20 = subsets[2]
feature_train_80_20 = subset_80_20['feature_train']
label_train_80_20 = subset_80_20['label_train']
feature_test_80_20 = subset_80_20['feature_test']
label_test_80_20 = subset_80_20['label_test']

# The feature names do not depend on the depth, so compute them once
feature_names = features.columns.astype(str).tolist()

accuracy_scores = []
depths = [None, 2, 3, 4, 5, 6, 7]  # None = no depth limit

for depth in depths:
    # NOTE(review): random_state is 42 here while the other cells of this
    # notebook (split and first training) use 45, so the unlimited-depth tree
    # may differ from the 80/20 tree trained earlier. Left unchanged so the
    # reported numbers stay the same - confirm which seed is intended.
    model = DecisionTreeClassifier(criterion='entropy', random_state=42, max_depth=depth)
    model.fit(feature_train_80_20, label_train_80_20)
    print(f"Decision tree trained with an 80/20 split and max depth of {depth}")

    # Export the fitted tree as Graphviz DOT source
    dot_data = export_graphviz(
        model,
        out_file=None,
        feature_names=feature_names,
        class_names=['No Disease', 'Has Disease'],
        filled=True,
        rounded=True,
        special_characters=True
    )

    # Render to PNG (cleanup=True removes the intermediate DOT file) and show it inline
    graph = Source(dot_data)
    file_path = f"./Dataset 01/Depth and Accuracy/Tree_Depth_{depth}"
    graph.render(file_path, format='png', cleanup=True)
    display(Image(file_path + ".png"))

    # Accuracy on the held-out 20%
    pred = model.predict(feature_test_80_20)
    accuracy = accuracy_score(label_test_80_20, pred)
    print(f"Accuracy for max depth {depth}: {accuracy}")
    accuracy_scores.append(accuracy)
    print('-' * 100)


# Plot accuracy against depth; depths become strings so 'None' can sit on the same categorical axis
depths_for_plot = [str(d) for d in depths]
plt.figure(figsize=(10, 5))
plt.plot(depths_for_plot, accuracy_scores, marker='o', color='blue')
plt.title('Accuracy vs Max Depth for Decision Tree')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')

y_min = min(accuracy_scores)
y_max = max(accuracy_scores)
padding = (y_max - y_min) * 0.1  # 10% of the value range
if padding == 0:
    # All accuracies are equal: use a small fixed margin so the y-limits are not identical
    padding = 0.01
plt.ylim(y_min - padding, y_max + padding)
plt.xticks(depths_for_plot)
plt.grid(True)

# Print each accuracy just above its marker
for x, y in zip(depths_for_plot, accuracy_scores):
    plt.text(x, y + 0.003, f"{y:.3f}", ha='center', va='bottom')

plt.savefig("./Dataset 01/Depth and Accuracy/Chart Statistics.png", format='png', dpi=300)
plt.show()

# One-row summary table: a column per depth, accuracy formatted to three decimals
table_data = {
    depth_label: [f"{acc:.3f}"]
    for depth_label, acc in zip(depths_for_plot, accuracy_scores)
}

df = pd.DataFrame(table_data, index=["Accuracy"])
display(df)