In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


In [2]:
def load_data(file_path):
    """
    Load the dataset from the provided file path.

    Args:
        file_path (str): The path to the CSV file containing the dataset.
    
    Returns:
        pd.DataFrame: Loaded dataset as a pandas DataFrame.
    """
    return pd.read_csv(file_path)

# Example usage:
file_path = '/Users/sinan/Library/Mobile Documents/com~apple~CloudDocs/ML_PROJECTS/Python Programming Toolsets for Coding Decision Tree ML Models/heart_disease_uci.csv'
data = load_data(file_path)

# View the first few rows to understand the data structure
data.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [3]:
def preprocess_data(data):
    """
    Prepare the dataset for model training by applying one-hot encoding to categorical features.

    Args:
        data (pd.DataFrame): The dataset containing features and target.
    
    Returns:
        X (pd.DataFrame): The feature set with categorical variables encoded.
        y (pd.Series): The target variable.
    """
    # Features (drop the 'num' column as it is the target)
    X = data.drop('num', axis=1)
    
    # Target ('num' column is the target)
    y = data['num']
    
    # Apply one-hot encoding to categorical variables
    X_encoded = pd.get_dummies(X, drop_first=True)
    
    return X_encoded, y

# Example usage:
X, y = preprocess_data(data)

# Display the processed feature set
X.head()


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,sex_Male,dataset_Hungary,dataset_Switzerland,...,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,1,63,145.0,233.0,150.0,2.3,0.0,True,False,False,...,False,True,True,False,False,False,False,False,False,False
1,2,67,160.0,286.0,108.0,1.5,3.0,True,False,False,...,False,False,False,False,False,True,True,False,True,False
2,3,67,120.0,229.0,129.0,2.6,2.0,True,False,False,...,False,False,False,False,False,True,True,False,False,True
3,4,37,130.0,250.0,187.0,3.5,0.0,True,False,False,...,True,False,False,True,False,False,False,False,True,False
4,5,41,130.0,204.0,172.0,1.4,0.0,False,False,False,...,False,False,False,False,False,False,False,True,True,False


In [4]:
def split_data(X, y):
    """
    Split the dataset into training and testing sets.

    Args:
        X (pd.DataFrame): The feature set.
        y (pd.Series): The target variable.
    
    Returns:
        X_train, X_test, y_train, y_test: The training and testing datasets.
    """
    return train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Example usage:
X_train, X_test, y_train, y_test = split_data(X, y)


In [5]:
def train_decision_tree(X_train, y_train):
    """
    Initialize and train the Decision Tree Classifier on the training data.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
    
    Returns:
        DecisionTreeClassifier: The trained decision tree model.
    """
    # Initialize Decision Tree Classifier
    tree = DecisionTreeClassifier(random_state=42)
    
    # Train the model using the training dataset
    tree.fit(X_train, y_train)
    
    return tree

# Example usage:
tree = train_decision_tree(X_train, y_train)


In [6]:
def evaluate_model(tree, X_train, y_train, X_test, y_test):
    """
    Evaluate the accuracy of the decision tree model on both training and testing datasets.

    Args:
        tree (DecisionTreeClassifier): The trained decision tree model.
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test target.
    
    Returns:
        None
    """
    # Accuracy on training set
    train_accuracy = tree.score(X_train, y_train)
    print(f'Accuracy on training set: {train_accuracy:.3f}')
    
    # Accuracy on test set
    test_accuracy = tree.score(X_test, y_test)
    print(f'Accuracy on test set: {test_accuracy:.3f}')

# Example usage:
evaluate_model(tree, X_train, y_train, X_test, y_test)


Accuracy on training set: 1.000
Accuracy on test set: 0.580


In [None]:
def visualize_tree(tree, X_train):
    """
    Visualize the trained decision tree using matplotlib.

    Args:
        tree (DecisionTreeClassifier): The trained decision tree model.
        X_train (pd.DataFrame): The training feature set.
    
    Returns:
        None
    """
    plt.figure(figsize=(12, 8))
    
    # Modify class names based on unique values in y
    plot_tree(tree, 
              filled=True, 
              feature_names=X_train.columns, 
              class_names=['Class 0', 'Class 1', 'Class 2'])  # Adjust based on unique target values
    
    plt.show()

# Example usage:
visualize_tree(tree, X_train)


In [None]:
# Check the unique values in the target column
print(y.unique())


In [None]:
def visualize_tree(tree, X_train, y_train):
    """
    Visualize the trained decision tree using matplotlib.

    Args:
        tree (DecisionTreeClassifier): The trained decision tree model.
        X_train (pd.DataFrame): The training feature set.
        y_train (pd.Series): The training target variable.
    
    Returns:
        None
    """
    plt.figure(figsize=(12, 8))
    
    # Get unique classes in y_train and set appropriate class names
    unique_classes = y_train.unique()
    
    # Define the class names based on the number of unique classes
    if len(unique_classes) == 2:
        class_names = ['No Heart Disease', 'Heart Disease']
    elif len(unique_classes) == 3:
        class_names = ['Class 0', 'Class 1', 'Class 2']  # Customize as per your case
    else:
        class_names = [f'Class {int(cls)}' for cls in unique_classes]  # For multi-class
    
    # Plot the decision tree
    plot_tree(tree, 
              filled=True, 
              feature_names=X_train.columns, 
              class_names=class_names)  
    
    plt.show()

# Example usage:
visualize_tree(tree, X_train, y_train)


In [None]:
def predict_new_data(tree, new_data):
    """
    Use the trained decision tree to predict on new data.

    Args:
        tree (DecisionTreeClassifier): The trained decision tree model.
        new_data (pd.DataFrame): The new data to predict.
    
    Returns:
        np.ndarray: Predicted labels for the new data.
    """
    return tree.predict(new_data)

# Example usage (using X_test for demonstration):
predictions = predict_new_data(tree, X_test)

# Display predictions
predictions


In [None]:
predictions