# Decision Tree - Heart Disease

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Loading and Processing Data

In [2]:
# Read the cleaned, preprocessed heart-disease CSV (path is relative to the
# current working directory)
heart_data = pd.read_csv(filepath_or_buffer="./heartdisease.csv")

# Preview the first rows of the loaded frame
heart_data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,2
2,2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


In [3]:
# Further clean data - remove the leftover "Unnamed: 0" column
# (presumably an index that was saved into the CSV - TODO confirm).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column is already gone from
# heart_data after the first run.
heart_data = heart_data.drop(columns=['Unnamed: 0'], errors='ignore')

# Verify
heart_data.columns

Index(['Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure',
       'Cholesterol', 'Fasting Blood Sugar', 'Resting EKG', 'Max Heart Rate',
       'Exercise Enduced Chest Pain', 'STDep Induced by Exercise',
       'Slope of Peak Exercise ST', 'Diagnosis'],
      dtype='object')

In [4]:
# Change values: collapse Diagnosis into a binary target.
# Every value greater than 0 becomes 1; rows that are already 0 stay 0.
# The assignment goes through .loc instead of writing into the array returned
# by `.values`: that older form only works while the array happens to be a
# writable view of the frame, and with pandas Copy-on-Write the array is
# read-only, so the write raises instead of updating heart_data.
heart_data.loc[heart_data['Diagnosis'] > 0, 'Diagnosis'] = 1

# Verify
heart_data['Diagnosis'].head()

0    0
1    1
2    1
3    0
4    0
Name: Diagnosis, dtype: int64

In [5]:
# Keep an independent copy of the frame in heart_new.
# This cell does not modify Diagnosis itself; it only copies heart_data as it
# currently stands (0 = absence, values above 0 = presence).
heart_new = heart_data.copy(deep=True)

# Preview the Diagnosis column of the copy
heart_new.loc[:, 'Diagnosis'].head()

0    0
1    1
2    1
3    0
4    0
Name: Diagnosis, dtype: int64

In [6]:
# Create the data frame for the yes/no diagnosis.
# The previous chained assignment (`dgHeartData = <array slice> = 1`) bound
# dgHeartData to the integer 1 rather than to a frame, and it wrote through
# `.values`, which is read-only under pandas Copy-on-Write.
# Binarize through .loc instead (a no-op when Diagnosis is already 0/1), then
# take an independent copy so dgHeartData really is a DataFrame.
heart_data.loc[heart_data['Diagnosis'] > 0, 'Diagnosis'] = 1
dgHeartData = heart_data.copy()

# Verify
dgHeartData.head()

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


In [19]:
# Feature matrix: every column of heart_data except the Diagnosis target.
# drop() returns a new frame, so heart_data itself is left untouched.
X = heart_data.drop(columns=['Diagnosis'])
X.head()

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0


In [20]:
# Target vector: the Diagnosis column as a 2-D array with a single column
# (selecting with a one-item list keeps the column axis)
y = heart_data[['Diagnosis']].values
y[:5]

array([[0],
       [1],
       [1],
       [0],
       [0]])

In [21]:
# Partition features and target into training and testing subsets.
# No split size is passed, so scikit-learn's default proportion applies;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [22]:
# Creating StandardScaler instance with its default settings.
# Nothing is fitted here; the scaler is only constructed.
scaler = StandardScaler()

In [23]:
# Fitting the Standard Scaler on the training features only (X_train),
# so the test split does not contribute to the fitted statistics.
# The value returned by fit() is kept as X_scaler and used for transforming.
X_scaler = scaler.fit(X_train)

In [24]:
# Apply the fitted scaler to both partitions: training first, then testing
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# Fitting the Decision Tree Model

In [25]:
# Creating the decision tree classifier instance.
# random_state is pinned so the fitted tree, and therefore the metrics
# reported in the evaluation cells, are the same on every Restart & Run All.
# Without a seed, scikit-learn may choose differently between equally good
# splits from one run to the next. 78 is the seed this notebook already
# passes to train_test_split.
model = tree.DecisionTreeClassifier(random_state=78)

In [26]:
# Train the tree on the scaled training features and their labels;
# the name `model` is rebound to whatever fit() returns
model = model.fit(X=X_train_scaled, y=y_train)

# Making Predictions Using the Tree Model

In [27]:
# Predict a label for every row of the scaled testing features
predictions = model.predict(X=X_test_scaled)

# Model Evaluation

In [28]:
# Accuracy score of the predictions against the true test labels
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix of true test labels vs. predictions, wrapped in a
# DataFrame with hard-coded labels for the two classes (0 and 1)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

In [29]:
# Show the evaluation results in order: confusion matrix table, accuracy,
# then the per-class classification report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,36,20
Actual 1,26,44


Accuracy Score : 0.6349206349206349
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.64      0.61        56
           1       0.69      0.63      0.66        70

    accuracy                           0.63       126
   macro avg       0.63      0.64      0.63       126
weighted avg       0.64      0.63      0.64       126

