# Decision Tree - Heart Disease - to determine top Features

In [None]:
# Initial imports
from pathlib import Path

import pandas as pd
import tensorflow as tf  # tf.keras is used by the neural-network cells below
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Loading and Processing Data

In [None]:
# Read the heart-disease CSV (described by the author as already cleaned and
# preprocessed) into a DataFrame.
heart_data = pd.read_csv(filepath_or_buffer="resources/heartdisease.csv")

# Preview the first five rows
heart_data.head(5)

In [None]:
# Further clean data - remove the "Unnamed: 0" column.
# NOTE(review): presumably this is the row index written out when the CSV was
# saved -- confirm against the file.
# errors='ignore' keeps the cell re-runnable: once the column is gone, running
# it again is a no-op instead of raising KeyError.
heart_data = heart_data.drop(columns=['Unnamed: 0'], errors='ignore')

# Verify the remaining column names
heart_data.columns

In [None]:
# Collapse every positive Diagnosis code to 1 so the column is binary
# (0 stays 0, anything greater than 0 becomes 1).
# The assignment goes through .loc on the DataFrame itself: writing into the
# array returned by `.values` only reaches the frame when that array happens
# to be a writable view, and it is read-only under pandas Copy-on-Write.
heart_data.loc[heart_data['Diagnosis'] > 0, 'Diagnosis'] = 1

# Verify
heart_data['Diagnosis'].head()

In [None]:
# Take a deep copy of heart_data so the snapshot does not share data with the
# original frame. No values are changed in this cell.
heart_new = heart_data.copy(deep=True)

# Preview the Diagnosis column of the copy
heart_new.loc[:, 'Diagnosis'].head(5)

In [None]:
# Create a one-column DataFrame holding the yes/no (1/0) diagnosis.
# Binarise first, then take the column as its own frame. These are two
# separate statements on purpose: a chained `name = frame[...] = 1` binds the
# name to the scalar 1, not to any data.
heart_data.loc[heart_data['Diagnosis'] > 0, 'Diagnosis'] = 1
dgHeartData = heart_data[['Diagnosis']].copy()

# Verify
heart_data.head()

In [None]:
# Feature set: every column of heart_data except the Diagnosis target.
# drop() returns a new frame, so heart_data itself is left unchanged.
X = heart_data.drop(columns=['Diagnosis'])
X.head(5)

In [None]:
# Target vector: the Diagnosis labels as a flat 1-D array, one entry per row.
# scikit-learn classifiers expect a 1-D y for a single-output problem; passing
# an (n_samples, 1) column vector makes RandomForestClassifier.fit emit a
# DataConversionWarning, so the array is deliberately not reshaped.
y = heart_data['Diagnosis'].values
y[:5]

In [None]:
# Partition features and target into training and testing subsets.
# No test_size is passed, so the library's default split proportion applies;
# random_state=78 pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Instantiate an (unfitted) StandardScaler with default settings
scaler = StandardScaler()

In [None]:
# Fit the StandardScaler on the training features only, so the test split
# does not influence the scaling statistics.
X_scaler = scaler.fit(X=X_train)

In [None]:
# Apply the fitted scaler to both splits (training first, then testing);
# both are transformed with the statistics learned from X_train.
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

# Fitting the Decision Tree Model

In [None]:
# Instantiate a decision tree classifier with default hyperparameters.
# No random_state is set here.
model = tree.DecisionTreeClassifier()

In [None]:
# Train the decision tree on the scaled training features and their labels
model = model.fit(X=X_train_scaled, y=y_train)

# Making Predictions Using the Tree Model

In [None]:
# Predict a Diagnosis label for every row of the scaled test features
predictions = model.predict(X=X_test_scaled)

# Model Evaluation

In [None]:
# Fraction of test rows whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Confusion matrix of true vs. predicted labels, wrapped in a labelled
# DataFrame for display (two labels per axis, i.e. a binary target).
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [None]:
# Report the decision tree's evaluation: confusion matrix table, accuracy,
# then the classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

# Fitting the Random Forest Model

In [None]:
# Instantiate a random forest of 500 trees; random_state=78 seeds it so
# repeated runs are reproducible.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [None]:
# Train the random forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

# Making Predictions Using the Random Forest Model

In [None]:
# Predict a Diagnosis label for every row of the scaled test features.
# This rebinds `predictions`, replacing the decision tree's output.
predictions = rf_model.predict(X=X_test_scaled)

# Model Evaluation

In [None]:
# Fraction of test rows the random forest labelled correctly
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Confusion matrix of true vs. predicted labels, wrapped in a labelled
# DataFrame for display (two labels per axis, i.e. a binary target).
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [None]:
# Report the random forest's evaluation: confusion matrix table, accuracy,
# then the classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

# Feature Importance - Max Heart Rate and Age

In [None]:
# The fitted random forest exposes one importance score per feature column
importances = rf_model.feature_importances_

# Pair each score with its column name and list them from most to least
# important (X must still be the feature DataFrame here, for X.columns).
sorted(zip(importances, X.columns), reverse=True)

In [None]:
# Build a table of importances indexed by feature name.
# The raw frame has column 0 = importance score and column 1 = feature name;
# column 1 becomes the index and column 0 gets a descriptive label.
importances_df = (
    pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
    .set_index(1)
    .rename(columns={0: 'Feature Importances'})
)

# Order ascending by importance, then draw a horizontal bar chart
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)

In [None]:
# Count the distinct values in every column of heart_data
uniqueValues = heart_data.nunique(axis=0)
uniqueValues

In [None]:
# Tally how often each distinct Max Heart Rate value occurs, then order the
# tallies from most to least frequent and preview the top five.
MaxRate_Type = heart_data.loc[:, 'Max Heart Rate'].value_counts()
Sorted_MaxRate = MaxRate_Type.sort_values(ascending=False)
Sorted_MaxRate.head(5)

# Pre-Process Data and create Standard Scaler

In [None]:
# Keep only the Max Heart Rate values whose occurrence count is below 53,
# then show them from most to least frequent.
Sorted_MaxRate53 = Sorted_MaxRate.loc[Sorted_MaxRate.lt(53)]
Sorted_MaxRate53.sort_values(ascending=False)

In [None]:
# Bin Max Heart Rate values by frequency: every value whose occurrence count
# exceeds MaxRate_cutoff is collapsed into the single label "Other".
# NOTE(review): the comparison selects the values occurring MORE often than
# the cutoff. Binning is usually applied to the rare values (count < cutoff);
# confirm that `>` is the intended direction.
MaxRate_cutoff = 19
mr19 = heart_data['Max Heart Rate'].value_counts()
mr_to_replace = list(mr19[mr19 > MaxRate_cutoff].index)

# Replace all selected values in one pass rather than once per value.
# The guard leaves the column (and its dtype) untouched when nothing qualifies.
if mr_to_replace:
    heart_data['Max Heart Rate'] = heart_data['Max Heart Rate'].replace(mr_to_replace, "Other")

# Check to make sure replacement was successful
heart_data['Max Heart Rate'].value_counts()

In [None]:
# Tally how often each distinct Age value occurs, then order the tallies
# from most to least frequent and preview the top five.
ages = heart_data.loc[:, 'Age'].value_counts()
Sorted_Ages = ages.sort_values(ascending=False)
Sorted_Ages.head(5)

In [None]:
# One-hot encode the categorical columns of heart_data with pd.get_dummies
# (called with its default settings).
converted_heart_data = pd.get_dummies(data=heart_data)
converted_heart_data.head(5)

In [None]:
# Separate the encoded data into a feature array (everything except
# Diagnosis) and a target array (the Diagnosis column), both as NumPy arrays.
X = converted_heart_data.drop(columns=['Diagnosis']).values
y = converted_heart_data.loc[:, 'Diagnosis'].values

# Partition into training and testing subsets; random_state=78 pins the
# shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Fresh StandardScaler for the re-encoded features
scaler = StandardScaler()

# Learn the scaling statistics from the training split only
X_scaler = scaler.fit(X_train)

# Transform both splits with those training-derived statistics
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Run Neural Net for Feature importances - Max Heart Rate and Age

In [None]:
# Network dimensions: the input width is the number of entries in one
# training row; the two hidden layers have 90 and 30 units.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 90
hidden_nodes_layer2 = 30

# Assemble the feed-forward network by handing the layers to Sequential in
# order, instead of adding them one at a time.
nn = tf.keras.models.Sequential(
    [
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation='relu',
        ),
        # Second hidden layer
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='leaky_relu'),
        # Output layer: a single sigmoid unit
        tf.keras.layers.Dense(units=1, activation='sigmoid'),
    ]
)

# Print the layer-by-layer structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the tracked metric.
nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the network on the scaled training data for 300 epochs and keep the
# object returned by fit() in fitModel.
fitModel = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=300,
)

In [None]:
# Score the trained network on the held-out test split. evaluate() yields the
# loss followed by the compiled metric (accuracy).
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")