# Exercise Notebook - class 7

## Import Packages

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score,  roc_curve, auc, make_scorer

## Read Data

In [None]:
# Load the heart-disease CSV directly from its raw GitHub URL (requires network access)
data = pd.read_csv('https://raw.githubusercontent.com/Hospital-Da-Luz-Learning-Health/MLCatolica24/main/Aula%207%20-%20Supervised%20Learning%20III/data/heart_disease.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

## EDA

In [None]:
# Summary statistics, transposed so that each column of `data` becomes one row
data.describe().T

In [None]:
# One histogram per column of `data`, drawn on a single 15x15 figure
hists = data.hist(figsize=(15, 15), bins=35)
plt.show()

## Decision Tree Training

Data selection

In [None]:
# Independent variables: every column except the target
X = data.drop(columns='Diagnosed')
# Dependent variable (target): the 'Diagnosed' column
y = data.loc[:, 'Diagnosed']

Train/test split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Decision Tree

Try different values for the hyperparameters below and interpret how the results change

In [None]:
#max_depth_value = ...
#min_samples_leaf_value = ...
#min_samples_split_value = ...

In [None]:
# Build the tree with the hyperparameters chosen in the previous cell,
# then fit it on the training split only
dtree = DecisionTreeClassifier(
    max_depth=max_depth_value,
    min_samples_leaf=min_samples_leaf_value,
    min_samples_split=min_samples_split_value,
    random_state=0,
)
dtree.fit(X_train, y_train)

Performance on the train set

In [None]:
# Check the accuracy on the training set.
# Complete the line below so that y_pred_train holds the tree's predictions for X_train.
#y_pred_train = dtree....
# Compare the true training labels with the predicted ones
accuracy = accuracy_score(y_train, y_pred_train)
print(f'Accuracy: {accuracy:.2f}')

Performance on the test set

In [None]:
# Make predictions on the test set
# y_predict = dtree....

In [None]:
# Accuracy on the held-out test set (y_predict comes from the cell above)
accuracy = accuracy_score(y_test, y_predict)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# Plot the confusion matrix
# Pass the label order explicitly so the matrix rows/columns always line up with
# display_labels, even if some class does not occur in the test split.
conf_matrix = confusion_matrix(y_test, y_predict, labels=dtree.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=dtree.classes_)
disp.plot()
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Visualize the Decision Tree - use the plot_tree() function imported from sklearn.tree
plt.figure(figsize=(20, 10))
#plot_tree(...)
plt.title('Decision Tree')
plt.show()

Check the feature importance

In [None]:
# Retrieve feature importance
# feature_importances = dtree....

In [None]:
# Pair every feature name with its importance, then order the rows
# from the most to the least important feature
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

In [None]:
# Plot feature importances as a horizontal bar chart.
# Complete the plt.barh(...) call below using the columns of feature_importance_df.
plt.figure(figsize=(8, 6))
#plt.barh(...)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances in Decision Tree Model')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.grid(True)
plt.show()

## Random Forest Training

Define the Random Forest hyperparameters you want to try

In [None]:
#n_estimators_value = ...
#max_depth_value = ...
#min_samples_leaf_value = ...
#min_samples_split_value = ...

In [None]:
# Train the Random Forest model
# Each hyperparameter value is passed to the keyword of the same name
# (min_samples_split <- min_samples_split_value, min_samples_leaf <- min_samples_leaf_value).
rf_classifier = RandomForestClassifier(
    n_estimators=n_estimators_value,
    max_depth=max_depth_value,
    min_samples_split=min_samples_split_value,
    min_samples_leaf=min_samples_leaf_value,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [None]:
# Make predictions on the train set
# Complete the line below so that y_predicted_train holds the forest's predictions for X_train.
#y_predicted_train = rf_classifier....

# Compare the true training labels with the predicted ones
accuracy = accuracy_score(y_train, y_predicted_train)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# Make predictions on the test set
# y_predicted = rf_classifier....

In [None]:
# Accuracy on the held-out test set (y_predicted comes from the cell above)
accuracy = accuracy_score(y_test, y_predicted)
print(f'Accuracy: {accuracy:.2f}')

You can visualize each tree individually

In [None]:
# Select tree number 50 from the forest --> use the estimators_ attribute of the RF model
# (the forest must have been trained with enough trees for that index to exist)
#one_tree = rf_classifier....

# Visualize the selected tree
plt.figure(figsize=(20, 10))
plot_tree(
    one_tree,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    # Label nodes with the classes the forest was trained on (the 'Diagnosed' target)
    # instead of hard-coded names that belong to a different dataset.
    class_names=[str(label) for label in rf_classifier.classes_],
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title('Decision Tree from Random Forest')
plt.show()

Compute the feature importances of the RF model

In [None]:
# Calculate feature importance --> use attribute feature_importances_ of the RF model
# feature_importances = rf_classifier....

In [None]:
# Build the feature/importance table and rank it from the most
# to the least important feature in a single expression
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances,
}).sort_values(by='Importance', ascending=False)

In [None]:
# Horizontal bar chart of the ranked Random Forest feature importances
plt.figure(figsize=(8, 6))
plt.barh(y=importance_df['Feature'], width=importance_df['Importance'])
plt.title('Feature Importance in Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.gca().invert_yaxis()  # most important feature at the top
plt.show()