# Decision Tree Exhibition

## Part 1: Decision Tree from Scratch

In [2]:
# Step 0: Import the necessary packages
from from_scratch.decision_tree import DecisionTree, information_gain
from from_scratch.evaluation_metrics import f1_measure, precision_and_recall, confusion_matrix, accuracy
from from_scratch.import_data import load_data, train_test_split

In [3]:
# Step 1: Import diabetes.csv with load_data
features, targets, attribute_names = load_data("diabetes.csv")
train_features, train_targets, test_features, test_targets = train_test_split(features, targets, fraction = 0.85)

In [4]:
# Step 2: Fit a decision tree to the training data
learner = DecisionTree(attribute_names)
learner.fit(train_features, train_targets)

learner.visualize() # visualize tree

0: Glucose == 128.0
1:  BMI == 26.5
2:   Pregnancies == 8.0
3:    DiabetesPedigreeFunction == 0.678
4:     root == 0
4:     Insulin == 59.0
5:      Age == 28.0
6:       root == 1
6:       root == 0
5:      root == 0
3:    BloodPressure == 55.0
4:     root == 1
4:     Age == 54.0
5:      root == 0
5:      DiabetesPedigreeFunction == 0.409
6:       SkinThickness == 0.0
7:        root == 1
7:        Insulin == 0.0
8:         root == 1
8:         root == 1
6:       root == 0
2:   Age == 29.0
3:    SkinThickness == 10.0
4:     BloodPressure == 80.0
5:      DiabetesPedigreeFunction == 0.391
6:       Pregnancies == 4.0
7:        Insulin == 0.0
8:         root == 1
8:         root == 0
7:        root == 0
6:       root == 0
5:      Pregnancies == 2.0
6:       root == 0
6:       root == 1
4:     DiabetesPedigreeFunction == 0.496
5:      BloodPressure == 85.0
6:       Pregnancies == 3.0
7:        root == 0
7:        Insulin == 88.0
8:         root == 0
8:         root == 0
6:       Insulin == 12

In [5]:
# Step 3: Predict labels of testing set and evaluate the decision tree's performance
predictions = learner.predict(test_features)

confusion_mat = confusion_matrix(test_targets, predictions)
accuracy_num = accuracy(test_targets, predictions)
precision, recall = precision_and_recall(test_targets, predictions)
f1_measure_num = f1_measure(test_targets, predictions)

print(f"Confusion Matrix:\n{confusion_mat}\n")
print(f"Accuracy: {accuracy_num}\n")
print(f"Precision: {precision}; Recall: {recall}\n")
print(f"F1_Measure: {f1_measure_num}\n")

Confusion Matrix:
[[56 14]
 [25 21]]

Accuracy: 0.6637931034482759

Precision: 0.6; Recall: 0.45652173913043476

F1_Measure: 0.5185185185185185



## Part 2: Decision Tree with scikit-learn


In [6]:
# Step 0: Import the necessary packages
## For preparing the data and fitting the decision tree
import pandas as pd
import sklearn.tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text

In [7]:
# Step 1: Import and wrangle diabetes.csv
pima = pd.read_csv("diabetes.csv", header = 0)
X = pima[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]]
y = pima["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 1)

In [8]:
# Step 2: Fit decision tree classifier
clf = DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(X_train, y_train)

print(export_text(clf, feature_names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"])) # Visualize tree

|--- Glucose <= 127.50
|   |--- BMI <= 26.45
|   |   |--- BMI <= 9.10
|   |   |   |--- Glucose <= 114.50
|   |   |   |   |--- class: 0
|   |   |   |--- Glucose >  114.50
|   |   |   |   |--- class: 1
|   |   |--- BMI >  9.10
|   |   |   |--- DiabetesPedigreeFunction <= 0.67
|   |   |   |   |--- class: 0
|   |   |   |--- DiabetesPedigreeFunction >  0.67
|   |   |   |   |--- DiabetesPedigreeFunction <= 0.71
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- DiabetesPedigreeFunction >  0.71
|   |   |   |   |   |--- class: 0
|   |--- BMI >  26.45
|   |   |--- Age <= 28.50
|   |   |   |--- BMI <= 30.95
|   |   |   |   |--- Pregnancies <= 7.00
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Pregnancies >  7.00
|   |   |   |   |   |--- class: 1
|   |   |   |--- BMI >  30.95
|   |   |   |   |--- BloodPressure <= 51.00
|   |   |   |   |   |--- BMI <= 34.40
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- BMI >  34.40
|   |   |   |   |   |   |--- BMI <= 48.55
|   |   |  

In [9]:
# Step 3: Predict labels of testing set and evaluate the decision tree's performance
y_predictions = clf.predict(X_test)

confusion_mat2 = metrics.confusion_matrix(y_test, y_predictions)
accuracy_num2 = metrics.accuracy_score(y_test, y_predictions)
precision2, recall2, f1_measure_num2, _ = metrics.precision_recall_fscore_support(y_test, y_predictions)

print(f"Confusion Matrix:\n{confusion_mat2}\n")
print(f"Accuracy: {accuracy_num2}\n")
print(f"Precision: {precision2}; Recall: {recall2}\n")
print(f"F1_Measure: {f1_measure_num2}\n")

Confusion Matrix:
[[54 21]
 [19 22]]

Accuracy: 0.6551724137931034

Precision: [0.73972603 0.51162791]; Recall: [0.72       0.53658537]

F1_Measure: [0.72972973 0.52380952]

