In [2]:
# Common imports
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import precision_score, f1_score, recall_score, confusion_matrix, classification_report, accuracy_score

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

In [3]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    The output file is named ``<fig_id>.<fig_extension>``. A short progress
    message is printed, ``plt.tight_layout()`` is applied when
    ``tight_layout`` is true, and ``resolution`` is forwarded to
    ``plt.savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)

In [4]:
# Read the fetal-health CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path is hard-coded; presumably it only
# resolves when the notebook runs from a Kaggle-style working directory - confirm.
data = pd.read_csv('../input/fetal-health-classification/fetal_health.csv')


In [5]:
# Highly correlated features, used as the two model inputs
columns = ["prolongued_decelerations", "abnormal_short_term_variability"]
# Select with [] instead of pd.DataFrame(data, columns=...): the constructor
# silently produces all-NaN columns for a missing/misspelled name, whereas
# indexing fails immediately with a KeyError.
data1 = data[columns]
X = data1.to_numpy()
# Target labels, kept as a pandas Series
y = data["fetal_health"]

In [6]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Depth-limited decision tree (at most 4 levels) with a fixed random_state.
tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
# Fit on the complete feature matrix X and labels y; no train/test split is
# applied at this point. As the last expression, the fitted estimator is also
# echoed as the cell output.
tree_clf.fit(X, y)

In [7]:
# Plot the fitted tree. Feature and class names are passed explicitly so the
# nodes read as real quantities instead of the default x[0] / x[1] placeholders.
import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))
plot_tree(
    tree_clf,
    feature_names=columns,
    # Listed in ascending label order; assumes the labels are 1.0 / 2.0 / 3.0
    # with the same meaning used by the decision-boundary plot in this notebook.
    class_names=["Normal", "Suspect", "Pathological"],
    filled=True,
)
plt.savefig('decision_tree.png')
plt.show()

In [8]:
from sklearn.tree import export_graphviz

def image_path(fig_id):
    """Return the path of ``fig_id`` inside the IMAGES_PATH directory."""
    return os.path.join(IMAGES_PATH, fig_id)

# Export the fitted tree as a Graphviz .dot file.
export_graphviz(
        tree_clf,
        out_file=image_path("fetal_health.dot"),
        feature_names=columns,
        # class_names must be a sequence with one name per class in ascending
        # label order. A plain string such as "fetal_health" is indexed
        # character by character, labelling the classes "f", "e", "t".
        # Assumes labels 1.0 / 2.0 / 3.0 as used by the boundary plot.
        class_names=["Normal", "Suspect", "Pathological"],
        rounded=True,
        filled=True
    )

In [9]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=(-0.001, 0.007, 0, 90), iris=True, legend=False, plot_training=True):
    """Draw the decision regions of a fitted two-feature classifier.

    Everything is drawn on the current pyplot figure/axes.

    Parameters
    ----------
    clf : fitted classifier
        Its ``predict`` is evaluated on a 100 x 100 grid spanning ``axes``.
    X : 2-D array
        Column 0 is plotted on the x-axis, column 1 on the y-axis.
    y : array-like
        Labels aligned with ``X``; points are drawn for the values 1.0, 2.0
        and 3.0.
    axes : sequence of four numbers
        ``(x_min, x_max, y_min, y_max)`` for the prediction grid; also applied
        as the axis limits when ``plot_training`` is true. The default is a
        tuple rather than a list to avoid a shared mutable default argument.
    iris : bool
        When true, the axes are labelled with the two dataset feature names.
        When false, contour lines are added and generic x1/x2 labels are used.
    legend : bool
        When true, a legend is placed in the lower-right corner.
    plot_training : bool
        When true, the points in ``X`` are overlaid, one marker style per class.
    """
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    # Predict every grid point, then fold the flat result back into grid shape
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if not iris:
        plt.contour(x1, x2, y_pred, cmap="Paired", alpha=0.8)
    if plot_training:
        # (label value, matplotlib format string, legend entry) for each class
        class_styles = (
            (1.0, "y*", "Normal"),
            (2.0, "o", "Suspect"),
            (3.0, "D", "Pathological"),
        )
        for label_value, fmt, name in class_styles:
            mask = (y == label_value)
            plt.plot(X[:, 0][mask], X[:, 1][mask], fmt, label=name)
        plt.axis(axes)
    if iris:
        plt.xlabel("prolongued_decelerations", fontsize=14)
        plt.ylabel("abnormal_short_term_variability", fontsize=14)
    else:
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
    if legend:
        plt.legend(loc="lower right", fontsize=14)

# Draw and save the decision regions of the fitted tree
plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf, X, y)

save_fig("decision_tree_decision_boundaries_plot")
plt.show()

# Predicting classes and class probabilities
# Only the last expression of a cell is echoed, so the class prediction is
# printed explicitly; as a bare expression its result was silently discarded.
print(tree_clf.predict([[0.2, 45]]))
tree_clf.predict_proba([[0.0, 70]])

In [10]:
from sklearn.model_selection import train_test_split
# Stratified 70/30 split so each class keeps its proportion in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y,random_state = 0)
from sklearn.preprocessing import StandardScaler
# Scaling statistics come from the training split only and are reused on the test split
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Until this point tree_clf has been fitted on the raw, unscaled, complete
# dataset. Predicting standardized test rows with that model compares values on
# a different scale against its thresholds, and the test rows were part of its
# training data. Refit on the scaled training split so the evaluation below is
# both consistent and genuinely held-out.
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)

In [11]:
# Confusion matrix of the test-set predictions (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
# fmt="d" prints the integer counts in full; the default ".2g" annotation format
# renders any count of 100 or more in scientific notation (e.g. 497 -> 5e+02).
sns.heatmap(cm, square=True, annot=True, fmt="d", linewidths=1, cmap="BuGn").set(
    xlabel="Predicted label", ylabel="True label"
)
print("Confusion Matrix -\n", cm)

In [12]:
# Accuracy plus precision / recall / F1 of the decision tree on the test split.
# All three per-class metrics use the same macro average so the columns are
# comparable. Precision was previously micro-averaged, which for single-label
# multiclass predictions simply reproduces the accuracy value.

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred,average='macro')

# One-row summary table, displayed as the cell output
result = pd.DataFrame([['Decision Tree', accuracy, precision, recall, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
result

In [13]:
# Score of the tree on the held-out split (for a classifier, .score() reports mean accuracy).
tree_clf.score(X_test, y_test)

In [14]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tree on the training split; the cell output
# is the mean of the per-fold scores.
scores = cross_val_score(estimator=tree_clf, X=X_train, y=y_train, cv=5)
scores.mean()

In [15]:
# Same 5-fold cross-validation, this time run on the held-out split only;
# the cell output is the mean of the per-fold scores.
scores = cross_val_score(estimator=tree_clf, X=X_test, y=y_test, cv=5)
scores.mean()