In [14]:
# MN is using Kernel Python 3.12.1 for this module, and within the Codespace named:
# jubilant space system

# In this week's Python sessions we’ll look at how we can feed our data into some ML code to
# build a decision tree, which can help us with categorisation problems in a way that’s maybe
# more visually tractable.

# Here’s a link, again, to the google developers course on machine learning, but specifically to
# decision trees: https://developers.google.com/machine-learning/decision-forests/decision-trees
# It’s not required reading but it’ll help and will clear up things that I’m not always the best
# at explaining live, whilst also trying to remember code!

In [15]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.impute import SimpleImputer

In [None]:
# Column labels applied to the CSV in place of whatever its first line contains.
# NOTE(review): 'pregancies' looks like a misspelling of 'pregnancies'; it is left
# untouched because it is the actual column key carried through the later cells.
col_names = [
    'pregancies',
    'glu',
    'bp',
    'skin',
    'insulin',
    'bmi',
    'pedigree',
    'age',
    'label',
]

# Location of the raw diabetes CSV.
data_url = 'https://raw.githubusercontent.com/data-to-insight/ERN-sessions/main/data/diabetes_data.csv'

# skiprows=1 discards the first line of the file (presumably its own header row)
# so that the labels in `col_names` are used instead.
df = pd.read_csv(data_url, skiprows=1, names=col_names)

df.head()

In [None]:
# Columns in which a literal 0 is treated as a missing reading: everything except
# the first column and the trailing 'label' column.
error_cols = col_names[1:-1]

# Turn the zero placeholders into NaN so the imputer recognises them as missing.
df[error_cols] = df[error_cols].replace(0, np.nan)

# Fill each column's NaNs with that column's own mean, one column at a time.
# NOTE(review): the mean is fitted on the whole frame, i.e. before the train/test
# split in a later cell, so held-out rows contribute to the fill value.
imp = SimpleImputer(strategy='mean')
for column in error_cols:
    # SimpleImputer expects 2-D input, hence the (n_rows, 1) reshape.
    column_as_2d = df[column].to_numpy().reshape(-1, 1)
    df[column] = imp.fit_transform(column_as_2d)

df.head(10)

In [None]:
# Every column except the trailing 'label' column is used as a model input.
features = col_names[:-1]

X = df.loc[:, features]  # feature matrix
y = df.loc[:, 'label']   # target column

X.head(10)

# Background reading on imputation: https://scikit-learn.org/stable/modules/impute.html

In [19]:
# Hold back 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

# Seed the tree as well: scikit-learn permutes the features at every split, so when
# two candidate splits score equally an unseeded tree can pick a different one on
# each run, and the predictions (and every metric derived from them) then change
# between runs of the notebook.
clf = DecisionTreeClassifier(random_state=1)

clf.fit(X_train, y_train)

# Predicted class for each held-out row.
y_pred = clf.predict(X_test)

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Rows are the real classes, columns the predicted classes.
cm = metrics.confusion_matrix(y_test, y_pred)

# Tick labels, in the same order as the matrix rows/columns (sorted label values).
# Assumes label 0 means 'no diabetes' and 1 means 'diabetes' - TODO confirm against the data.
target_values = ['no diabetes', 'diabetes']

# fmt='d' prints the cell counts as whole numbers; seaborn's default annotation
# format ('.2g') would show a count such as 120 as 1.2e+02.
sns.heatmap(cm,
            annot=True,
            fmt='d',
            xticklabels=target_values,
            yticklabels=target_values)
plt.xlabel('Predicted values')
plt.ylabel('Real values')
plt.title('Confusion Matrix');  # trailing ';' hides the Text(...) repr under the plot

In [None]:
# Work the precision/recall percentages out from the confusion matrix itself rather
# than from counts typed in by hand, so the figures always match the current model.
# For a two-class matrix, ravel() yields the cells in the order:
# true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = cm.ravel().tolist()

# precision = correct predictions of a class / all predictions of that class
# recall    = correct predictions of a class / all real members of that class
print(f'Diabetes precision {tp / (tp + fp) * 100}')
print(f'Diabetes recall {tp / (tp + fn) * 100}')
print()
print(f'No Diabetes precision {tn / (tn + fn) * 100}')
print(f'No Diabetes recall {tn / (tn + fp) * 100}')

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics for the current predictions, returned as a dict
# (output_dict=True) instead of a formatted text table.
output_report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=target_values,
    output_dict=True,
)

output_report

In [None]:
# Draw the fitted decision tree as a PNG using GraphViz.
#
# This cell has previously failed with "GraphViz's executables not found".
# (MN's notes on that problem are on the home PC, see:
#  D:\Mark\work\04 ESCC\a training\2024 Python\Kernel issues\ )
#
# IN THE TERMINAL, run these 3 lines 1-by-1:
#   pip install graphviz
#   pip install pydotplus
#   conda install graphviz
#
# Alternatively, run the same installs from a notebook cell:
#   %pip install graphviz
#   %pip install pydotplus
#   %conda install graphviz
# The % magics install into the environment the running kernel uses, so this is a
# valid alternative to the Terminal.
#
# Why graphviz appears twice ("we have to install graphviz twice, this is how I could
# get it to work, I don't know why this is the case" - Will): the pip package only
# provides the Python interface, whereas the conda package provides the GraphViz
# executables (the `dot` program) that pydotplus calls to render the image.

from io import StringIO  # standard-library StringIO; no need for the third-party `six`

from IPython.display import Image
import pydotplus
from sklearn.tree import export_graphviz

# Export the tree in GraphViz DOT format into an in-memory text buffer.
dot_data = StringIO()
export_graphviz(clf,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True,
                feature_names=features,
                class_names=target_values)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# Save a copy to disk and also show the image inline.
graph.write_png('diabetes.png')
Image(graph.create_png())

In [None]:
# Refit with a shallower tree (at most 3 levels) that chooses its splits by entropy.
# random_state pins the tie-breaking between equally good splits so reruns agree.
clf = DecisionTreeClassifier(max_depth=3, criterion='entropy', random_state=1)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred))

# Rebuild the report from the NEW predictions. This cell used to display the
# `output_report` computed for the earlier tree, so the report did not reflect the
# model fitted above. (The original note here queried "accuracy': 0.7186147186147186 ??",
# presumably read from that stale report - TODO confirm.)
output_report = metrics.classification_report(y_test,
                                              y_pred,
                                              target_names=target_values,
                                              output_dict=True)

output_report

In [26]:
# The second attribute selection measure we looked at was entropy.
# Entropy is a measure of how homogeneous the samples in a node are, or how random
# the samples in the node are. It can be thought of as how messy the data is in a given node,
# or how much information our node gives us about the outcomes we want to predict.

# High entropy means the system is messy and doesn’t give us a lot of information about a prediction.
# When we use entropy as our splitting criterion, which we did for the second plot, the tree chooses
# the split that maximises the decrease in average entropy between a node and its child
# nodes (this decrease in average entropy is called information gain).
# Essentially then, when choosing entropy as our attribute selection method,
# we split nodes where, after that split, we have a more homogeneous set of samples,
# which can be thought of as having a better set of samples from which to make a prediction.