In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
# Load the dataset; the path is resolved relative to the working directory.
df = pd.read_csv("diabetes.csv")
# Only the final expression of a cell is rendered automatically, so the preview
# has to be handed to display() (the IPython built-in) or it is silently dropped.
display(df.head(10))
# info() prints its report itself, so it needs no wrapper.
df.info()
# Class balance of the target column.
df['Outcome'].value_counts()

In [None]:
# Preview the first rows of the dataset (head() defaults to five rows).
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

In [None]:
# Count the rows that exactly repeat an earlier row.
is_repeated_row = df.duplicated()
is_repeated_row.sum()

In [None]:
# Missing-value count per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Show the data type of every column.
df.dtypes

In [None]:
# Summary statistics, transposed to one row per column and rendered with a
# colour gradient so magnitudes can be compared at a glance.
summary_stats = df.describe().transpose()
summary_stats.style.background_gradient(cmap='hot_r')

In [None]:
# Pairwise correlation between all columns of df (the Outcome label included).
corr = df.corr()
# annot=True writes each coefficient inside its heatmap cell.
sns.heatmap(data=corr, cmap="YlGnBu", annot=True)

In [None]:
# Histogram with a KDE overlay showing the distribution of Glucose.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
ax = sns.histplot(df["Glucose"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Histogram with a KDE overlay showing the distribution of BloodPressure.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
ax = sns.histplot(df["BloodPressure"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Histogram with a KDE overlay showing the distribution of SkinThickness.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
ax = sns.histplot(df["SkinThickness"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Histogram with a KDE overlay showing the distribution of Insulin.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
ax = sns.histplot(df["Insulin"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Histogram with a KDE overlay showing the distribution of BMI.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
ax = sns.histplot(df["BMI"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Histogram with a KDE overlay showing the distribution of Age.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
ax = sns.histplot(df["Age"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Histogram with a KDE overlay showing the distribution of DiabetesPedigreeFunction.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
ax = sns.histplot(df["DiabetesPedigreeFunction"], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Pair plot: visualize every column against every other one to see how the
# columns relate pair-wise.
sns.pairplot(data=df)

In [None]:
# Bar chart of how many rows fall into each Outcome class.
ax = sns.countplot(data=df, x="Outcome", palette='coolwarm')

In [None]:
# Bar chart of how many rows there are for each Pregnancies value.
ax = sns.countplot(data=df, x="Pregnancies", palette='coolwarm')

In [None]:
# Joint distribution of Age and Glucose; with both x and y given, displot's
# default histogram kind draws a bivariate (2-D) histogram.
sns.displot(data=df, x="Age", y="Glucose")

In [None]:
# Separate the predictors from the label. SkinThickness is the one measured
# column deliberately absent from the predictor list.
feature_cols = [
    'Pregnancies',
    'Insulin',
    'BMI',
    'Age',
    'Glucose',
    'BloodPressure',
    'DiabetesPedigreeFunction',
]
X = df[feature_cols]  # feature matrix
y = df['Outcome']  # target vector

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# K Nearest Neighbours

In [None]:
def calculate_precision(tp, fp):
    """Return precision, tp / (tp + fp).

    tp: number of true positives.
    fp: number of false positives.
    No guard is applied when tp + fp is zero.
    """
    predicted_positives = tp + fp
    return tp / predicted_positives

def calculate_recall(tp, fn):
    """Return recall, tp / (tp + fn).

    tp: number of true positives.
    fn: number of false negatives.
    No guard is applied when tp + fn is zero.
    """
    actual_positives = tp + fn
    return tp / actual_positives

def calculte_harmonic_mean(precision, recall):
    """Return the harmonic mean of precision and recall (the F1 score).

    No guard is applied when precision + recall is zero.
    """
    doubled_product = 2 * precision * recall
    return doubled_product / (precision + recall)

def calculate_specificity(tn, fp):
    """Return specificity (true-negative rate), tn / (tn + fp).

    tn: number of true negatives.
    fp: number of false positives.
    No guard is applied when tn + fp is zero.
    """
    # The parentheses matter: without them `tn/tn+fp` is evaluated as
    # (tn/tn) + fp, i.e. 1 + fp, which is not a rate at all.
    return tn / (tn + fp)

In [None]:
# Build a 3-nearest-neighbour classifier and fit it on the training split.
# NOTE(review): the features are passed in unscaled; KNN is distance-based, so
# confirm whether standardising them first is wanted.
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict the outcome of every held-out sample with the fitted KNN model.
# (A pasted estimator repr used to sit at the top of this cell; it only built
# and discarded an unused, unfitted classifier, so it has been removed.)
knn_pred = knn_classifier.predict(X_test)

knn_pred

In [None]:
# Ground-truth labels of the test split.
y_test

In [None]:
# Confusion matrix for KNN: rows are the true labels, columns the predicted ones.
knn_conf_mat = confusion_matrix(y_true=y_test, y_pred=knn_pred)
knn_conf_mat

In [None]:
# scikit-learn lays the binary confusion matrix out as [[tn, fp], [fn, tp]]
# (rows = true label, columns = predicted label), so ravel() yields the four
# cells in that order. Indexing [0][0] as "tp" would score class 0
# (no diabetes) as the positive class and disagree with the precision/recall/F1
# that sklearn reports for the decision tree, which treat class 1 as positive.
tn, fp, fn, tp = knn_conf_mat.ravel()
precision = calculate_precision(tp, fp)
recall = calculate_recall(tp, fn)
harmonic_mean = calculte_harmonic_mean(precision, recall)
specificity = calculate_specificity(tn, fp)
print("Precision:" +str(precision))
print("Accuracy %.4f" % accuracy_score(y_test, knn_pred))
print("Recall:" +str(recall))
print("F1 Score:" +str(harmonic_mean))
print("Specificity Score:" + str(specificity))

# Decision Tree Classifier

In [None]:
# Create the decision tree classifier. random_state is pinned (same seed as the
# train/test split) because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ from run to run.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)

# Train the decision tree classifier
clf = clf.fit(X_train,y_train)

# Predict the response for the test dataset
y_preds = clf.predict(X_test)

In [None]:
# Report the test-set scores of the decision tree, one line per metric.
tree_score_lines = (
    ("Accuracy %.4f", accuracy_score),
    ("Precision %.4f", precision_score),
    ("Recall %.4f", recall_score),
    ("F1 %.4f", f1_score),
)
for line_template, score_fn in tree_score_lines:
    print(line_template % score_fn(y_test, y_preds))

In [None]:
# Draw the fitted tree held in clf, labelling nodes with the feature names and
# colouring them by majority class.
plt.figure(figsize=(15, 10))
plot_tree(
    decision_tree=clf,
    feature_names=list(X.columns),
    class_names=['No Diabetes', 'Diabetes'],
    filled=True,
)
plt.show()

# Grid Search

In [None]:
# Hyperparameter grid explored by the 5-fold cross-validated search.
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],'min_samples_split': [2, 3, 4]}
# The base estimator is seeded so that the cross-validation scores, and hence
# the winning parameter combination, are the same on every run.
# NOTE: this rebinds clf from the fitted tree above to the GridSearchCV object.
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_para, cv=5)
clf.fit(X_train,y_train)
print("tuned hpyerparameters :(best parameters) :",clf.best_params_)
print(clf.best_estimator_)

In [None]:
# Train the final tree with the hyperparameters the grid search (clf) actually
# selected, rather than values hand-copied from an earlier run that can go
# stale whenever the search is re-executed. The seed keeps the fit reproducible.
dt_model = DecisionTreeClassifier(random_state=42, **clf.best_params_)
dt_model.fit(X_train,y_train)

In [None]:
# Display the fitted model. This cell previously held a pasted estimator repr,
# which constructed and discarded a brand-new, unfitted classifier.
dt_model

In [None]:
# Predict once and build the confusion matrix once, then reuse it for both the
# printed and the plotted view.
dt_pred = dt_model.predict(X_test)
dt_conf_mat = confusion_matrix(y_test, dt_pred)
print(dt_conf_mat)
# fmt="d" prints the cell counts as plain integers; the default format would
# switch to scientific notation for counts of 100 or more.
sns.heatmap(dt_conf_mat, annot=True, fmt="d")
plt.show()

In [None]:
# Score the tuned tree on the test split. The predictions are computed a single
# time and shared by all four metrics instead of calling predict() per metric.
dt_pred = dt_model.predict(X_test)
print("Accuracy %.4f" % accuracy_score(y_test, dt_pred))
print("Precision %.4f" % precision_score(y_test, dt_pred))
print("Recall %.4f" % recall_score(y_test, dt_pred))
print("F1 %.4f" % f1_score(y_test, dt_pred))