In [2]:
import numpy as np
import pandas as pd

## modelling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw diabetes dataset from the project-relative data directory.
data = pd.read_csv(filepath_or_buffer='data/Healthcare-Diabetes.csv')

In [4]:
# Remove the 'Id' column so it is not carried into the feature matrix.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned in place,
# so a second execution would otherwise raise KeyError because 'Id' is gone.
data = data.drop(columns='Id', errors='ignore')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Separate predictors from the target. Y is selected with a list so it stays
# a one-column DataFrame rather than a Series.
X = data.drop(columns='Outcome')
Y = data.loc[:, ['Outcome']]

In [6]:
# Split the feature columns by dtype: object columns are one-hot encoded,
# every other column is treated as numeric and standardised.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Numerical Pipeline: fill missing values with the column median, then scale.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical Pipeline: fill missing values with the most frequent category,
# one-hot encode, then scale.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' lets transform() accept categories that were
        # absent from the training split instead of raising.
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
        # OneHotEncoder returns a sparse matrix by default, and StandardScaler
        # refuses to centre sparse input; with_mean=False scales without
        # centring so this pipeline works once categorical columns exist.
        ('scaler', StandardScaler(with_mean=False)),
    ]
)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [7]:

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)

In [8]:
# Fit the preprocessor on the training split only, then wrap the transformed
# array in a DataFrame labelled with the preprocessor's output column names.
xtrain = pd.DataFrame(
    data=preprocessor.fit_transform(xtrain),
    columns=preprocessor.get_feature_names_out(),
)
# Apply the already-fitted preprocessor to the test split (no refitting).
xtest = pd.DataFrame(
    data=preprocessor.transform(xtest),
    columns=preprocessor.get_feature_names_out(),
)

In [9]:
# Show the output column names produced by the fitted preprocessor.
preprocessor.get_feature_names_out()

array(['num_pipeline__Pregnancies', 'num_pipeline__Glucose',
       'num_pipeline__BloodPressure', 'num_pipeline__SkinThickness',
       'num_pipeline__Insulin', 'num_pipeline__BMI',
       'num_pipeline__DiabetesPedigreeFunction', 'num_pipeline__Age'],
      dtype=object)

In [10]:
# Preview the first rows of the transformed training features.
xtrain.head()

Unnamed: 0,num_pipeline__Pregnancies,num_pipeline__Glucose,num_pipeline__BloodPressure,num_pipeline__SkinThickness,num_pipeline__Insulin,num_pipeline__BMI,num_pipeline__DiabetesPedigreeFunction,num_pipeline__Age
0,0.965776,-0.469621,-0.453628,0.205004,-0.712578,-0.680596,-0.53575,-0.355539
1,0.665116,-0.870066,-0.961418,0.57308,-0.139397,-0.40803,-0.348215,-0.85902
2,1.266436,-0.931673,0.663508,-1.2673,-0.712578,0.446836,0.373796,2.917089
3,0.063795,0.608499,0.257276,-1.2673,-0.712578,-0.544313,-0.698281,0.56751
4,-0.537525,0.546892,0.308055,-1.2673,-0.712578,-0.7921,-0.938951,-0.355539


In [11]:
# Preview the first rows of the transformed test features.
xtest.head()

Unnamed: 0,num_pipeline__Pregnancies,num_pipeline__Glucose,num_pipeline__BloodPressure,num_pipeline__SkinThickness,num_pipeline__Insulin,num_pipeline__BMI,num_pipeline__DiabetesPedigreeFunction,num_pipeline__Age
0,-1.138846,-0.038373,0.257276,-0.163072,-0.148353,-0.185022,-0.570132,-0.607279
1,-1.138846,1.809834,-0.148955,1.125194,-0.712578,1.239755,4.455812,-0.691193
2,0.063795,1.532603,0.155719,-1.2673,-0.712578,1.437984,0.036232,-0.607279
3,-0.838186,-0.438818,-0.961418,-0.101726,-0.712578,-0.457588,-0.895193,-0.355539
4,1.567097,-1.516939,0.460392,0.26635,-0.712578,-0.048739,-0.58576,0.399683


In [14]:
def model_evaluation(true, predicted):
    """Score a set of predictions against the true labels.

    Args:
        true: Ground-truth labels.
        predicted: Labels predicted by a model, in the same order as `true`.

    Returns:
        A 4-tuple ``(accuracy, confusion_matrix, precision, recall)``, each
        computed by the corresponding scikit-learn metric with its default
        settings.
    """
    return (
        accuracy_score(true, predicted),
        confusion_matrix(true, predicted),
        precision_score(true, predicted),
        recall_score(true, predicted),
    )

In [24]:
# Candidate classifiers, keyed by the display name used in the report below.
models = {
    # Seeded so repeated runs of this cell report the same scores; an unseeded
    # tree can break ties between equally good splits differently on each fit.
    'Decision Tree': DecisionTreeClassifier(random_state=123),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression()
}

# Parallel lists consumed by the results table: model name and test accuracy.
model_list = []
accuracy_list = []

# ytrain is a one-column DataFrame; flatten it once to the 1-D target the
# estimators are fitted on.
ytrain_1d = ytrain.values.flatten()

for model_name, model in models.items():
    model.fit(xtrain, ytrain_1d)  # Train model

    # Make predictions
    y_train_pred = model.predict(xtrain)
    y_test_pred = model.predict(xtest)

    # Evaluate Train and Test dataset
    accuracy_train, confusion_matrix_train, precision_train, recall_train = model_evaluation(ytrain, y_train_pred)

    accuracy_test, confusion_matrix_test, precision_test, recall_test = model_evaluation(ytest, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Train set')
    print("- Accuracy: {:.4f}".format(accuracy_train))
    print("- Confusion Matrix: {}".format(confusion_matrix_train))
    print("- Precision: {:.4f}".format(precision_train))
    print("- Recall: {:.4f}".format(recall_train))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(accuracy_test))
    print("- Confusion Matrix: {}".format(confusion_matrix_test))
    print("- Precision: {:.4f}".format(precision_test))
    print("- Recall: {:.4f}".format(recall_test))
    # Only the test accuracy is kept for the comparison table.
    accuracy_list.append(accuracy_test)

    print('='*35)
    print('\n')
    

Decision Tree
Model performance for Train set
- Accuracy: 1.0000
- Confusion Matrix: [[1271    0]
 [   0  666]]
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9759
- Confusion Matrix: [[537   8]
 [ 12 274]]
- Precision: 0.9716
- Recall: 0.9580


KNN
Model performance for Train set
- Accuracy: 0.9241
- Confusion Matrix: [[1204   67]
 [  80  586]]
- Precision: 0.8974
- Recall: 0.8799
----------------------------------
Model performance for Test set
- Accuracy: 0.8075
- Confusion Matrix: [[476  69]
 [ 91 195]]
- Precision: 0.7386
- Recall: 0.6818


Logistic Regression
Model performance for Train set
- Accuracy: 0.7780
- Confusion Matrix: [[1142  129]
 [ 301  365]]
- Precision: 0.7389
- Recall: 0.5480
----------------------------------
Model performance for Test set
- Accuracy: 0.7750
- Confusion Matrix: [[483  62]
 [125 161]]
- Precision: 0.7220
- Recall: 0.5629




In [23]:
# Rank the models by test-set accuracy, best first.
df_results = pd.DataFrame(
    list(zip(model_list, accuracy_list)),
    columns=['Model Name', 'Accuracy'],
).sort_values(by=["Accuracy"], ascending=False)
# Display the frame assigned above. The cell previously referenced `df_result`,
# a name that is never defined, which raises NameError on a fresh kernel.
df_results

Unnamed: 0,Model Name,Accuracy
0,Decision Tree,0.977136
1,KNN,0.807461
2,Logistic Regression,0.77497
