<a href="https://colab.research.google.com/github/Meowmixforme/Kaggle_Machine_Learning_Projects/blob/main/Project%203%20Diabetes%20Risk%20Prediction/Diabetes_Risk_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Diabetes Risk Prediction

Author: James Fothergill

Import packages

In [41]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

Import dataset

In [42]:
data = pd.read_csv('diabetes_data_upload.csv')

In [43]:
data

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


Preprocessing steps

In [44]:
{column: len(data[column].unique()) for column in data.columns}

{'Age': 51,
 'Gender': 2,
 'Polyuria': 2,
 'Polydipsia': 2,
 'sudden weight loss': 2,
 'weakness': 2,
 'Polyphagia': 2,
 'Genital thrush': 2,
 'visual blurring': 2,
 'Itching': 2,
 'Irritability': 2,
 'delayed healing': 2,
 'partial paresis': 2,
 'muscle stiffness': 2,
 'Alopecia': 2,
 'Obesity': 2,
 'class': 2}

In [45]:
def preprocess_inputs(df):
    df = df.copy()

    # Binary-encode Gender column
    df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1}).infer_objects(copy=False)

    # Binary-encode the symptom columns
    for column in df.columns.drop(['Age', 'Gender', 'class']):
        df[column] = df[column].replace({'No': 0, 'Yes': 1}).infer_objects(copy=False)

    # Split df into X and y
    y = df['class']
    X = df.drop('class', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [46]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [47]:
X_train

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity
122,-0.658902,0.740902,-0.994521,1.129159,-0.846747,0.841974,1.104315,-0.560428,-0.870893,1.044966,1.682730,1.068259,1.148247,1.333651,-0.745356,2.165064
168,-0.913060,0.740902,-0.994521,-0.885615,-0.846747,0.841974,-0.905539,-0.560428,-0.870893,-0.956969,-0.594273,1.068259,-0.870893,-0.749821,1.341641,-0.461880
23,0.018852,0.740902,-0.994521,1.129159,1.180990,0.841974,-0.905539,-0.560428,1.148247,1.044966,1.682730,1.068259,-0.870893,-0.749821,-0.745356,-0.461880
13,1.120204,0.740902,1.005510,1.129159,1.180990,0.841974,1.104315,1.784351,1.148247,1.044966,-0.594273,-0.936103,-0.870893,-0.749821,1.341641,2.165064
61,-1.082499,-1.349706,1.005510,1.129159,1.180990,0.841974,1.104315,-0.560428,1.148247,1.044966,-0.594273,1.068259,1.148247,1.333651,-0.745356,-0.461880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,0.018852,0.740902,1.005510,1.129159,1.180990,0.841974,-0.905539,-0.560428,-0.870893,1.044966,-0.594273,-0.936103,1.148247,-0.749821,1.341641,-0.461880
144,1.713239,0.740902,1.005510,1.129159,-0.846747,-1.187685,1.104315,-0.560428,1.148247,1.044966,1.682730,-0.936103,1.148247,-0.749821,-0.745356,-0.461880
72,1.459081,-1.349706,-0.994521,-0.885615,-0.846747,-1.187685,-0.905539,1.784351,-0.870893,-0.956969,-0.594273,-0.936103,-0.870893,-0.749821,-0.745356,-0.461880
235,-1.844973,0.740902,-0.994521,-0.885615,-0.846747,-1.187685,-0.905539,-0.560428,-0.870893,-0.956969,-0.594273,-0.936103,-0.870893,-0.749821,-0.745356,-0.461880


In [48]:
y_train

Unnamed: 0,class
122,Positive
168,Positive
23,Positive
13,Positive
61,Positive
...,...
129,Positive
144,Positive
72,Positive
235,Negative


Training

In [49]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.




                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.


Results

In [50]:
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))

                   Logistic Regression: 92.31%
                   K-Nearest Neighbors: 90.38%
                         Decision Tree: 96.15%
Support Vector Machine (Linear Kernel): 92.31%
   Support Vector Machine (RBF Kernel): 95.51%
                        Neural Network: 96.79%
                         Random Forest: 98.08%
                     Gradient Boosting: 98.08%
