<a href="https://colab.research.google.com/github/Iteba/Data-Science-Practice/blob/main/Diabetes_Prediction_Logistic_Regression_Practice/Diabetes_Prediction_K_Nearest_Neighbor_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Raw GitHub location of the CSV file used throughout this notebook.
url = 'https://raw.githubusercontent.com/Iteba/Data-Science-Practice/refs/heads/main/Diabetes_Prediction_Logistic_Regression_Practice/diabetes2.csv'
# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(url)

# Exploration

In [5]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
# Per-column count of NaN/None entries; a literal 0 is not counted as missing here.
df.isna().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


# Preprocessing

In [4]:
# Separate the target column from the predictor columns.
y = df['Outcome']
X = df.drop(columns='Outcome')

## Missing Values

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Mean-strategy imputer.
# NOTE(review): this object is not referenced again in the cells shown here.
imputer = SimpleImputer(strategy='mean')

In [10]:
# Columns in which a value of 0 is treated as a missing reading.
non_zero_columns = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

In [12]:
# Treat zeros in the listed columns as missing and fill them with the mean
# of the remaining (non-zero) values of the same column.
# NOTE(review): the means are computed on the full X, before the train/test
# split, so held-out rows contribute to the fill values.
for feature in non_zero_columns:
    observed = X[feature].replace(0, np.nan)  # zeros become NaN
    # Series.mean() skips NaN by default, so only real readings are averaged.
    X[feature] = observed.replace(np.nan, observed.mean())

## Data Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Feature Scaling

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Standardising scaler; it is fitted in a later cell on the training split.
scaler = StandardScaler()

In [23]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data never influences the fit.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Training

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
# k-nearest-neighbours classifier: 11 neighbours, Euclidean distance.
model = KNeighborsClassifier(
    metric='euclidean',
    n_neighbors=11,
)

In [26]:
# Fit the classifier on the scaled training features and their labels.
model.fit(X=X_train, y=y_train)

# Evaluation

## Predictions

In [27]:
# Predict a class label for every row of the scaled test split.
predictions = model.predict(X=X_test)

In [28]:
# Echo the predicted labels for the test split as the cell output.
predictions

array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0])

## Confusion Matrix

In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report

In [32]:
# Tabulate true labels (rows) against predicted labels (columns) for the test split.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[83, 13],
       [14, 44]])

## Accuracy Score

In [34]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [37]:
# Report accuracy as a percentage. Scale to percent *before* rounding:
# rounding first and then multiplying by 100 can reintroduce binary
# floating-point noise (e.g. round(0.57, 4) * 100 == 56.99999999999999).
print('Model Accuracy: ', round(accuracy * 100, 2), '%')
# Per-class precision, recall, F1 and support for the same predictions.
print(classification_report(y_test, predictions))

Model Accuracy:  82.47 %
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        96
           1       0.77      0.76      0.77        58

    accuracy                           0.82       154
   macro avg       0.81      0.81      0.81       154
weighted avg       0.82      0.82      0.82       154

