# KNN - Predict whether a person will have diabetes or not

# Import all dependencies

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Load diabetes.csv

In [19]:
df = pd.read_csv('diabetes.csv')

## Examine the dataset

In [None]:
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nBasic info:")
print(df.info())
print("\nStatistic describe:")
print(df.describe())
print("\nTarget variable distribution:")
print(df['Outcome'].value_counts())

Dataset shape: (768, 9)

Basic info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

Statistic describe:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.8

## Data processing

In [23]:
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

data = df.copy()

### 1. Compute the mean 
### 2. replace the above columns' value with the mean.

In [24]:
for column in zero_not_accepted:
    mean_value = data[data[column] != 0][column].mean()
    data[column] = data[column].replace(0, mean_value)


### Split the dataset with Test and Train with test_size=0.2

In [25]:
scaler = StandardScaler()
X = data.drop('Outcome', axis=1)
y = data['Outcome']

X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)


### Train the model

In [26]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

### Predict the model with test data

In [27]:
y_pred = knn.predict(X_test)

print("10 result:\t", y_pred[:10])
print("Label:\t\t", y_test.values[:10])

10 result:	 [1 0 0 1 0 0 0 1 0 1]
Label:		 [0 0 0 1 0 0 1 1 0 0]


### Evaluate the model

In [28]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

print("\nClassification report:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:")
print(cm)


Accuracy: 0.7403

Classification report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       100
           1       0.63      0.61      0.62        54

    accuracy                           0.74       154
   macro avg       0.71      0.71      0.71       154
weighted avg       0.74      0.74      0.74       154


Confusion matrix:
[[81 19]
 [21 33]]
