# KNN - Predict whether a person will have diabetes or not

# Import all dependencies

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load diabetes.csv

In [26]:
df = pd.read_csv('diabetes.csv')


## Examine the dataset

In [27]:
df.dropna()
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Data processing

In [28]:
# Replace zeroes
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for col in zero_not_accepted:
    df[col] = df[col].replace(0, np.nan)
    mean_value = df[col].mean()
    df[col] = df[col].fillna(mean_value)
    
print(df[zero_not_accepted].describe())
df.head()


          Glucose  BloodPressure  SkinThickness         BMI     Insulin
count  768.000000     768.000000     768.000000  768.000000  768.000000
mean   121.686763      72.405184      29.153420   32.457464  155.548223
std     30.435949      12.096346       8.790942    6.875151   85.021108
min     44.000000      24.000000       7.000000   18.200000   14.000000
25%     99.750000      64.000000      25.000000   27.500000  121.500000
50%    117.000000      72.202592      29.153420   32.400000  155.548223
75%    140.250000      80.000000      32.000000   36.600000  155.548223
max    199.000000     122.000000      99.000000   67.100000  846.000000


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


### 1. Compute the mean 
### 2. replace the above columns' value with the mean.

### Split the dataset with Test and Train with test_size=0.2

In [29]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train the model

In [30]:
max_k = -1
max_accuracy = -1
for n in range (1,23):
    knn = KNeighborsClassifier(n_neighbors=n) 
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    if accuracy > max_accuracy:
        max_accuracy = accuracy
        max_k = n

### Predict the model with test data

In [31]:
knn = KNeighborsClassifier(n_neighbors=max_k) 
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

### Evaluate the model

In [32]:
print("Final accuracy:", accuracy_score(y_test, y_pred))

Final accuracy: 0.7532467532467533
