In [1]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score 
from sklearn.neighbors import KNeighborsClassifier



In [2]:
# Load the training/test CSV and discard every row that has a missing value.
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path).dropna()

# Target is the 'Diabetes Status' column; every other column is a feature.
y = df['Diabetes Status']
X = df.drop(columns='Diabetes Status')


In [3]:

# Standardise the feature columns and keep the fitted scaler so the same
# transform can be applied to the validation data later.
#
# The input is taken from `df` rather than from `X` itself: with
# `X = scaler.fit_transform(X)`, running this cell a second time would refit
# the scaler on already-standardised values, and the later
# `scaler.transform(X_valid)` would then leave the validation features
# effectively unscaled. Deriving from `df` makes the cell safe to re-run.
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(columns='Diabetes Status'))


In [4]:

# Hyper-parameter search for KNN over odd neighbourhood sizes 5, 7, ..., 19.
param_grid = {'n_neighbors': list(range(5, 20, 2))}

# 5-fold split, shuffled with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

knn = KNeighborsClassifier()

# NOTE(review): X has already been standardised on all rows before this
# search runs, so the held-out rows of every fold contributed to the scaling
# statistics. Consider wrapping scaler + KNN in a Pipeline - confirm whether
# this affects the reported CV accuracy.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
    verbose=1,
)


In [5]:

# Run the grid search on the scaled features; fit() returns the search object
# itself, so the best estimator can be read straight off the call.
best_knn = grid_search.fit(X, y).best_estimator_

# Show the winning parameter combination.
print(grid_search.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


{'n_neighbors': 15}


In [6]:
# Predict on the same X the search was fitted on, so the report and matrix
# below are training-set figures (as their headings state), not held-out ones.
y_pred = best_knn.predict(X)

# Search summary: chosen parameters and their mean cross-validated accuracy.
print("Best Parameters for KNN:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", grid_search.best_score_)

# Heading and body are emitted on separate lines via sep="\n".
print(
    "KNN Classification Report on Training Data:",
    classification_report(y, y_pred),
    sep="\n",
)
print(
    "KNN - Confusion Matrix on Training Data:",
    confusion_matrix(y, y_pred),
    sep="\n",
)


Best Parameters for KNN: {'n_neighbors': 15}
Mean Accuracy with Best Parameters: 0.7322552697266438
KNN Classification Report on Training Data:
              precision    recall  f1-score   support

           0       0.80      0.70      0.75      1638
           1       0.73      0.82      0.78      1630

    accuracy                           0.76      3268
   macro avg       0.77      0.76      0.76      3268
weighted avg       0.77      0.76      0.76      3268

KNN - Confusion Matrix on Training Data:
[[1148  490]
 [ 287 1343]]


In [7]:

# Load the validation CSV and discard every row that has a missing value.
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"
df_valid = pd.read_csv(valid_path).dropna()

# Same target/feature split as the training data.
y_valid = df_valid['Diabetes Status']
X_valid = df_valid.drop(columns='Diabetes Status')

# Apply the already-fitted scaler (transform only, no refit) so validation
# features are scaled with the training statistics.
X_valid_scaled = scaler.transform(X_valid)


In [8]:
# Predict validation labels with the tuned KNN, using the features scaled by
# the previously fitted scaler.
y_pred_valid = best_knn.predict(X_valid_scaled)


In [None]:
# Report per-class precision/recall/F1 and the confusion matrix for the
# validation predictions. The heading is a plain string: the original used an
# f-string with no placeholders, which prints the same text.
print("KNN - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

print("KNN - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


KNN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.67      0.70       178
           1       0.71      0.77      0.74       186

    accuracy                           0.72       364
   macro avg       0.72      0.72      0.72       364
weighted avg       0.72      0.72      0.72       364

KNN - Validation Confusion Matrix:
[[120  58]
 [ 43 143]]
