In [5]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

# Drop rows with NaN in any feature column
df = df.dropna()

# Define features and target
X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

# Standardize the features (important for KNN)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the KNN model
knn = KNeighborsClassifier()

# Set up the hyperparameter grid for tuning
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # Weighting function
    'metric': ['euclidean', 'manhattan']  # Distance metric
}

# Use GridSearchCV with KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(knn, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

# Fit GridSearchCV on the data
grid_search.fit(X, y)

# Get the best model
best_knn = grid_search.best_estimator_

# Evaluate the model with the best parameters
y_pred = best_knn.predict(X)

# Print the best hyperparameters and the mean accuracy from cross-validation
print("Best Parameters for KNN:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", grid_search.best_score_)

# Classification report and confusion matrix
print("KNN Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
Mean Accuracy with Best Parameters: 0.7510164299637984
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.75      0.76       380
           1       0.84      0.85      0.84       568

    accuracy                           0.81       948
   macro avg       0.80      0.80      0.80       948
weighted avg       0.81      0.81      0.81       948

Confusion Matrix:
[[286  94]
 [ 88 480]]


In [6]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

print(f"🔍 KNN - Original dataset shape: {df.shape}")

# Drop NaN rows
df = df.dropna()

print(f"🧹 KNN - After dropping NaNs: {df.shape}")

# Define features and target
X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

print(f"📊 KNN - Final dataset shape after scaling: {X.shape}")

# Initialize the KNN model
knn = KNeighborsClassifier()

# Set up the hyperparameter grid for tuning
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  
    'weights': ['uniform', 'distance'],  
    'metric': ['euclidean', 'manhattan']  
}

# Use GridSearchCV with KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(knn, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

# Fit GridSearchCV on the data
grid_search.fit(X, y)

# Get the best model
best_knn = grid_search.best_estimator_

# Evaluate the model
y_pred = best_knn.predict(X)

# Print best hyperparameters & accuracy
print("✅ KNN - Best Parameters:", grid_search.best_params_)
print("✅ KNN - Mean Accuracy:", grid_search.best_score_)

# Classification Report
print("📌 KNN Classification Report:")
print(classification_report(y, y_pred))
print("🟦 KNN Confusion Matrix:")
print(confusion_matrix(y, y_pred))


🔍 KNN - Original dataset shape: (4242, 13)
🧹 KNN - After dropping NaNs: (948, 13)
📊 KNN - Final dataset shape after scaling: (948, 12)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
✅ KNN - Best Parameters: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
✅ KNN - Mean Accuracy: 0.7510164299637984
📌 KNN Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.75      0.76       380
           1       0.84      0.85      0.84       568

    accuracy                           0.81       948
   macro avg       0.80      0.80      0.80       948
weighted avg       0.81      0.81      0.81       948

🟦 KNN Confusion Matrix:
[[286  94]
 [ 88 480]]


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

# File path for validation data
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

### 🎯 VALIDATION PHASE ###
# Load validation data
df_valid = pd.read_csv(valid_path)
df_valid = df_valid.dropna()

# Define features and target
X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

# Standardize validation data (using the same scaler from training)
X_valid_scaled = scaler.transform(X_valid)

print(f"🧪 KNN - Validation dataset shape: {X_valid.shape}")

# Evaluate on validation set
y_pred_valid = best_knn.predict(X_valid_scaled)

# Classification Report
print(f"📌 KNN - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

# Confusion Matrix
print("🟦 KNN - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


🧪 KNN - Validation dataset shape: (124, 12)
📌 KNN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.62      0.66        45
           1       0.80      0.85      0.82        79

    accuracy                           0.77       124
   macro avg       0.75      0.74      0.74       124
weighted avg       0.76      0.77      0.76       124

🟦 KNN - Validation Confusion Matrix:
[[28 17]
 [12 67]]
