In [1]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score


In [2]:
# Read the combined train/test CSV and discard every row containing a missing value.
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path).dropna()

# 'Diabetes Status' is the label; every other column is used as a feature.
y = df['Diabetes Status']
X = df.drop(columns='Diabetes Status')


In [None]:
# Fit the scaler on X first, then apply the fitted scaler to the same X.
# The fitted `scaler` is kept so later data can be transformed with the same statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"NB - Final dataset shape after scaling: {X_scaled.shape}")


📊 NB - Final dataset shape after scaling: (3268, 13)


In [4]:

# Shuffled 5-fold splitter; the fixed random_state keeps the folds identical across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate var_smoothing values, one per power of ten from 1e-9 to 1e-4.
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}

nb = GaussianNB()

# Search every candidate in param_grid, ranking them by cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
    verbose=1,
)


In [5]:

# fit() returns the fitted search object, so the winning estimator is read straight off it.
best_nb = grid_search.fit(X_scaled, y).best_estimator_

print("Best Parameters for Naive Bayes:", grid_search.best_params_)
print("Best Accuracy with Best Parameters:", grid_search.best_score_)


Fitting 5 folds for each of 6 candidates, totalling 30 fits


Best Parameters for Naive Bayes: {'var_smoothing': 1e-09}
Best Accuracy with Best Parameters: 0.7255274409804665


In [None]:

# Predict on the same scaled rows that were passed to grid_search.fit, so the
# figures below describe fit on the training data rather than held-out performance.
y_pred = best_nb.predict(X_scaled)
train_report = classification_report(y, y_pred)
train_confusion = confusion_matrix(y, y_pred)

print("NB Classification Report on Training Data:")
print(train_report)

print("NB - Confusion Matrix on Training Data:")
print(train_confusion)


📌 NB Classification Report on Training Data:
              precision    recall  f1-score   support

           0       0.78      0.64      0.70      1638
           1       0.69      0.81      0.75      1630

    accuracy                           0.73      3268
   macro avg       0.73      0.73      0.73      3268
weighted avg       0.74      0.73      0.73      3268

🟦 NB - Confusion Matrix on Training Data:
[[1056  582]
 [ 306 1324]]


In [None]:

# Read the validation CSV and discard every row containing a missing value.
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"
df_valid = pd.read_csv(valid_path).dropna()

# Separate the 'Diabetes Status' label from the remaining feature columns.
y_valid = df_valid['Diabetes Status']
X_valid = df_valid.drop(columns='Diabetes Status')

# transform() only: reuse the already-fitted scaler instead of refitting on validation data.
X_valid_scaled = scaler.transform(X_valid)

print(f"NB - Validation dataset shape: {X_valid.shape}")


🧪 NB - Validation dataset shape: (364, 13)


In [8]:

# Predict labels for the scaled validation features using the estimator chosen by the grid search.
y_pred_valid = best_nb.predict(X_valid_scaled)


In [None]:

# Compare the validation predictions against the true validation labels.
# The header is a plain string: it has no placeholders, so no f-prefix is needed.
print("NB - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

print("NB - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


📌 NB - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.66      0.72       178
           1       0.72      0.85      0.78       186

    accuracy                           0.76       364
   macro avg       0.76      0.75      0.75       364
weighted avg       0.76      0.76      0.75       364

🟦 NB - Validation Confusion Matrix:
[[117  61]
 [ 28 158]]
