In [14]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

df = df.dropna()

# Define features and target
X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Naïve Bayes classifier
nb = GaussianNB()

### K-Fold Cross Validation ###
kf = KFold(n_splits=5, shuffle=True, random_state=42)
nb_kfold_scores = cross_val_score(nb, X_scaled, y, cv=kf, scoring='accuracy')

print(f"Naïve Bayes (K-fold) Mean Accuracy: {nb_kfold_scores.mean():.4f}")

### Train-Test Splitting ###
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train and evaluate
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Evaluation metrics
print(f"Naïve Bayes (Splitting) Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Naïve Bayes Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Naïve Bayes (K-fold) Mean Accuracy: 0.7795
Naïve Bayes (Splitting) Accuracy: 0.8000
Naïve Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76        78
           1       0.84      0.81      0.83       112

    accuracy                           0.80       190
   macro avg       0.79      0.80      0.79       190
weighted avg       0.80      0.80      0.80       190

Confusion Matrix:
[[61 17]
 [21 91]]


In [15]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

print(f"🔍 NB - Original dataset shape: {df.shape}")

# Drop rows with NaN
df = df.dropna()

print(f"🧹 NB - After dropping NaNs: {df.shape}")

# Define features and target
X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"📊 NB - Final dataset shape after scaling: {X_scaled.shape}")

# Initialize Naïve Bayes model
nb = GaussianNB()

# Set up hyperparameter grid
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# Use GridSearchCV with K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(nb, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_scaled, y)

# Get the best model
best_nb = grid_search.best_estimator_

# Evaluate on the full dataset
y_pred = best_nb.predict(X_scaled)

# Print best parameters & results
print(f"✅ NB - Best Parameters: {grid_search.best_params_}")
print(f"✅ NB - K-Fold Mean Accuracy: {grid_search.best_score_:.4f}")
print("📌 NB Classification Report:")
print(classification_report(y, y_pred))
print("🟦 NB Confusion Matrix:")
print(confusion_matrix(y, y_pred))


🔍 NB - Original dataset shape: (4242, 13)
🧹 NB - After dropping NaNs: (948, 13)
📊 NB - Final dataset shape after scaling: (948, 12)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
✅ NB - Best Parameters: {'var_smoothing': 1e-09}
✅ NB - K-Fold Mean Accuracy: 0.7795
📌 NB Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.79      0.75       380
           1       0.85      0.78      0.81       568

    accuracy                           0.78       948
   macro avg       0.78      0.79      0.78       948
weighted avg       0.79      0.78      0.79       948

🟦 NB Confusion Matrix:
[[302  78]
 [126 442]]


In [16]:
from sklearn.metrics import classification_report, confusion_matrix

# File path for validation data
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

### 🎯 VALIDATION PHASE ###
# Load validation data
df_valid = pd.read_csv(valid_path)
df_valid = df_valid.dropna()

# Define features and target
X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

# Standardize validation data (using the same scaler from training)
X_valid_scaled = scaler.transform(X_valid)

print(f"🧪 NB - Validation dataset shape: {X_valid.shape}")

# Evaluate on validation set
y_pred_valid = best_nb.predict(X_valid_scaled)

# Classification Report
print(f"📌 NB - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

# Confusion Matrix
print("🟦 NB - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


🧪 NB - Validation dataset shape: (124, 12)
📌 NB - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.78      0.77        45
           1       0.87      0.86      0.87        79

    accuracy                           0.83       124
   macro avg       0.82      0.82      0.82       124
weighted avg       0.83      0.83      0.83       124

🟦 NB - Validation Confusion Matrix:
[[35 10]
 [11 68]]
