In [1]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# --- Load the train/test data and split it into features and target ---
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
# Drop incomplete rows up front, the same way the validation cell prepares its
# data, so training and validation are run on the same kind of rows.
df = pd.read_csv(data_path).dropna()

X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

# --- Standardise the features ---
# `scaler` stays at notebook scope: the validation cell reuses it to transform
# unseen data with the statistics learned here.
# NOTE(review): the scaler is fitted on every row *before* cross-validation, so
# each fold's held-out rows contribute to the scaling statistics. Tree splits
# depend only on the ordering of feature values, so this should not move the CV
# scores for this model — confirm, and wrap scaler + model in a Pipeline before
# reusing this pattern with a scale-sensitive estimator.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# --- Model and hyper-parameter search space ---
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
    # Single value only (already set on `dt`): it is not actually searched,
    # it just keeps the setting visible in `best_params_`.
    'class_weight': ['balanced']
}

# 5-fold shuffled CV; the fixed seed keeps the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(dt, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

grid_search.fit(X, y)

# With the default refit=True this is the best configuration refitted on ALL rows of X.
best_dt = grid_search.best_estimator_

# Predictions on the very rows the model was just fitted on. These are in-sample
# (resubstitution) numbers, not an estimate of generalisation; the unbiased
# figure from this cell is `grid_search.best_score_` (mean held-out-fold accuracy).
y_pred = best_dt.predict(X)

print("Best Parameters for Decision Tree:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", grid_search.best_score_)


# Labelled explicitly as in-sample so the report is not mistaken for a test result.
print("Decision Tree Classification Report (training data, in-sample):")
print(classification_report(y, y_pred))
print("Confusion Matrix (training data, in-sample):")
print(confusion_matrix(y, y_pred))


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Best Parameters for Decision Tree: {'class_weight': 'balanced', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Mean Accuracy with Best Parameters: 0.8114057603840257
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      2138
           1       0.83      0.79      0.81      2104

    accuracy                           0.82      4242
   macro avg       0.82      0.82      0.82      4242
weighted avg       0.82      0.82      0.82      4242

Confusion Matrix:
[[1805  333]
 [ 433 1671]]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# File path for validation data
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

### 🎯 VALIDATION PHASE ###
# Load the validation data and discard rows with missing values in one step,
# so re-running the cell never operates on a half-processed frame.
df_valid = pd.read_csv(valid_path).dropna()

# Define features and target
X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

# Standardize validation data with the scaler fitted in the training cell.
# Only transform() is called here: re-fitting on validation rows would leak
# their statistics into the preprocessing.
X_valid_scaled = scaler.transform(X_valid)

print(f"🧪 DT - Validation dataset shape: {X_valid.shape}")

# Evaluate the tuned decision tree from the training cell on the validation set
y_pred_valid = best_dt.predict(X_valid_scaled)

# Classification Report
print("📌 DT - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

# Confusion Matrix (label fixed: this notebook evaluates the Decision Tree, not the NN)
print("🟦 DT - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


🧪 NN - Validation dataset shape: (124, 12)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.98      0.83        45
           1       0.98      0.78      0.87        79

    accuracy                           0.85       124
   macro avg       0.85      0.88      0.85       124
weighted avg       0.89      0.85      0.86       124

🟦 NN - Validation Confusion Matrix:
[[44  1]
 [17 62]]
