**Train Test and Cross Validation**

In [2]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

scaler = StandardScaler()
X = scaler.fit_transform(X)

dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

param_grid = {
    'max_depth': [7],
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(dt, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

grid_search.fit(X, y)

best_dt = grid_search.best_estimator_

y_pred = best_dt.predict(X)

print("Best Parameters for Decision Tree:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", grid_search.best_score_)


print("Decision Tree Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters for Decision Tree: {'max_depth': 7}
Mean Accuracy with Best Parameters: 0.6188034188034188
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.65      0.73       580
           1       0.72      0.88      0.79       590

    accuracy                           0.77      1170
   macro avg       0.78      0.76      0.76      1170
weighted avg       0.78      0.77      0.76      1170

Confusion Matrix:
[[378 202]
 [ 72 518]]


**Model used on validation data**

In [3]:
from sklearn.metrics import classification_report, confusion_matrix

valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

df_valid = pd.read_csv(valid_path)
df_valid = df_valid.dropna()

X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

X_valid_scaled = scaler.transform(X_valid)

print(f"🧪 DT - Validation dataset shape: {X_valid.shape}")

y_pred_valid = best_dt.predict(X_valid_scaled)

print(f"📌 DT - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

print("🟦 NN - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


🧪 DT - Validation dataset shape: (92, 11)
📌 DT - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.50      0.64        42
           1       0.69      0.94      0.80        50

    accuracy                           0.74        92
   macro avg       0.78      0.72      0.72        92
weighted avg       0.78      0.74      0.72        92

🟦 NN - Validation Confusion Matrix:
[[21 21]
 [ 3 47]]


**Results**

best - 6 depth

🧪 DT - Validation dataset shape: (335, 9)
📌 DT - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.56      0.64       145
           1       0.72      0.86      0.78       190

    accuracy                           0.73       335
   macro avg       0.73      0.71      0.71       335
weighted avg       0.73      0.73      0.72       335

🟦 NN - Validation Confusion Matrix:
[[ 81  64]
 [ 27 163]]

5 depth

🧪 DT - Validation dataset shape: (335, 9)
📌 DT - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.61      0.65       145
           1       0.73      0.80      0.76       190

    accuracy                           0.72       335
   macro avg       0.72      0.71      0.71       335
weighted avg       0.72      0.72      0.72       335

🟦 NN - Validation Confusion Matrix:
[[ 89  56]
 [ 38 152]]

4 depth

🧪 DT - Validation dataset shape: (335, 9)
📌 DT - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.63      0.65       145
           1       0.73      0.78      0.76       190

    accuracy                           0.71       335
   macro avg       0.71      0.70      0.70       335
weighted avg       0.71      0.71      0.71       335

🟦 NN - Validation Confusion Matrix:
[[ 91  54]
 [ 42 148]]

3 depth

🧪 DT - Validation dataset shape: (335, 9)
📌 DT - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.55      0.62       145
           1       0.71      0.83      0.77       190

    accuracy                           0.71       335
   macro avg       0.71      0.69      0.69       335
weighted avg       0.71      0.71      0.70       335

🟦 NN - Validation Confusion Matrix:
[[ 80  65]
 [ 32 158]]

 7 depth

 🧪 DT - Validation dataset shape: (335, 9)
📌 DT - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.70      0.66       145
           1       0.75      0.69      0.72       190

    accuracy                           0.69       335
   macro avg       0.69      0.69      0.69       335
weighted avg       0.70      0.69      0.69       335

🟦 NN - Validation Confusion Matrix:
[[101  44]
 [ 59 131]]

dt rf knn nb lr ab svm nn