**Train/Test and Cross-Validation**

In [4]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [5]:
# Read the train/test split from disk into a DataFrame
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

# Separate the label column ('Diabetes Status') from the predictor columns
y = df['Diabetes Status']
X = df.drop(columns='Diabetes Status')


In [6]:
# Standardise the predictors: learn per-column mean/std from X, then apply
# them to X. The fitted `scaler` is kept so the same statistics can be
# applied to other data later without refitting.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [7]:
# Base estimator: a decision tree with a fixed seed so repeated runs build the
# same tree. class_weight='balanced' weights each class inversely to its
# frequency in the labels passed to fit().
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# Hyperparameter grid searched exhaustively by GridSearchCV
# (5 depths x 3 split thresholds = 15 candidates)
param_grid = {
    'max_depth': [3, 4, 5, 6, 7],  # Different depths to test
    'min_samples_split': [2, 5, 10],
}

# Stratified, shuffled 5-fold cross-validation. Stratification keeps the class
# ratio of y (approximately) the same in every fold, so the per-fold F1 scores
# are comparable; plain KFold can produce folds with skewed class proportions.
# The name `kf` is kept so later cells that reference it keep working.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search over param_grid using the folds above, all CPU cores (n_jobs=-1)
# and progress logging (verbose=1).
# NOTE: scoring='f1' is the binary F1 of the positive class (label 1). The
# notebook's final metric is macro F1; switch to scoring='f1_macro' if the
# model should be selected on that same metric.
grid_search = GridSearchCV(dt, param_grid, scoring='f1', cv=kf, n_jobs=-1, verbose=1)


In [8]:
# Run the cross-validated grid search on the scaled training data and keep the
# winning configuration. fit() returns the search object itself, and
# best_estimator_ is the best candidate refit on all of (X_scaled, y).
best_dt = grid_search.fit(X_scaled, y).best_estimator_


Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [9]:
# Read the held-out validation split from disk
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"
df_valid = pd.read_csv(valid_path)

# Separate the label column from the predictor columns
y_valid = df_valid['Diabetes Status']
X_valid = df_valid.drop(columns='Diabetes Status')

# Apply the scaling statistics learned on the training data; the scaler is
# only transformed with here, never refit on validation data
X_valid_scaled = scaler.transform(X_valid)


In [10]:
# Class labels predicted by the tuned tree for the scaled validation features
y_pred_valid = best_dt.predict(X=X_valid_scaled)


In [11]:
# Per-class precision, recall, F1 and support on the validation set.
# print() is used because classification_report returns a pre-formatted
# multi-line string.
print("📌 DT - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

# Confusion matrix for the validation set: rows are true labels, columns are
# predicted labels.
print("🟦 DT - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


📌 DT - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.65      0.71       178
           1       0.71      0.83      0.77       186

    accuracy                           0.74       364
   macro avg       0.75      0.74      0.74       364
weighted avg       0.75      0.74      0.74       364

🟦 DT - Validation Confusion Matrix:
[[115  63]
 [ 31 155]]


In [12]:
# Macro-averaged F1 on the validation set: the unweighted mean of the
# per-class F1 scores, reported at full floating-point precision
validation_f1_macro = f1_score(
    y_true=y_valid,
    y_pred=y_pred_valid,
    average='macro',
)
print(f"🏆 Macro F1-Score on Validation Data: {validation_f1_macro}")


🏆 Macro F1-Score on Validation Data: 0.738601637941572
