In [76]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Train and tune a Random Forest on the train/test CSV.
#
# Names defined here and reused by the validation cell: `scaler`, `best_rf`.

# Load the dataset
# NOTE(review): absolute, environment-specific path — consider a configurable data directory.
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

# Define features and target directly from the dataset:
# every column except 'Diabetes Status' is a feature, 'Diabetes Status' is the label.
X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

# Standardize the features. The fitted `scaler` must be reused (transform only)
# on any later data that is passed to `best_rf`.
# NOTE(review): the scaler is fit on all rows *before* cross-validation, so the
# held-out part of each fold contributes to the scaling statistics. Tree-based
# models are generally insensitive to monotonic feature scaling, so the effect
# on the CV score is expected to be small — revisit if the model type changes.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the Random Forest model (fixed seed for reproducibility).
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Hyperparameter grid for tuning (currently a single candidate).
param_grid = {
    'n_estimators': [350],  # Number of trees in the forest
    'max_depth': [9],       # Maximum depth of each tree
}

# GridSearchCV with shuffled 5-fold cross-validation.
# return_train_score=True additionally records accuracy on the training part of
# each fold, so the train-vs-CV gap (overfitting indicator) can be reported.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

# Fit GridSearchCV on the data
grid_search.fit(X, y)

# Best model, refit by GridSearchCV on the full (scaled) data.
best_rf = grid_search.best_estimator_

# Predictions on the SAME data the model was fit on (in-sample predictions).
y_pred = best_rf.predict(X)

# Cross-validated accuracy vs. accuracy on the training folds for the best candidate.
mean_cv_accuracy = grid_search.best_score_
mean_train_accuracy = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]

# Print the best hyperparameters and the mean accuracy from cross-validation
print("Best Parameters for Random Forest:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", mean_cv_accuracy)
print("Mean Training-Fold Accuracy with Best Parameters:", mean_train_accuracy)
print("Train/CV Accuracy Gap (overfitting indicator):", mean_train_accuracy - mean_cv_accuracy)

# Classification report and confusion matrix.
# These are in-sample metrics and are therefore optimistic; the cross-validated
# accuracy above is the estimate of performance on unseen data.
print("NOTE: the report and confusion matrix below are computed on the training data (in-sample).")
print("Random Forest Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters for Random Forest: {'max_depth': 9, 'n_estimators': 350}
Mean Accuracy with Best Parameters: 0.680270574971815
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.78      0.80      2216
           1       0.79      0.84      0.81      2219

    accuracy                           0.81      4435
   macro avg       0.81      0.81      0.81      4435
weighted avg       0.81      0.81      0.81      4435

Confusion Matrix:
[[1724  492]
 [ 356 1863]]


**TODO: Re-run with the mentioned results to check whether the model overfits — use the examples from the stage above.**

In [77]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the tuned Random Forest (`best_rf`) on the separate validation CSV.
# Relies on `scaler` and `best_rf` produced by the training cell.

# File path for validation data
# NOTE(review): absolute, environment-specific path — consider a configurable data directory.
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

### 🎯 VALIDATION PHASE ###
# Load validation data and drop rows containing missing values in one step,
# so re-running the cell always starts from the file contents.
df_valid = pd.read_csv(valid_path).dropna()

# Define features and target
X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

# Standardize validation data with the scaler fitted on the training data
# (transform only — the scaler is not refit here).
X_valid_scaled = scaler.transform(X_valid)

print(f"🧪 RF - Validation dataset shape: {X_valid.shape}")

# Evaluate on validation set
y_pred_valid = best_rf.predict(X_valid_scaled)

# Classification Report
# The model evaluated here is the Random Forest, so the label says RF
# (it previously said "NN", which did not match the model being scored).
print("📌 RF - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

# Confusion Matrix
print("🟦 RF - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


🧪 RF - Validation dataset shape: (335, 9)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       145
           1       0.73      0.82      0.77       190

    accuracy                           0.73       335
   macro avg       0.73      0.71      0.72       335
weighted avg       0.73      0.73      0.72       335

🟦 NN - Validation Confusion Matrix:
[[ 89  56]
 [ 35 155]]


Best: n_estimators=200, max_depth=12 — gives the same results as n_estimators=350, max_depth=12

🧪 RF - Validation dataset shape: (335, 9)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.62      0.68       145
           1       0.74      0.84      0.79       190

    accuracy                           0.74       335
   macro avg       0.74      0.73      0.73       335
weighted avg       0.74      0.74      0.74       335

n_estimators=350, max_depth=9

🟦 NN - Validation Confusion Matrix:
[[ 90  55]
 [ 31 159]]

🧪 RF - Validation dataset shape: (335, 9)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       145
           1       0.73      0.82      0.77       190

    accuracy                           0.73       335
   macro avg       0.73      0.71      0.72       335
weighted avg       0.73      0.73      0.72       335

🟦 NN - Validation Confusion Matrix:
[[ 89  56]
 [ 35 155]]

n_estimators=350, max_depth=4

🧪 RF - Validation dataset shape: (335, 9)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       145
           1       0.74      0.82      0.78       190

    accuracy                           0.73       335
   macro avg       0.73      0.72      0.72       335
weighted avg       0.73      0.73      0.73       335

🟦 NN - Validation Confusion Matrix:
[[ 89  56]
 [ 34 156]]

Joint best: n_estimators=330, max_depth=4

 🧪 RF - Validation dataset shape: (335, 9)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       145
           1       0.74      0.82      0.78       190

    accuracy                           0.73       335
   macro avg       0.73      0.72      0.72       335
weighted avg       0.73      0.73      0.73       335

🟦 NN - Validation Confusion Matrix:
[[ 89  56]
 [ 34 156]]

n_estimators=370, max_depth=5

 🧪 RF - Validation dataset shape: (335, 9)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.60      0.65       145
           1       0.73      0.82      0.77       190

    accuracy                           0.73       335
   macro avg       0.72      0.71      0.71       335
weighted avg       0.72      0.73      0.72       335

🟦 NN - Validation Confusion Matrix:
[[ 87  58]
 [ 34 156]]