**Train Test and Cross Validation**

In [15]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

scaler = StandardScaler()
X = scaler.fit_transform(X)

rf = RandomForestClassifier(random_state=42, class_weight='balanced')

param_grid = {
    'n_estimators': [700],   
    'max_depth': [4],               
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

grid_search.fit(X, y)

best_rf = grid_search.best_estimator_

y_pred = best_rf.predict(X)

print("Best Parameters for Random Forest:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", grid_search.best_score_)

print("Random Forest Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters for Random Forest: {'max_depth': 4, 'n_estimators': 700}
Mean Accuracy with Best Parameters: 0.6730595619103038
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.66      0.69      1222
           1       0.69      0.74      0.71      1231

    accuracy                           0.70      2453
   macro avg       0.70      0.70      0.70      2453
weighted avg       0.70      0.70      0.70      2453

Confusion Matrix:
[[812 410]
 [322 909]]


**MAKE SURE TO RE RUN WITH MENTIONED RESULTS TO CHECK IF MODEL OVER FITS - USE EXAMPLES OF THE ABOVE STAGE**

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

df_valid = pd.read_csv(valid_path)
df_valid = df_valid.dropna()

X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

X_valid_scaled = scaler.transform(X_valid)

print(f"🧪 RF - Validation dataset shape: {X_valid.shape}")

y_pred_valid = best_rf.predict(X_valid_scaled)

print(f"📌 NN - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

print("🟦 NN - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


🧪 RF - Validation dataset shape: (177, 10)
📌 NN - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.65      0.72        82
           1       0.74      0.86      0.80        95

    accuracy                           0.76       177
   macro avg       0.77      0.75      0.76       177
weighted avg       0.77      0.76      0.76       177

🟦 NN - Validation Confusion Matrix:
[[53 29]
 [13 82]]


Estimators doesnt seem to have change on val data

best is 4 or 5 depth - show less and more than