**Train Test and Cross Validation**

In [10]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

print(f"🔍 LR - Original dataset shape: {df.shape}")

df = df.dropna()

print(f"🧹 LR - After dropping NaNs: {df.shape}")

X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"📊 LR - Final dataset shape after scaling: {X_scaled.shape}")

lr = LogisticRegression(max_iter=80, random_state=42)

param_grid = {
    'C': [0.01],
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(lr, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

grid_search.fit(X_scaled, y)

best_lr = grid_search.best_estimator_

y_pred = best_lr.predict(X_scaled)

print(f"✅ LR - Best Parameters: {grid_search.best_params_}")
print(f"✅ LR - K-Fold Mean Accuracy: {grid_search.best_score_:.4f}")
print("📌 LR Classification Report:")
print(classification_report(y, y_pred))
print("🟦 LR Confusion Matrix:")
print(confusion_matrix(y, y_pred))


🔍 LR - Original dataset shape: (2453, 11)
🧹 LR - After dropping NaNs: (1691, 11)
📊 LR - Final dataset shape after scaling: (1691, 10)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
✅ LR - Best Parameters: {'C': 0.01}
✅ LR - K-Fold Mean Accuracy: 0.6789
📌 LR Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.55      0.61       757
           1       0.69      0.79      0.74       934

    accuracy                           0.69      1691
   macro avg       0.69      0.67      0.67      1691
weighted avg       0.69      0.69      0.68      1691

🟦 LR Confusion Matrix:
[[417 340]
 [192 742]]


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

df_valid = pd.read_csv(valid_path)
df_valid = df_valid.dropna()

X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

X_valid_scaled = scaler.transform(X_valid)

print(f"🧪 LR - Validation dataset shape: {X_valid.shape}")

y_pred_valid = best_lr.predict(X_valid_scaled)

print(f"📌 LR - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

print("🟦 LR - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


🧪 LR - Validation dataset shape: (177, 10)
📌 LR - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73        82
           1       0.75      0.84      0.80        95

    accuracy                           0.77       177
   macro avg       0.77      0.76      0.76       177
weighted avg       0.77      0.77      0.77       177

🟦 LR - Validation Confusion Matrix:
[[56 26]
 [15 80]]
