**Train Test and Cross Validation**

In [1]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score


In [2]:
# Load training data
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

# Display the original shape of the dataset
print(f"🔍 LR - Original dataset shape: {df.shape}")

# Drop missing values
df = df.dropna()

# Display the shape after dropping NaNs
print(f"🧹 LR - After dropping NaNs: {df.shape}")

# Features and target variable
X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']


🔍 LR - Original dataset shape: (3268, 14)
🧹 LR - After dropping NaNs: (3268, 14)


In [3]:
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Display the shape after scaling
print(f"📊 LR - Final dataset shape after scaling: {X_scaled.shape}")


📊 LR - Final dataset shape after scaling: (3268, 13)


In [4]:
# Logistic Regression Classifier setup
lr = LogisticRegression(max_iter=80, random_state=42)

param_grid = {
    #'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'max_iter': [100, 200, 500],  # Maximum iterations
}

# KFold cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV setup with accuracy as the scoring parameter
grid_search = GridSearchCV(lr, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)


In [5]:
# Fit GridSearchCV on the training data
grid_search.fit(X_scaled, y)

# Best model from GridSearchCV
best_lr = grid_search.best_estimator_

# Print the best hyperparameters and the best score
print(f"✅ LR - Best Parameters: {grid_search.best_params_}")
print(f"✅ LR - K-Fold Mean Accuracy: {grid_search.best_score_:.4f}")


Fitting 5 folds for each of 3 candidates, totalling 15 fits
✅ LR - Best Parameters: {'max_iter': 100}
✅ LR - K-Fold Mean Accuracy: 0.7448


In [6]:
# Predictions on training data
y_pred = best_lr.predict(X_scaled)

# Classification report on the training data
print("📌 LR Classification Report on Training Data:")
print(classification_report(y, y_pred))

# Confusion matrix on the training data
print("🟦 LR - Confusion Matrix on Training Data:")
print(confusion_matrix(y, y_pred))


📌 LR Classification Report on Training Data:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73      1638
           1       0.72      0.82      0.77      1630

    accuracy                           0.75      3268
   macro avg       0.76      0.75      0.75      3268
weighted avg       0.76      0.75      0.75      3268

🟦 LR - Confusion Matrix on Training Data:
[[1109  529]
 [ 289 1341]]


In [7]:
# Load validation data
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"
df_valid = pd.read_csv(valid_path)

# Drop any rows with missing values in the validation set
df_valid = df_valid.dropna()

# Prepare features and target for validation data
X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

# Scale the validation data using the previously fitted scaler
X_valid_scaled = scaler.transform(X_valid)

# Display the shape of the validation dataset
print(f"🧪 LR - Validation dataset shape: {X_valid.shape}")


🧪 LR - Validation dataset shape: (364, 13)


In [8]:
# Predictions on validation data
y_pred_valid = best_lr.predict(X_valid_scaled)


In [9]:
# Print classification report for validation data
print(f"📌 LR - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

# Print confusion matrix for validation data
print("🟦 LR - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


📌 LR - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73       178
           1       0.73      0.83      0.78       186

    accuracy                           0.76       364
   macro avg       0.76      0.75      0.75       364
weighted avg       0.76      0.76      0.75       364

🟦 LR - Validation Confusion Matrix:
[[121  57]
 [ 32 154]]


In [10]:
import pandas as pd
from sklearn.metrics import classification_report

# Assuming you already have y_valid (true labels) and y_pred_valid (predicted labels)
# Example classification report for AdaBoost (AB)
report = classification_report(y_valid, y_pred_valid, output_dict=True)

# Extract weighted scores
weighted_precision = report['weighted avg']['precision']
weighted_recall = report['weighted avg']['recall']
weighted_f1 = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# Create a dictionary with the scores
ab_results = {
    'Model': 'Logistic Regression (LR)',
    'Precision': weighted_precision,
    'Recall': weighted_recall,
    'F1-Score': weighted_f1,
    'Accuracy': accuracy
}

# Convert the dictionary to a DataFrame
ab_df = pd.DataFrame([ab_results])

# Create a DataFrame to store results (if not already created)
results_df = pd.read_csv("/workspaces/Final-Year-Project/Results/model_results.csv")

# Concatenate the new row to the DataFrame
results_df = pd.concat([results_df, ab_df], ignore_index=True)

# Print the DataFrame to confirm it's added
print(results_df)

# Save the DataFrame to a CSV file
results_df.to_csv("/workspaces/Final-Year-Project/Results/model_results.csv", index=False)


                               Model  Precision    Recall  F1-Score  Accuracy
0                      AdaBoost (AB)   0.767212  0.752747  0.748389  0.752747
1                 Neural Network(NN)   0.762241  0.755495  0.753238  0.755495
2                  Decision Tree(DT)   0.748498  0.741758  0.739233  0.741758
3  K Nearest Nearest Neighbours(KNN)   0.723548  0.722527  0.721802  0.722527
4                 Neural Network(NN)   0.775001  0.769231  0.767458  0.769231
5                  Decision Tree(DT)   0.748498  0.741758  0.739233  0.741758
6  K Nearest Nearest Neighbours(KNN)   0.723548  0.722527  0.721802  0.722527
7           Logistic Regression (LR)   0.759684  0.755495  0.753959  0.755495
8           Logistic Regression (LR)   0.759684  0.755495  0.753959  0.755495
9           Logistic Regression (LR)   0.759684  0.755495  0.753959  0.755495


In [11]:
# Recalculate Macro average F1-score on validation data
validation_f1_macro = f1_score(y_valid, y_pred_valid, average='macro')
print(f"🏆 Best Model Validation F1-Score: {validation_f1_macro}")


🏆 Best Model Validation F1-Score: 0.7534682322859513
