**Train/Test and Cross-Validation**

In [10]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score


In [11]:
# Read the combined train/test table from the project's cleaned-data folder.
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)
print(f"AB - Original dataset shape: {df.shape}")

# Discard every row that contains a missing value, then report the new size
# so the two printed shapes can be compared.
df = df.dropna()
print(f"AB - After dropping NaNs: {df.shape}")

# 'Diabetes Status' is the prediction target; every other column is a feature.
y = df['Diabetes Status']
X = df.drop(columns='Diabetes Status')


AB - Original dataset shape: (3268, 14)
AB - After dropping NaNs: (3268, 14)


In [None]:
# Fit the standardiser on the feature table, then apply it to the same table.
# The fitted `scaler` object is kept so other data can be transformed with
# the statistics learned here.
# NOTE(review): the scaler is fitted on all rows before the K-fold search, so
# each fold's held-out rows have contributed to the scaling statistics; wrap
# scaler + model in a Pipeline if strict fold isolation is required.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"AB - Final dataset shape after scaling: {X_scaled.shape}")


AB - Final dataset shape after scaling: (3268, 13)


In [13]:
# Hyper-parameter candidates: 4 ensemble sizes x 4 learning rates = 16 combinations.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
}

# Shuffled 5-fold splitter; the fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Base model whose settings the search will tune.
ab = AdaBoostClassifier(random_state=42)

# Score every combination by F1 on each fold, using all available cores.
grid_search = GridSearchCV(
    estimator=ab,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    n_jobs=-1,
    verbose=1,
)


In [14]:
# Run the cross-validated search over the scaled features and the target.
grid_search.fit(X_scaled, y)

# Report the winning parameter combination and its mean F1 across the folds.
print(f"AB - Best Parameters: {grid_search.best_params_}")
print(f"K-Fold Mean F1-Score with Best Parameters: {grid_search.best_score_:.4f}")

# Keep a handle on the estimator carrying the winning parameters; the cells
# that follow use it for prediction.
best_ab = grid_search.best_estimator_


Fitting 5 folds for each of 16 candidates, totalling 80 fits
AB - Best Parameters: {'learning_rate': 0.2, 'n_estimators': 200}
K-Fold Mean F1-Score with Best Parameters: 0.7674


In [15]:
# Predict on the same scaled rows the search was fitted on, so the figures
# printed here describe the fit to the training data rather than unseen data.
y_pred = best_ab.predict(X_scaled)

# Each call prints its heading and its metric on separate lines.
print("AB Classification Report on Training Data:", classification_report(y, y_pred), sep="\n")
print("AB - Confusion Matrix on Training Data:", confusion_matrix(y, y_pred), sep="\n")


AB Classification Report on Training Data:
              precision    recall  f1-score   support

           0       0.82      0.64      0.72      1638
           1       0.70      0.86      0.77      1630

    accuracy                           0.75      3268
   macro avg       0.76      0.75      0.74      3268
weighted avg       0.76      0.75      0.74      3268

AB - Confusion Matrix on Training Data:
[[1041  597]
 [ 227 1403]]


In [16]:
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"

# Load the validation table and drop incomplete rows in a single step.
df_valid = pd.read_csv(valid_path).dropna()

# Same target/feature split as used for the training table.
y_valid = df_valid['Diabetes Status']
X_valid = df_valid.drop(columns='Diabetes Status')

# Transform only (no refit): the validation features are scaled with the
# statistics already held by the existing `scaler`.
X_valid_scaled = scaler.transform(X_valid)

print(f"AB - Validation dataset shape: {X_valid.shape}")


AB - Validation dataset shape: (364, 13)


In [17]:
# Predict labels for the scaled validation features with the tuned estimator
# selected by the grid search.
y_pred_valid = best_ab.predict(X_valid_scaled)


In [18]:
# Per-class precision, recall and F1 on the validation rows.
# The heading is a plain string literal: it has no placeholders, so the
# f-string prefix was unnecessary (pyflakes F541). Output is unchanged.
print("AB - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

# Confusion matrix for the same predictions (rows: actual labels,
# columns: predicted labels).
print("AB - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))


AB - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.62      0.71       178
           1       0.71      0.88      0.78       186

    accuracy                           0.75       364
   macro avg       0.77      0.75      0.75       364
weighted avg       0.77      0.75      0.75       364

AB - Validation Confusion Matrix:
[[111  67]
 [ 23 163]]
