In [1]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score


In [2]:

data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

print(f"SVM - Original dataset shape: {df.shape}")

df = df.dropna()

print(f"SVM - After dropping NaNs: {df.shape}")

X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']


SVM - Original dataset shape: (3268, 14)
SVM - After dropping NaNs: (3268, 14)


In [3]:

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"SVM - Final dataset shape after scaling: {X_scaled.shape}")


SVM - Final dataset shape after scaling: (3268, 13)


In [4]:
svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

svm_kfold_scores = cross_val_score(svm, X_scaled, y, cv=kf, scoring='accuracy')

print(f"SVM - K-Fold Mean Accuracy: {svm_kfold_scores.mean():.4f}")


SVM - K-Fold Mean Accuracy: 0.7423


In [5]:

svm.fit(X_scaled, y)

y_pred = svm.predict(X_scaled)

print("SVM Classification Report on Training Data:")
print(classification_report(y, y_pred))

print("SVM - Confusion Matrix on Training Data:")
print(confusion_matrix(y, y_pred))


SVM Classification Report on Training Data:
              precision    recall  f1-score   support

           0       0.83      0.68      0.75      1638
           1       0.73      0.86      0.79      1630

    accuracy                           0.77      3268
   macro avg       0.78      0.77      0.77      3268
weighted avg       0.78      0.77      0.77      3268

SVM - Confusion Matrix on Training Data:
[[1111  527]
 [ 226 1404]]


In [6]:
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
    'kernel': ['rbf', 'linear']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=kf, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search.fit(X_scaled, y)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[CV] END ...............................C=0.0001, kernel=rbf; total time=   0.5s
[CV] END ...............................C=0.0001, kernel=rbf; total time=   0.5s
[CV] END ...............................C=0.0001, kernel=rbf; total time=   0.6s
[CV] END ...............................C=0.0001, kernel=rbf; total time=   0.6s
[CV] END ............................C=0.0001, kernel=linear; total time=   0.3s
[CV] END ...............................C=0.0001, kernel=rbf; total time=   0.6s
[CV] END ............................C=0.0001, kernel=linear; total time=   0.4s
[CV] END ............................C=0.0001, kernel=linear; total time=   0.4s
[CV] END ............................C=0.0001, kernel=linear; total time=   0.4s
[CV] END ............................C=0.0001, kernel=linear; total time=   0.4s
[CV] END ................................C=0.001, kernel=rbf; total time=   0.5s
[CV] END ................................C=0.001, kernel=rbf; total time=   0.5s
[CV] END ...................

In [7]:
valid_path = "/workspaces/Final-Year-Project/Cleaned Data/ValidationData.csv"
df_valid = pd.read_csv(valid_path)

df_valid = df_valid.dropna()

X_valid = df_valid.drop('Diabetes Status', axis=1)
y_valid = df_valid['Diabetes Status']

X_valid_scaled = scaler.transform(X_valid)

print(f"SVM - Validation dataset shape: {X_valid.shape}")


SVM - Validation dataset shape: (364, 13)


In [8]:
y_pred_valid = svm.predict(X_valid_scaled)


In [9]:

best_svm = grid_search.best_estimator_

y_pred_valid = best_svm.predict(X_valid_scaled)

print(f"SVM (Best Params) - Validation Classification Report:")
print(classification_report(y_valid, y_pred_valid))

print("SVM (Best Params) - Validation Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred_valid))



SVM (Best Params) - Validation Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.66      0.73       178
           1       0.73      0.87      0.79       186

    accuracy                           0.76       364
   macro avg       0.77      0.76      0.76       364
weighted avg       0.77      0.76      0.76       364

SVM (Best Params) - Validation Confusion Matrix:
[[117  61]
 [ 25 161]]
