In [10]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('blood_data.csv')
df.head()

Unnamed: 0,# 1,2,3,4,5,6,7,8,9,10,...,44,45,46,47,48,49,50,51,52,53
0,0.121075,0.912344,1.432111,1.540956,1.252333,0.957594,0.712998,0.798607,1.170393,1.692606,...,2.361702,2.2891,4.362529,1.868435,0.08755,27.734629,0.087037,0.022636,0.810518,6.0
1,0.298066,0.830151,0.960709,1.000123,0.965636,0.819066,0.849858,0.950855,1.154082,1.129449,...,3.327597,7.472283,2.543191,0.130734,0.012256,27.539444,0.127253,0.036147,0.773069,6.0
2,0.196923,1.666643,2.084505,1.855161,1.603003,1.678651,1.546568,0.940188,0.393847,0.157298,...,1.822071,5.098671,3.221859,0.107526,0.01173,24.930386,0.154367,0.054824,0.852725,6.0
3,0.118606,0.361996,1.05016,1.639484,1.858165,1.370149,1.020508,0.775883,0.717815,0.668396,...,1.717295,3.920698,7.626998,0.348129,0.025473,23.268801,0.18843,0.078499,0.830111,6.0
4,0.00917,1.609278,3.320568,2.427671,1.508411,0.986886,0.571958,0.291137,0.153592,0.115767,...,0.96399,2.088302,4.885723,4.324594,0.201431,25.532965,0.141487,0.041315,0.847301,6.0


## Classical Models

### Linear SVM

In [3]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [12]:
# Evaluate model using cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True) # 5 folds
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
precision, recall, f1 = 0, 0, 0
cv_y_test = []
cv_y_pred = []

for train_index, test_index in kf.split(X, y):
    x_train = X.iloc[train_index, :]
    y_train = y[train_index]
    x_test = X.iloc[test_index, :]
    y_test = y[test_index]
    
    clf = make_pipeline(StandardScaler(), SVC(kernel='linear'))
    clf.fit(x_train, y_train)
    
    y_pred = clf.predict(x_test)
    cv_y_test.append(y_test)
    cv_y_pred.append(y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    precision += report['macro avg']['precision']
    recall += report['macro avg']['recall']
    f1 += report['macro avg']['f1-score'] 

In [13]:
accuracy = (precision + recall + f1)/15
print(f'Accuracy: {accuracy}')

Accuracy: 0.9584489221529925


In [14]:
accuracy_score(np.concatenate(cv_y_test), np.concatenate(cv_y_pred))

0.9584993359893759

In [15]:
print(classification_report(np.concatenate(cv_y_test), np.concatenate(cv_y_pred)))

              precision    recall  f1-score   support

         1.0       0.98      0.98      0.98       512
         2.0       0.95      0.98      0.97       500
         3.0       0.99      0.99      0.99       500
         4.0       0.95      0.93      0.94       500
         5.0       0.94      0.95      0.95       500
         6.0       0.94      0.92      0.93       500

    accuracy                           0.96      3012
   macro avg       0.96      0.96      0.96      3012
weighted avg       0.96      0.96      0.96      3012



### Radial SVM

In [16]:
# Evaluate model using cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True) # 5 folds
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
precision, recall, f1 = 0, 0, 0
cv_y_test = []
cv_y_pred = []

for train_index, test_index in kf.split(X, y):
    x_train = X.iloc[train_index, :]
    y_train = y[train_index]
    x_test = X.iloc[test_index, :]
    y_test = y[test_index]
    
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf"))
    clf.fit(x_train, y_train)
    
    y_pred = clf.predict(x_test)
    cv_y_test.append(y_test)
    cv_y_pred.append(y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    precision += report['macro avg']['precision']
    recall += report['macro avg']['recall']
    f1 += report['macro avg']['f1-score'] 

In [17]:
accuracy = (precision + recall + f1)/15
print(f'Accuracy: {accuracy}')

Accuracy: 0.968591944795079


In [18]:
accuracy_score(np.concatenate(cv_y_test), np.concatenate(cv_y_pred))

0.9684594953519257

In [19]:
print(classification_report(np.concatenate(cv_y_test), np.concatenate(cv_y_pred)))

              precision    recall  f1-score   support

         1.0       0.97      0.98      0.98       512
         2.0       0.98      0.98      0.98       500
         3.0       1.00      0.99      0.99       500
         4.0       0.98      0.94      0.96       500
         5.0       0.94      0.96      0.95       500
         6.0       0.94      0.96      0.95       500

    accuracy                           0.97      3012
   macro avg       0.97      0.97      0.97      3012
weighted avg       0.97      0.97      0.97      3012



## Multilayer Perceptron

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [21]:
parameter_space = {
    'hidden_layer_sizes': [
        (100,), (150,), (200,),  # Single layer
        (100, 100), (150, 150), (200, 200),  # Two layers
    ]
}

In [22]:
# model = make_pipeline(
#     StandardScaler(), 
#     MLPClassifier()
# )

# mlp_gs = GridSearchCV(
#     model,
#     parameter_space,
#     cv=5, # cross validation
#     n_jobs=-1
# )

model = make_pipeline(StandardScaler(), 
    GridSearchCV(
        MLPClassifier(),
        param_grid=parameter_space,
        cv=5,
        n_jobs=-1,
        scoring='accuracy',
        error_score="raise"
    )
)

In [23]:
model.fit(X, y)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=5, error_score='raise',
                              estimator=MLPClassifier(), n_jobs=-1,
                              param_grid={'hidden_layer_sizes': [(100,), (150,),
                                                                 (200,),
                                                                 (100, 100),
                                                                 (150, 150),
                                                                 (200, 200)]},
                              scoring='accuracy'))])

In [24]:
model.named_steps['gridsearchcv'].cv_results_

{'mean_fit_time': array([ 6.99189134, 10.33562341, 11.65983524, 10.7730361 , 17.01300802,
        15.87358489]),
 'std_fit_time': array([0.1601561 , 1.92037031, 1.61904275, 1.92443524, 1.69453558,
        0.80502674]),
 'mean_score_time': array([0.00892572, 0.00745711, 0.00705447, 0.00752578, 0.01272726,
        0.        ]),
 'std_score_time': array([0.00399482, 0.00497319, 0.00075304, 0.00610817, 0.00637961,
        0.        ]),
 'param_hidden_layer_sizes': masked_array(data=[(100,), (150,), (200,), (100, 100), (150, 150),
                    (200, 200)],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'hidden_layer_sizes': (100,)},
  {'hidden_layer_sizes': (150,)},
  {'hidden_layer_sizes': (200,)},
  {'hidden_layer_sizes': (100, 100)},
  {'hidden_layer_sizes': (150, 150)},
  {'hidden_layer_sizes': (200, 200)}],
 'split0_test_score': array([0.98175788, 0.97844113, 0.97844113, 0.97512438, 0.98175788,
    

In [25]:
model.named_steps['gridsearchcv'].best_params_

{'hidden_layer_sizes': (100,)}

In [26]:
model.named_steps['gridsearchcv'].best_score_

0.9707800972986671