In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import (ConfusionMatrixDisplay, auc,
                             balanced_accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support,
                             roc_curve)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import joblib
from tqdm import tqdm

%matplotlib inline
# Root of the data folder, relative to the notebook; the training cells read
# the `processed/*.npz` feature files from underneath it.
path = '../data/'

# Hyper-parameter grid passed to `fit_model`, which forwards it unchanged to
# GridSearchCV. Every key is fully crossed with every other key. Values given
# here take precedence over the ones set on the SVC constructed inside
# `fit_model` (e.g. `tol`).
parameters = dict(
    kernel=('linear', 'rbf', 'poly', 'sigmoid'),
    C=[1, 10, 100, 200, 400],
    degree=[1, 2, 3, 4, 5],
    gamma=[0.001, 0.01, 0.1, 1, 10, 'auto', 'scale'],
    probability=[True],
    class_weight=['balanced', None],
    tol=[1e-3],
)

In [2]:
def fit_model(X, Y, parameters, label_encoder=None):
  """Grid-search an SVC inside every fold of a stratified 5-fold CV.

  For each outer fold a ``GridSearchCV`` (using the same ``StratifiedKFold``
  splitter for its inner search) is fitted on the fold's training part and
  scored on the held-out part. One ROC curve per fold and the mean ROC curve
  are drawn on a single figure.

  Args:
    X: Feature matrix; rows are selected with NumPy integer-array indexing.
    Y: Encoded class labels aligned with ``X``. The ROC curve is built from
      column 1 of ``predict_proba``, so the problem is treated as binary.
    parameters: Parameter grid forwarded to ``GridSearchCV``.
    label_encoder: Optional fitted encoder exposing ``inverse_transform``,
      used to turn the class ids found in ``Y`` into display labels for the
      confusion matrices. When omitted, the notebook-level ``le`` is used if
      it exists (the previous implementation relied on that global); if it
      does not exist the raw class ids are used as labels.

  Returns:
    Tuple ``(model, fig, cms, bas, prfs)``:
      model: the ``GridSearchCV`` object as left by the LAST fold, i.e. it
        was fitted on that fold's training part only, not on all of ``X``.
      fig: matplotlib figure holding the ROC curves.
      cms: one ``ConfusionMatrixDisplay`` per fold.
      bas: balanced accuracy per fold.
      prfs: ``precision_recall_fscore_support(average=None)`` result per fold.
  """
  cms = []
  bas = []
  prfs = []
  tprs = []

  # Common FPR grid so the per-fold TPR curves can be averaged point-wise.
  mean_fpr = np.linspace(0, 1, 100)

  # Resolve display labels once from the labels that were actually passed in.
  if label_encoder is None:
    # Backward compatibility with callers that only define a global `le`.
    label_encoder = globals().get('le')
  class_ids = np.unique(Y)
  if label_encoder is not None:
    display_labels = label_encoder.inverse_transform(class_ids)
  else:
    display_labels = class_ids

  # Constructor values act as defaults; any key present in `parameters`
  # is overridden by the grid search.
  svc = SVC(tol=1e-5, probability=True)
  strtfdKFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  model = GridSearchCV(
      svc,
      parameters,
      n_jobs=-1,
      cv=strtfdKFold,
      return_train_score=True,
      verbose=0,
  )

  fig = plt.figure(figsize=[10, 10])
  ax = fig.add_subplot(111)

  folds = tqdm(strtfdKFold.split(X, Y),
               total=strtfdKFold.get_n_splits(),
               desc="k-fold")
  for index, (train, test) in enumerate(folds, start=1):
    # Fit the grid search on this fold's training part only.
    model.fit(X[train], Y[train])

    # Hard predictions and class probabilities for the held-out part.
    y_pred = model.predict(X[test])
    prediction = model.predict_proba(X[test])

    bas.append(balanced_accuracy_score(Y[test], y_pred))
    prfs.append(
        precision_recall_fscore_support(Y[test], y_pred, average=None))

    # Pin the label order so the matrix always lines up with display_labels,
    # even if a class happens to be missing from a fold.
    cm = confusion_matrix(Y[test], y_pred, labels=class_ids)
    cms.append(
        ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=display_labels))

    # ROC of the class stored in column 1 of predict_proba.
    fpr, tpr, _ = roc_curve(Y[test], prediction[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr,
            tpr,
            lw=2,
            alpha=0.3,
            label='ROC fold %d (AUC = %0.2f)' % (index, roc_auc))

  # Chance diagonal for reference.
  ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')

  # Mean ROC over folds, evaluated on the common FPR grid.
  mean_tpr = np.mean(tprs, axis=0)
  mean_auc = auc(mean_fpr, mean_tpr)
  ax.plot(mean_fpr,
          mean_tpr,
          color='blue',
          label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),
          lw=2,
          alpha=1)
  ax.set_xlabel('False Positive Rate')
  ax.set_ylabel('True Positive Rate')
  ax.set_title('ROC')
  ax.legend(loc="lower right")

  return model, fig, cms, bas, prfs

In [3]:
# Train, evaluate and persist one grid-searched SVM per LBP radius R,
# using the plain LBP feature files.
feature_tag = 'lbp'
reports_dir = os.path.join('../reports', 'parte2')
models_dir = '../models'
# Create the output folders up front so a long grid search is not lost to a
# missing directory when the figures / model are written.
os.makedirs(reports_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

for R in range(1, 4):
  print(f"{'='*100}")
  print(f"{' '*30}Train for LBP with R={R}:")
  print(f"{'='*100}")

  # NOTE(review): allow_pickle=True unpickles arbitrary objects on load;
  # only use it with feature files produced by this project.
  train_npz = np.load(os.path.join(path, 'processed',
                                   f'train_{feature_tag}_R_{R}.npz'),
                      allow_pickle=True)
  val_npz = np.load(os.path.join(path, 'processed',
                                 f'val_{feature_tag}_R_{R}.npz'),
                    allow_pickle=True)
  X_train, Y_train = train_npz['X_train'], train_npz['Y_train']
  X_val, Y_val = val_npz['X_val'], val_npz['Y_val']

  # Learn the label -> id mapping on the training labels only and reuse it
  # for validation; re-fitting on Y_val could silently produce a different
  # mapping. `transform` raises if Y_val holds a label unseen in training.
  # The name `le` is kept because fit_model looks up this notebook-level
  # encoder for its confusion-matrix labels.
  le = LabelEncoder()
  Y_train = le.fit_transform(Y_train)
  Y_val = le.transform(Y_val)
  class_ids = np.arange(len(le.classes_))

  model, fig, cms, bas, prfs = fit_model(X_train, Y_train, parameters)

  fig.savefig(os.path.join(reports_dir, f'ROC_{feature_tag}_R_{R}.png'))
  plt.close(fig)

  # Hold-out evaluation; its confusion matrix goes after the per-fold ones.
  y_pred = model.predict(X_val)
  cm = confusion_matrix(Y_val, y_pred, labels=class_ids)
  cms.append(
      ConfusionMatrixDisplay(confusion_matrix=cm,
                             display_labels=le.classes_))

  # Rows: precision, recall, fscore, support; columns: one per class.
  prf_mean = np.mean(prfs, axis=0)
  prf_std = np.std(prfs, axis=0)

  print(f"{'='*100}")
  print(f"{' '*30}Result for LBP with R={R}:")
  print(f"{'='*100}")
  print(f'Best parameters: {model.best_params_}')
  print()
  print(f"{' '*30}Result for Cross Validation:")
  print(f"Mean balanced accuracy: {np.mean(bas)} +/- {np.std(bas)}")
  print(f"Mean precision: {prf_mean[0]} +/- {prf_std[0]}")
  print(f"Mean recall: {prf_mean[1]} +/- {prf_std[1]}")
  print(f"Mean fscore: {prf_mean[2]} +/- {prf_std[2]}")

  print(f"{' '*30}Report for validation set:")
  print(
      classification_report(Y_val,
                            y_pred,
                            digits=4,
                            labels=class_ids,
                            target_names=le.classes_))

  # One panel per CV fold plus a final panel for the validation set.
  n_folds = len(cms) - 1
  fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
  for index, (disp, ax) in enumerate(zip(cms, axes.flat), start=1):
    disp.plot(ax=ax, cmap=plt.get_cmap('Blues'))
    if index <= n_folds:
      ax.set_title(f'Confusion Matrix of Fold {index}')
    else:
      ax.set_title('Confusion Matrix of Validation set')

  fig.tight_layout()
  fig.savefig(os.path.join(reports_dir, f'CM_{feature_tag}_R_{R}.png'))
  plt.close(fig)
  joblib.dump(model,
              os.path.join(models_dir, f'model_{feature_tag}_R_{R}.joblib'))


                              Train for LBP with R=1:


k-fold: 100%|██████████| 5/5 [06:36<00:00, 79.26s/it]


                              Result for LBP with R=1:
Best parameters: {'C': 100, 'class_weight': None, 'degree': 5, 'gamma': 10, 'kernel': 'poly', 'probability': True, 'tol': 1e-05}

                              Result for Cross Validation:
Mean balanced accuracy: 0.9451900351325456 +/- 0.017407359292000667
Mean precision: [0.90013108 0.99357077] +/- [0.0304816  0.00853424]
Mean recall: [0.99354839 0.89683168] +/- [0.00860215 0.03573366]
Mean fscore: [0.94421921 0.94229184] +/- [0.01681306 0.01965035]
                              Report for validation set:
              precision    recall  f1-score   support

         AVC     0.8906    0.9913    0.9383       115
          EM     0.9911    0.8880    0.9367       125

    accuracy                         0.9375       240
   macro avg     0.9408    0.9397    0.9375       240
weighted avg     0.9429    0.9375    0.9375       240

                              Train for LBP with R=2:


k-fold: 100%|██████████| 5/5 [24:11<00:00, 290.29s/it]


                              Result for LBP with R=2:
Best parameters: {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'gamma': 10, 'kernel': 'poly', 'probability': True, 'tol': 1e-05}

                              Result for Cross Validation:
Mean balanced accuracy: 0.9322351162364004 +/- 0.022854827110199708
Mean precision: [0.8796121  0.99085324] +/- [0.03540385 0.00890595]
Mean recall: [0.99142073 0.8730495 ] +/- [0.00804059 0.0416345 ]
Mean fscore: [0.93184617 0.92777888] +/- [0.0216784  0.02582056]
                              Report for validation set:
              precision    recall  f1-score   support

         AVC     0.8769    0.9913    0.9306       115
          EM     0.9909    0.8720    0.9277       125

    accuracy                         0.9292       240
   macro avg     0.9339    0.9317    0.9291       240
weighted avg     0.9363    0.9292    0.9291       240

                              Train for LBP with R=3:


k-fold: 100%|██████████| 5/5 [24:02<00:00, 288.55s/it]


                              Result for LBP with R=3:
Best parameters: {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'gamma': 10, 'kernel': 'poly', 'probability': True, 'tol': 1e-05}

                              Result for Cross Validation:
Mean balanced accuracy: 0.9274351588213042 +/- 0.02236923175432798
Mean precision: [0.86862184 0.99528825] +/- [0.03567833 0.00577668]
Mean recall: [0.9957218  0.85914851] +/- [0.00523982 0.04212322]
Mean fscore: [0.92749135 0.92172899] +/- [0.02128271 0.02531736]
                              Report for validation set:
              precision    recall  f1-score   support

         AVC     0.8571    0.9913    0.9194       115
          EM     0.9907    0.8480    0.9138       125

    accuracy                         0.9167       240
   macro avg     0.9239    0.9197    0.9166       240
weighted avg     0.9267    0.9167    0.9165       240



In [4]:
# Train, evaluate and persist one grid-searched SVM per LBP radius R,
# using the masked LBP feature files.
feature_tag = 'masked_lbp'
reports_dir = os.path.join('../reports', 'parte2')
models_dir = '../models'
# Create the output folders up front so a long grid search is not lost to a
# missing directory when the figures / model are written.
os.makedirs(reports_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

for R in range(1, 4):
  # NOTE(review): the headings below still say "LBP" although this cell uses
  # the masked features; the text is kept unchanged to match recorded output.
  print(f"{'='*100}")
  print(f"{' '*30}Train for LBP with R={R}:")
  print(f"{'='*100}")

  # NOTE(review): allow_pickle=True unpickles arbitrary objects on load;
  # only use it with feature files produced by this project.
  train_npz = np.load(os.path.join(path, 'processed',
                                   f'train_{feature_tag}_R_{R}.npz'),
                      allow_pickle=True)
  val_npz = np.load(os.path.join(path, 'processed',
                                 f'val_{feature_tag}_R_{R}.npz'),
                    allow_pickle=True)
  X_train, Y_train = train_npz['X_train'], train_npz['Y_train']
  X_val, Y_val = val_npz['X_val'], val_npz['Y_val']

  # Learn the label -> id mapping on the training labels only and reuse it
  # for validation; re-fitting on Y_val could silently produce a different
  # mapping. `transform` raises if Y_val holds a label unseen in training.
  # The name `le` is kept because fit_model looks up this notebook-level
  # encoder for its confusion-matrix labels.
  le = LabelEncoder()
  Y_train = le.fit_transform(Y_train)
  Y_val = le.transform(Y_val)
  class_ids = np.arange(len(le.classes_))

  model, fig, cms, bas, prfs = fit_model(X_train, Y_train, parameters)

  fig.savefig(os.path.join(reports_dir, f'ROC_{feature_tag}_R_{R}.png'))
  plt.close(fig)

  # Hold-out evaluation; its confusion matrix goes after the per-fold ones.
  y_pred = model.predict(X_val)
  cm = confusion_matrix(Y_val, y_pred, labels=class_ids)
  cms.append(
      ConfusionMatrixDisplay(confusion_matrix=cm,
                             display_labels=le.classes_))

  # Rows: precision, recall, fscore, support; columns: one per class.
  prf_mean = np.mean(prfs, axis=0)
  prf_std = np.std(prfs, axis=0)

  print(f"{'='*100}")
  print(f"{' '*30}Result for LBP with R={R}:")
  print(f"{'='*100}")
  print(f'Best parameters: {model.best_params_}')
  print()
  print(f"{' '*30}Result for Cross Validation:")
  print(f"Mean balanced accuracy: {np.mean(bas)} +/- {np.std(bas)}")
  print(f"Mean precision: {prf_mean[0]} +/- {prf_std[0]}")
  print(f"Mean recall: {prf_mean[1]} +/- {prf_std[1]}")
  print(f"Mean fscore: {prf_mean[2]} +/- {prf_std[2]}")

  print(f"{' '*30}Report for validation set:")
  print(
      classification_report(Y_val,
                            y_pred,
                            digits=4,
                            labels=class_ids,
                            target_names=le.classes_))

  # One panel per CV fold plus a final panel for the validation set.
  n_folds = len(cms) - 1
  fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
  for index, (disp, ax) in enumerate(zip(cms, axes.flat), start=1):
    disp.plot(ax=ax, cmap=plt.get_cmap('Blues'))
    if index <= n_folds:
      ax.set_title(f'Confusion Matrix of Fold {index}')
    else:
      ax.set_title('Confusion Matrix of Validation set')

  fig.tight_layout()
  fig.savefig(os.path.join(reports_dir, f'CM_{feature_tag}_R_{R}.png'))
  plt.close(fig)
  joblib.dump(model,
              os.path.join(models_dir, f'model_{feature_tag}_R_{R}.joblib'))


                              Train for LBP with R=1:


k-fold: 100%|██████████| 5/5 [10:21<00:00, 124.27s/it]


                              Result for LBP with R=1:
Best parameters: {'C': 1, 'class_weight': 'balanced', 'degree': 1, 'gamma': 0.001, 'kernel': 'linear', 'probability': True, 'tol': 0.001}

                              Result for Cross Validation:
Mean balanced accuracy: 0.5381563183085638 +/- 0.02090545939724406
Mean precision: [0.71178756 0.54039465] +/- [0.10258593 0.01012161]
Mean recall: [0.1179364  0.95837624] +/- [0.03357311 0.01306955]
Mean fscore: [0.20161626 0.69106903] +/- [0.05354782 0.01081599]
                              Report for validation set:
              precision    recall  f1-score   support

         AVC     0.7059    0.1043    0.1818       115
          EM     0.5381    0.9600    0.6897       125

    accuracy                         0.5500       240
   macro avg     0.6220    0.5322    0.4357       240
weighted avg     0.6185    0.5500    0.4463       240

                              Train for LBP with R=2:


k-fold: 100%|██████████| 5/5 [11:23<00:00, 136.64s/it]


                              Result for LBP with R=2:
Best parameters: {'C': 1, 'class_weight': 'balanced', 'degree': 1, 'gamma': 0.001, 'kernel': 'linear', 'probability': True, 'tol': 0.001}

                              Result for Cross Validation:
Mean balanced accuracy: 0.5228683854658629 +/- 0.02420270550595991
Mean precision: [0.66575758 0.53181456] +/- [0.15809239 0.01115874]
Mean recall: [0.08140014 0.96433663] +/- [0.03376802 0.01833586]
Mean fscore: [0.14437356 0.68554181] +/- [0.05627356 0.01353139]
                              Report for validation set:
              precision    recall  f1-score   support

         AVC     0.6923    0.0783    0.1406       115
          EM     0.5330    0.9680    0.6875       125

    accuracy                         0.5417       240
   macro avg     0.6127    0.5231    0.4141       240
weighted avg     0.6094    0.5417    0.4255       240

                              Train for LBP with R=3:


k-fold: 100%|██████████| 5/5 [13:50<00:00, 166.15s/it]


                              Result for LBP with R=3:
Best parameters: {'C': 1, 'class_weight': 'balanced', 'degree': 1, 'gamma': 0.001, 'kernel': 'linear', 'probability': True, 'tol': 0.001}

                              Result for Cross Validation:
Mean balanced accuracy: 0.5241025548677036 +/- 0.028678206256100198
Mean precision: [0.65987762 0.53254266] +/- [0.18071563 0.01379872]
Mean recall: [0.08782887 0.96037624] +/- [0.03744967 0.02340494]
Mean fscore: [0.15437869 0.68514109] +/- [0.06229523 0.01707241]
                              Report for validation set:
              precision    recall  f1-score   support

         AVC     0.7500    0.0783    0.1417       115
          EM     0.5351    0.9760    0.6912       125

    accuracy                         0.5458       240
   macro avg     0.6425    0.5271    0.4165       240
weighted avg     0.6381    0.5458    0.4279       240

