In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.metrics import *
from sklearn.model_selection import (
    train_test_split, cross_val_score,cross_validate, StratifiedKFold)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
import optuna
import warnings
warnings.filterwarnings('ignore')

In [3]:
scaler = StandardScaler()
df = pd.read_csv("Hepatitis_C.csv")
X = df.drop(columns='label').to_numpy()
y = df.iloc[:,-1].to_numpy()
print("Data frame shape:",df.shape,"\nFeatures shape:",X.shape,"\nLabels shape",y.shape)
df.head()

Data frame shape: (204, 13) 
Features shape: (204, 12) 
Labels shape (204,)


Unnamed: 0,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,label
0,32,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7,0
1,45,0,41.7,73.2,43.6,29.4,6.4,8.89,5.31,71.0,67.4,70.3,0
2,55,0,41.5,59.5,15.4,16.2,6.8,6.35,5.22,80.0,12.4,69.9,0
3,53,0,37.8,98.1,30.5,21.1,4.0,5.02,4.42,94.0,23.2,65.2,0
4,56,1,39.7,66.0,14.2,20.8,3.5,7.48,5.88,66.0,7.2,67.2,0


In [4]:
X[:,[0,2,3,4,5,6,7,8,9,11]] = scaler.fit_transform(X[:,[0,2,3,4,5,6,7,8,9,11]])

In [5]:
X

array([[-1.49013167e+00,  0.00000000e+00,  3.54355641e-01, ...,
        -7.26193498e-02,  3.38000000e+01,  5.18695961e-01],
       [-2.62474264e-01,  0.00000000e+00,  8.85237553e-02, ...,
        -1.80876191e-01,  6.74000000e+01, -4.51085896e-01],
       [ 6.81877584e-01,  0.00000000e+00,  5.30795039e-02, ...,
        -7.26193498e-02,  1.24000000e+01, -5.22921590e-01],
       ...,
       [ 1.53179425e+00,  1.00000000e+00, -2.16218621e+00, ...,
        -2.32598905e-01,  6.42000000e+01,  1.65010813e+00],
       [-1.68039079e-01,  1.00000000e+00, -1.45330118e+00, ...,
        -4.09418413e-01,  5.00000000e+01, -3.25373434e-01],
       [ 1.05961832e+00,  1.00000000e+00, -9.21637409e-01, ...,
        -2.28990343e-01,  3.40000000e+01, -8.64141132e-01]])

In [8]:
gnb = GaussianNB()
knn = KNeighborsClassifier()
lda = LinearDiscriminantAnalysis()
svm = SVC()

In [9]:
for model in [gnb, knn, lda, svm]:
    params = model.get_params()
    print(f"{model} has {len(params)} tunable parameters:\n {params}\n")

GaussianNB() has 2 tunable parameters:
 {'priors': None, 'var_smoothing': 1e-09}

KNeighborsClassifier() has 8 tunable parameters:
 {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}

LinearDiscriminantAnalysis() has 7 tunable parameters:
 {'covariance_estimator': None, 'n_components': None, 'priors': None, 'shrinkage': None, 'solver': 'svd', 'store_covariance': False, 'tol': 0.0001}

SVC() has 15 tunable parameters:
 {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}



In [34]:
class ClassifierTuner(BaseEstimator):
  def __init__(self,classifier_name, outer_folds= 5, innner_folds = 3, shuffle = True, seed = 42, trial_num = 10, verbose = False, model = None):
    self.inner_cv = StratifiedKFold(n_splits = outer_folds, shuffle = shuffle, random_state = seed)
    self.outer_cv = StratifiedKFold(n_splits = innner_folds, shuffle = shuffle, random_state = seed)
    self.shuffle = shuffle
    self.trial_num = trial_num
    self.classifier_name = classifier_name
    self.verbose = verbose
    self.model = model

  def fit(self, X, y):
    baseline_model = GaussianNB()
    parameters = []
    # Initialize arrays to store  avearge scores for each trial of the outer cross validation loop
    # Create empty list to store hyperparameters for each tuned classifier_name 
    #objective function to be optimized by optuna
    def objective(trial):
      # Inner cross validation loop to tune hyperparameterss
      #Knearst neighbors tuning
      if self.classifier_name == "KNeighborsclassifier":
        n_neighbors = trial.suggest_int("n_neighbors", 2, 15)
        weights = trial.suggest_categorical("weights", ["uniform", "distance"])
        p = trial.suggest_int("p", 1, 2)
        self.model = KNeighborsClassifier(n_neighbors = n_neighbors, weights = weights, p = p)

      #Fit classifier_name and predict on the outer test sets after hyperparameter tuning in the inner cross validation loop
      scores = cross_val_score(self.model, X_train, y_train, cv= self.inner_cv , scoring='f1_weighted')
      return scores.mean()

    # Number of of outer cross validation loops
    for i in range(self.trial_num):
      for train_index, test_index in self.outer_cv.split(X, y):
        #Outer cross validation loop to evade overfitting on a single train/test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # inner cross validation loop to tune hyperparameterss
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=50)
        params = study.best_params
        parameters.append(params)

        #Fit classifier_name and predict on the outer test sets after hyperparameter tuning in the inner cross validation loop
        self.model.set_params(**params)
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        #Fit baseline classifier_name and predict on the outer test sets
        baseline_model.fit(X_train, y_train)
        y_base_pred = baseline_model.predict(X_test)

    #calculate mean scores per outer cross validation trial 
    return parameters
  
  def score(self, y_test, y_pred, average= 'weighted'):
    f1 = f1_score(y_test, y_pred, average = average)
    f2 = fbeta_score(y_test, y_pred, average = average, beta = 2)
    balanced_accuracy = balanced_accuracy_score(y, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average = average)
    precision = precision_score(y_test, y_pred, average = average)
    return np.array(f1, f2, balanced_accuracy, mcc, recall, precision)

  def predict(self, X):
    return self.model.predict(X)

In [35]:
tuner = ClassifierTuner(classifier_name = "KNeighborsclassifier")
tuner.fit(X, y)

[32m[I 2023-04-11 16:15:12,866][0m A new study created in memory with name: no-name-7d8651f4-ad28-4e38-a09b-92d1b97fc20b[0m
[32m[I 2023-04-11 16:15:12,888][0m Trial 0 finished with value: 0.8064728388310657 and parameters: {'n_neighbors': 7, 'weights': 'distance', 'p': 1}. Best is trial 0 with value: 0.8064728388310657.[0m
[32m[I 2023-04-11 16:15:12,924][0m Trial 1 finished with value: 0.8325428894004746 and parameters: {'n_neighbors': 11, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 0.8325428894004746.[0m
[32m[I 2023-04-11 16:15:12,957][0m Trial 2 finished with value: 0.8080414648990499 and parameters: {'n_neighbors': 7, 'weights': 'uniform', 'p': 2}. Best is trial 1 with value: 0.8325428894004746.[0m
[32m[I 2023-04-11 16:15:12,985][0m Trial 3 finished with value: 0.8247821875221255 and parameters: {'n_neighbors': 10, 'weights': 'uniform', 'p': 2}. Best is trial 1 with value: 0.8325428894004746.[0m
[32m[I 2023-04-11 16:15:13,017][0m Trial 4 finished with

KeyboardInterrupt: 