# Diploma thesis
## Breast cancer classification using machine learning methods
### Feature selection with Minimum Redundancy & Maximum Relevance method

> Lazaros Panitsidis<br />
> Department of Production and Management Engineering <br />
> International Hellenic University <br />
> lazarospanitsidis@outlook.com

## Contents
1. [Useful Python Libraries](#1)
1. [Data Processing](#2)
1. [Gaussian Naive Bayes](#3)
1. [Linear Discriminant Analysis](#4)
1. [Quadratic Discriminant Analysis](#5)
1. [Ridge Classifier](#6)
1. [Decision Tree Classifier](#7)
1. [Random Forest Classifier](#8)
1. [ADA Boost Classifier (Adaptive Boosting)](#9)
1. [C-Support Vector Classification](#10)
1. [Stochastic Gradient Descent Classifier](#11)
1. [eXtreme Gradient Boosting](#12)
1. [Light Gradient Boosting Machine](#13)
1. [K-Nearest Neighbors Classifier](#14)
1. [Multi-layer Perceptron Classifier](#15)
1. [Summary](#16)

<a id='1'></a>
## 1) Useful Python Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import scipy.stats as stats
import matplotlib.pyplot as plt
import time
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
#import warnings library
import warnings
# ignore all warnings
warnings.filterwarnings('ignore')
# Any results you write to the current directory are saved as output.

# some of them are not used in this file
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE, RFECV , mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV , LeaveOneOut,KFold,RandomizedSearchCV
from skopt import BayesSearchCV # https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV , https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score , make_scorer , classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
from sklearn.preprocessing import StandardScaler , LabelEncoder
from xgboost import XGBClassifier , plot_importance
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier , RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm
from sklearn.neural_network import MLPClassifier
import pygad

<a id='2'></a>
## 2) Data Processing

In [2]:
dataWISC = pd.read_csv('dataWisc.csv')
dataWISC.drop(["id", "Unnamed: 32"], axis = 1, inplace = True)

# Undersampling function
def make_undersample(_df, column):
  dfs_r = {}
  dfs_c = {}
  smaller = 1e1000
  ignore = ""
  for c in _df[column].unique():
    dfs_c[c] = _df[_df[column] == c]
    if dfs_c[c].shape[0] < smaller:
      smaller = dfs_c[c].shape[0]
      ignore = c

  for c in dfs_c:
    if c == ignore:
      continue
    dfs_r[c] = resample(dfs_c[c], 
                        replace=False, # sample without replacement
                        n_samples=smaller,
                        random_state=0)
  return pd.concat([dfs_r[c] for c in dfs_r] + [dfs_c[ignore]])

dataWISC = make_undersample(dataWISC,'diagnosis')

#Description of the dataset

#how many cases are included in the dataset
length = len(dataWISC)
#how many features are in the dataset
features = dataWISC.shape[1]-1 # - diagnosis

# Number of malignant cases
malignant = len(dataWISC[dataWISC['diagnosis']=='M'])

#Number of benign cases
benign = len(dataWISC[dataWISC['diagnosis']=='B'])

#Rate of malignant tumors over all cases
rate = (float(malignant)/(length))*100

print ("There are "+ str(len(dataWISC))+" cases in this dataset")
print ("There are {}".format(features)+" features in this dataset")
print ("There are {}".format(malignant)+" cases diagnosed as malignant tumor")
print ("There are {}".format(benign)+" cases diagnosed as benign tumor")
print ("The percentage of malignant cases is: {:.2f}%".format(rate))

There are 424 cases in this dataset
There are 30 features in this dataset
There are 212 cases diagnosed as malignant tumor
There are 212 cases diagnosed as benign tumor
The percentage of malignant cases is: 50.00%


In [3]:
y = dataWISC.diagnosis                          # M or B 
x = dataWISC.drop('diagnosis',axis = 1 )
target_names=['Benign','Malignant']
# x_scaled = (x - x.mean())/x.std()
le= LabelEncoder()
le.fit(y)
y_le = le.transform(y)

In [12]:
# MRMR features
x_new = x[['area_mean',
 'fractal_dimension_worst',
 'concavity_worst',
 'area_se',
 'smoothness_worst',
 'texture_mean',
 'concave points_se']]
x_new.head()

Unnamed: 0,area_mean,fractal_dimension_worst,concavity_worst,area_se,smoothness_worst,texture_mean,concave points_se
49,561.0,0.06917,0.2282,20.2,0.1162,22.3,0.01184
285,489.0,0.06431,0.005579,22.45,0.1038,18.4,0.002924
495,680.9,0.06599,0.17,21.84,0.1216,20.21,0.01183
391,234.3,0.08865,0.0,28.85,0.146,16.84,0.0
187,420.3,0.07097,0.1521,17.86,0.1323,17.19,0.01185


In [5]:
# https://machinelearningmastery.com/loocv-for-evaluating-machine-learning-algorithms/#:~:text=Given%20the%20improved%20estimate%20of,biased%20estimates%20of%20model%20performance.
# cv = LeaveOneOut()

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
cv=KFold(n_splits=10, shuffle=True, random_state=13)

originalclass = []
predictedclass = []

def classification_report_with_accuracy_score(y_true, y_pred):
  originalclass.extend(y_true)
  predictedclass.extend(y_pred)
  #print(classification_report(y_true, y_pred, target_names=target_names)) 
  return accuracy_score(y_true, y_pred)

def print_best_params(grid_search):
    print("")
    print("Best hyperparameters : ", grid_search.best_params_)
    print("")
    print("Best estimator : ", grid_search.best_estimator_)
    print("")

<a id='3'></a>
## 3) [Gaussian Naive Bayes](<https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>)

* Default hyperparameters

In [13]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_gnb = Pipeline([('scaler', StandardScaler()), ('gnb', GaussianNB())])
score = cross_val_score(clf_gnb, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.897     0.948     0.922       212
   Malignant      0.945     0.892     0.917       212

    accuracy                          0.920       424
   macro avg      0.921     0.920     0.920       424
weighted avg      0.921     0.920     0.920       424



* Hyperparameter tuning using Grid Search

In [14]:
param_grid = { 'gnb__var_smoothing': np.logspace(0,-10, num=100) }

grid_search = GridSearchCV(clf_gnb, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits

Best hyperparameters :  {'gnb__var_smoothing': 0.012045035402587823}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('gnb', GaussianNB(var_smoothing=0.012045035402587823))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gnb__var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
50,0.011221,0.024691,0.001795,0.000399,9e-06,{'gnb__var_smoothing': 8.902150854450392e-06},0.906522,0.905702,0.928847,0.906926,0.976177,0.952273,0.925134,0.830937,0.902778,0.951389,0.918668,0.03749,1
75,0.015259,0.033843,0.009725,0.023026,0.0,{'gnb__var_smoothing': 2.6560877829466893e-08},0.906522,0.905702,0.928847,0.906926,0.976177,0.952273,0.925134,0.830937,0.902778,0.951389,0.918668,0.03749,1
73,0.010771,0.022022,0.001995,0.000447,0.0,{'gnb__var_smoothing': 4.229242874389499e-08},0.906522,0.905702,0.928847,0.906926,0.976177,0.952273,0.925134,0.830937,0.902778,0.951389,0.918668,0.03749,1


* Tuned hyperparameters

In [8]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_gnb = Pipeline(steps=[('scaler', StandardScaler()),
                ('gnb', GaussianNB(var_smoothing=0.012045035402587823))])

score = cross_val_score(clf_gnb, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.897     0.948     0.922       212
   Malignant      0.945     0.892     0.917       212

    accuracy                          0.920       424
   macro avg      0.921     0.920     0.920       424
weighted avg      0.921     0.920     0.920       424



<a id='4'></a>
## 4) [Linear Discriminant Analysis](<https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html>)

* Default hyperparameters

In [15]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lda = Pipeline([('scaler', StandardScaler()), ('lda', LinearDiscriminantAnalysis())])

score = cross_val_score(clf_lda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.905     0.986     0.944       212
   Malignant      0.984     0.896     0.938       212

    accuracy                          0.941       424
   macro avg      0.945     0.941     0.941       424
weighted avg      0.945     0.941     0.941       424



* Hyperparameter tuning using Grid Search

In [16]:
param_grid = {
    'lda__solver' : ['svd','lsqr','eigen'],
    'lda__shrinkage':[None,'auto'],
    'lda__tol': [0.0001,0.001,0.01,0.1]
}

grid_search = GridSearchCV(clf_lda, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 24 candidates, totalling 240 fits

Best hyperparameters :  {'lda__shrinkage': None, 'lda__solver': 'svd', 'lda__tol': 0.0001}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('lda', LinearDiscriminantAnalysis())])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lda__shrinkage,param_lda__solver,param_lda__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004388,0.000489,0.002346,0.000838,,svd,0.0001,"{'lda__shrinkage': None, 'lda__solver': 'svd',...",0.883469,0.952222,0.928847,0.883469,0.928205,0.976068,0.948148,0.951389,0.97551,0.97551,0.940284,0.032898,1
7,0.003892,0.000583,0.001595,0.000797,,lsqr,0.1,"{'lda__shrinkage': None, 'lda__solver': 'lsqr'...",0.883469,0.952222,0.928847,0.883469,0.928205,0.976068,0.948148,0.951389,0.97551,0.97551,0.940284,0.032898,1
11,0.004638,0.001609,0.011769,0.028998,,eigen,0.1,"{'lda__shrinkage': None, 'lda__solver': 'eigen...",0.883469,0.952222,0.928847,0.883469,0.928205,0.976068,0.948148,0.951389,0.97551,0.97551,0.940284,0.032898,1


* Tuned hyperparameters

In [17]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lda = Pipeline([('scaler', StandardScaler()), ('lda', LinearDiscriminantAnalysis(shrinkage=None,solver='svd',tol=0.0001))])

score = cross_val_score(clf_lda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.905     0.986     0.944       212
   Malignant      0.984     0.896     0.938       212

    accuracy                          0.941       424
   macro avg      0.945     0.941     0.941       424
weighted avg      0.945     0.941     0.941       424



<a id='5'></a>
## 5) [Quadratic Discriminant Analysis](<https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html>)

* Default hyperparameters

In [18]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_qda = Pipeline([('scaler', StandardScaler()), ('qda', QuadraticDiscriminantAnalysis())])

score = cross_val_score(clf_qda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.914     0.953     0.933       212
   Malignant      0.951     0.910     0.930       212

    accuracy                          0.932       424
   macro avg      0.932     0.932     0.932       424
weighted avg      0.932     0.932     0.932       424



* Hyperparameter tuning using Grid Search

In [19]:
param_grid = {
    'qda__reg_param': np.linspace(0, 1, num=10),
    'qda__tol': [0.0001,0.001,0.01]
}

grid_search = GridSearchCV(clf_qda, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 30 candidates, totalling 300 fits

Best hyperparameters :  {'qda__reg_param': 0.1111111111111111, 'qda__tol': 0.0001}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('qda',
                 QuadraticDiscriminantAnalysis(reg_param=0.1111111111111111))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_qda__reg_param,param_qda__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
3,0.00389,0.001218,0.001994,7.15653e-07,0.111111,0.0001,"{'qda__reg_param': 0.1111111111111111, 'qda__t...",0.906522,0.952851,0.928847,0.930081,1.0,0.976068,0.949519,0.879241,0.927545,0.951389,0.940206,0.032367,1
4,0.003291,0.000639,0.001895,0.0002992393,0.111111,0.001,"{'qda__reg_param': 0.1111111111111111, 'qda__t...",0.906522,0.952851,0.928847,0.930081,1.0,0.976068,0.949519,0.879241,0.927545,0.951389,0.940206,0.032367,1
5,0.012119,0.02473,0.002444,0.0004704729,0.111111,0.01,"{'qda__reg_param': 0.1111111111111111, 'qda__t...",0.906522,0.952851,0.928847,0.930081,1.0,0.976068,0.949519,0.879241,0.927545,0.951389,0.940206,0.032367,1


* Tuned hyperparameters

In [20]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_qda = Pipeline([('scaler', StandardScaler()), ('qda', QuadraticDiscriminantAnalysis(reg_param=0.1111111111111111))])

score = cross_val_score(clf_qda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.916     0.972     0.943       212
   Malignant      0.970     0.910     0.939       212

    accuracy                          0.941       424
   macro avg      0.943     0.941     0.941       424
weighted avg      0.943     0.941     0.941       424



<a id='6'></a>
## 6) [Ridge Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier>)

* Default hyperparameters

In [21]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rc = Pipeline([('scaler', StandardScaler()), ('rg', RidgeClassifier())])

score = cross_val_score(clf_rc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.905     0.986     0.944       212
   Malignant      0.984     0.896     0.938       212

    accuracy                          0.941       424
   macro avg      0.945     0.941     0.941       424
weighted avg      0.945     0.941     0.941       424



* Hyperparameter tuning using Grid Search

In [22]:
param_grid = {
    'rg__alpha' : np.linspace(0, 1, num=10),
    'rg__fit_intercept' : [True,False],
    'rg__copy_X' : [True,False],
    'rg__max_iter' : [None],
    'rg__tol' : [0.001],
    'rg__class_weight' : [None,'balanced'],
    'rg__solver' : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
    'rg__positive' : [False]
}

grid_search = GridSearchCV(clf_rc, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 560 candidates, totalling 5600 fits

Best hyperparameters :  {'rg__alpha': 0.0, 'rg__class_weight': None, 'rg__copy_X': True, 'rg__fit_intercept': False, 'rg__max_iter': None, 'rg__positive': False, 'rg__solver': 'svd', 'rg__tol': 0.001}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('rg',
                 RidgeClassifier(alpha=0.0, fit_intercept=False,
                                 solver='svd'))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rg__alpha,param_rg__class_weight,param_rg__copy_X,param_rg__fit_intercept,param_rg__max_iter,param_rg__positive,param_rg__solver,param_rg__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
558,0.012517,0.024259,0.001895,0.000299,1.0,balanced,False,False,,False,saga,0.001,"{'rg__alpha': 1.0, 'rg__class_weight': 'balanc...",0.883469,0.952222,0.928847,0.906522,0.928205,0.976068,0.948148,0.951389,0.97551,0.97551,0.942589,0.029464,1
287,0.003889,0.000941,0.010224,0.025359,0.555556,,True,False,,False,svd,0.001,"{'rg__alpha': 0.5555555555555556, 'rg__class_w...",0.883469,0.952222,0.928847,0.906522,0.928205,0.976068,0.948148,0.951389,0.97551,0.97551,0.942589,0.029464,1
479,0.005737,0.001566,0.017104,0.030126,0.888889,balanced,True,True,,False,sparse_cg,0.001,"{'rg__alpha': 0.8888888888888888, 'rg__class_w...",0.883469,0.952222,0.928847,0.906522,0.928205,0.976068,0.948148,0.951389,0.97551,0.97551,0.942589,0.029464,1


* Tuned hyperparameters

In [23]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rc = Pipeline([('scaler', StandardScaler()), 
                    ('rg', RidgeClassifier(alpha=0.0,fit_intercept=False,solver='svd'))])

score = cross_val_score(clf_rc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.909     0.986     0.946       212
   Malignant      0.985     0.901     0.941       212

    accuracy                          0.943       424
   macro avg      0.947     0.943     0.943       424
weighted avg      0.947     0.943     0.943       424



<a id='7'></a>
## 7) [Decision Tree Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>)

* Default hyperparameters

In [24]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_tree = Pipeline([('scaler', StandardScaler()), ('tree', DecisionTreeClassifier(random_state=13))])

score = cross_val_score(clf_tree, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.927     0.901     0.914       212
   Malignant      0.904     0.929     0.916       212

    accuracy                          0.915       424
   macro avg      0.915     0.915     0.915       424
weighted avg      0.915     0.915     0.915       424



* Hyperparameter tuning using Grid Search

In [25]:
param_grid = {
    'tree__criterion' :['gini','entropy'],
    'tree__splitter' : ['best','random'],
    'tree__max_depth': [2,6,10,None],
    'tree__min_samples_split': list(range(2, 4)),
    'tree__min_samples_leaf': [3,5],
    'tree__min_weight_fraction_leaf' : [0.0],
    'tree__max_features': [None, 'sqrt', 'log2'],
    'tree__max_leaf_nodes' : [None,10,50],
    'tree__min_impurity_decrease' : [0.0],
    'tree__class_weight' : [None,'balanced'],
    'tree__ccp_alpha' : [0.0],
    'tree__random_state' : [13]
}

grid_search = GridSearchCV(clf_tree, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 1152 candidates, totalling 11520 fits

Best hyperparameters :  {'tree__ccp_alpha': 0.0, 'tree__class_weight': 'balanced', 'tree__criterion': 'entropy', 'tree__max_depth': None, 'tree__max_features': None, 'tree__max_leaf_nodes': 50, 'tree__min_impurity_decrease': 0.0, 'tree__min_samples_leaf': 3, 'tree__min_samples_split': 2, 'tree__min_weight_fraction_leaf': 0.0, 'tree__random_state': 13, 'tree__splitter': 'random'}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('tree',
                 DecisionTreeClassifier(class_weight='balanced',
                                        criterion='entropy', max_leaf_nodes=50,
                                        min_samples_leaf=3, random_state=13,
                                        splitter='random'))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tree__ccp_alpha,param_tree__class_weight,param_tree__criterion,param_tree__max_depth,param_tree__max_features,param_tree__max_leaf_nodes,param_tree__min_impurity_decrease,param_tree__min_samples_leaf,param_tree__min_samples_split,param_tree__min_weight_fraction_leaf,param_tree__random_state,param_tree__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1099,0.003458,0.000649,0.001543,0.000469,0.0,balanced,entropy,,,50,0.0,3,3,0.0,13,random,"{'tree__ccp_alpha': 0.0, 'tree__class_weight':...",0.928847,0.929624,0.904444,0.928847,0.928531,0.976068,1.0,0.928205,0.926531,0.926531,0.937763,0.026655,1
1097,0.003494,0.000499,0.001753,0.000405,0.0,balanced,entropy,,,50,0.0,3,2,0.0,13,random,"{'tree__ccp_alpha': 0.0, 'tree__class_weight':...",0.928847,0.929624,0.904444,0.928847,0.928531,0.976068,1.0,0.928205,0.926531,0.926531,0.937763,0.026655,1
521,0.005144,0.00629,0.002002,1.6e-05,0.0,,entropy,,,50,0.0,3,2,0.0,13,random,"{'tree__ccp_alpha': 0.0, 'tree__class_weight':...",0.976282,0.953261,0.904444,0.928847,0.928531,1.0,0.973668,0.879241,0.902778,0.927545,0.93746,0.035936,3


* Tuned hyperparameters

In [26]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_tree = Pipeline(steps=[('scaler', StandardScaler()),
                ('tree',DecisionTreeClassifier(class_weight='balanced',
                                        criterion='entropy', max_leaf_nodes=50,
                                        min_samples_leaf=3, random_state=13,
                                        splitter='random'))])

score = cross_val_score(clf_tree, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.931     0.948     0.939       212
   Malignant      0.947     0.929     0.938       212

    accuracy                          0.939       424
   macro avg      0.939     0.939     0.939       424
weighted avg      0.939     0.939     0.939       424



<a id='8'></a>
## 8) [Random Forest Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html>)

* Default hyperparameters

In [46]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rf = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier(random_state=13))])
                       
score = cross_val_score(clf_rf, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.962     0.965       212
   Malignant      0.962     0.967     0.965       212

    accuracy                          0.965       424
   macro avg      0.965     0.965     0.965       424
weighted avg      0.965     0.965     0.965       424



* Hyperparameter tuning using Grid Search

In [47]:
param_grid = {
    'rf__bootstrap': [True,False],
    'rf__max_depth': [5, 10 , None],
    'rf__n_estimators' : [10,50,100,200,500],
    'rf__max_features': [None, 'sqrt', 'log2'],
    'rf__max_leaf_nodes' : [None,5,10],
    'rf__min_samples_leaf': [1,3,5],
    'rf__min_samples_split': list(range(2, 6)),
    'rf__criterion' :['entropy','gini'],
    'rf__random_state' : [13]
}

grid_search = GridSearchCV(clf_rf, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 6480 candidates, totalling 64800 fits

Best hyperparameters :  {'rf__bootstrap': True, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__max_leaf_nodes': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 500, 'rf__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestClassifier(max_depth=10, max_features='sqrt',
                                        min_samples_split=5, n_estimators=500,
                                        random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__bootstrap,param_rf__criterion,param_rf__max_depth,param_rf__max_features,param_rf__max_leaf_nodes,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,param_rf__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
3079,1.515845,1.00359,0.107812,0.091079,True,gini,,log2,,1,5,500,13,"{'rf__bootstrap': True, 'rf__criterion': 'gini...",0.976282,0.953261,0.928847,0.976541,0.952381,0.952273,1.0,0.928205,0.975045,1.0,0.964284,0.024344,1
2899,1.655877,1.113808,0.107007,0.085761,True,gini,,sqrt,,1,5,500,13,"{'rf__bootstrap': True, 'rf__criterion': 'gini...",0.976282,0.953261,0.928847,0.976541,0.952381,0.952273,1.0,0.928205,0.975045,1.0,0.964284,0.024344,1
2539,1.984817,0.955412,0.124336,0.071838,True,gini,10.0,log2,,1,5,500,13,"{'rf__bootstrap': True, 'rf__criterion': 'gini...",0.976282,0.953261,0.928847,0.976541,0.952381,0.952273,1.0,0.928205,0.975045,1.0,0.964284,0.024344,1


* Tuned hyperparameters

In [52]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rf = Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',RandomForestClassifier(bootstrap=True,criterion='gini',max_depth=None,max_features='log2',max_leaf_nodes=None,
                                        min_samples_leaf=1,min_samples_split=5,n_estimators=500,
                                        random_state=13))])
                       
score = cross_val_score(clf_rf, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.962     0.965       212
   Malignant      0.962     0.967     0.965       212

    accuracy                          0.965       424
   macro avg      0.965     0.965     0.965       424
weighted avg      0.965     0.965     0.965       424



<a id='9'></a>
## 9) [ADA Boost Classifier (Adaptive Boosting)](<https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#:~:text=An%20AdaBoost%20%5B1%5D%20classifier%20is,focus%20more%20on%20difficult%20cases.>)

* Default hyperparameters

In [27]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_adaboost = Pipeline([('scaler', StandardScaler()), ('adab', AdaBoostClassifier(random_state=13))])

score = cross_val_score(clf_adaboost, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.949     0.958     0.953       212
   Malignant      0.957     0.948     0.953       212

    accuracy                          0.953       424
   macro avg      0.953     0.953     0.953       424
weighted avg      0.953     0.953     0.953       424



* Hyperparameter tuning using Grid Search

In [28]:
param_grid = {
    'adab__base_estimator' : [DecisionTreeClassifier(class_weight='balanced',criterion='entropy', max_leaf_nodes=50,min_samples_leaf=3, random_state=13,splitter='random')],
    'adab__n_estimators' : [10,50,100,500],
    'adab__learning_rate' : np.power(10, np.arange(-3, 1, dtype=float)),
    'adab__algorithm' : ['SAMME', 'SAMME.R'],
    'adab__random_state' : [13],
}

grid_search = GridSearchCV(clf_adaboost, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits

Best hyperparameters :  {'adab__algorithm': 'SAMME.R', 'adab__base_estimator': DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_leaf_nodes=50, min_samples_leaf=3, random_state=13,
                       splitter='random'), 'adab__learning_rate': 0.001, 'adab__n_estimators': 500, 'adab__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('adab',
                 AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                          criterion='entropy',
                                                                          max_leaf_nodes=50,
                                                                          min_samples_leaf=3,
                                                                          random_state=13,
                     

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adab__algorithm,param_adab__base_estimator,param_adab__learning_rate,param_adab__n_estimators,param_adab__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
19,1.413741,0.224014,0.105101,0.034889,SAMME.R,DecisionTreeClassifier(class_weight='balanced'...,0.001,500,13,"{'adab__algorithm': 'SAMME.R', 'adab__base_est...",0.976282,0.952851,0.952851,0.976541,0.976177,0.952273,1.0,0.951945,1.0,1.0,0.973892,0.019742,1
15,1.119549,0.219133,0.048162,0.009173,SAMME,DecisionTreeClassifier(class_weight='balanced'...,1.0,500,13,"{'adab__algorithm': 'SAMME', 'adab__base_estim...",0.976282,0.976541,0.952851,0.953261,0.976177,0.952273,0.974437,0.951945,1.0,1.0,0.971377,0.017708,2
21,0.193253,0.11267,0.01025,0.001991,SAMME.R,DecisionTreeClassifier(class_weight='balanced'...,0.01,50,13,"{'adab__algorithm': 'SAMME.R', 'adab__base_est...",0.976282,0.976541,0.928847,0.976541,0.976177,0.952273,1.0,0.928205,1.0,0.97551,0.969038,0.023956,3


* Tuned hyperparameters

In [29]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_adaboost = Pipeline(steps=[('scaler', StandardScaler()),
                ('adab',AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                                          criterion='entropy',
                                                                          max_leaf_nodes=50,
                                                                          min_samples_leaf=3,
                                                                          random_state=13,
                                                                          splitter='random'),
                                    learning_rate=0.001, n_estimators=500,
                                    random_state=13))])

score = cross_val_score(clf_adaboost, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.981     0.974       212
   Malignant      0.981     0.967     0.974       212

    accuracy                          0.974       424
   macro avg      0.974     0.974     0.974       424
weighted avg      0.974     0.974     0.974       424



<a id='10'></a>
## 10) [C-Support Vector Classification](<https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>)

* Default hyperparameters

In [30]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_svc = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])

score = cross_val_score(clf_svc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.954     0.976     0.965       212
   Malignant      0.976     0.953     0.964       212

    accuracy                          0.965       424
   macro avg      0.965     0.965     0.965       424
weighted avg      0.965     0.965     0.965       424



* Hyperparameter tuning using Grid Search

In [31]:
param_grid = [
    {
        'svc__kernel': ['rbf'], 
        'svc__gamma': [1e-2, 1e-3, 1e-4,'auto','scale'], 
        'svc__C': [1, 10, 100, 1000],
        'svc__decision_function_shape': ['ovo', 'ovr'],
        'svc__random_state' : [13]
    },
    {
        'svc__kernel': ['linear'], 
        'svc__C': [1, 10, 100, 1000],
        'svc__decision_function_shape': ['ovo', 'ovr'],
        'svc__random_state' : [13]
    },
]

grid_search = GridSearchCV(clf_svc, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 48 candidates, totalling 480 fits

Best hyperparameters :  {'svc__C': 10, 'svc__decision_function_shape': 'ovo', 'svc__gamma': 0.01, 'svc__kernel': 'rbf', 'svc__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('svc',
                 SVC(C=10, decision_function_shape='ovo', gamma=0.01,
                     random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__decision_function_shape,param_svc__gamma,param_svc__kernel,param_svc__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
15,0.006582,0.001681,0.004388,0.003344,10,ovr,0.01,rbf,13,"{'svc__C': 10, 'svc__decision_function_shape':...",0.976282,0.952851,0.952851,0.976541,0.976177,0.976068,0.974437,0.975848,1.0,0.97551,0.973657,0.012633,1
10,0.010273,0.010667,0.00349,0.003224,10,ovo,0.01,rbf,13,"{'svc__C': 10, 'svc__decision_function_shape':...",0.976282,0.952851,0.952851,0.976541,0.976177,0.976068,0.974437,0.975848,1.0,0.97551,0.973657,0.012633,1
26,0.004787,0.001075,0.002345,0.00045,100,ovr,0.001,rbf,13,"{'svc__C': 100, 'svc__decision_function_shape'...",0.929624,0.952851,0.952851,0.976541,0.976177,0.976068,0.974437,0.975848,1.0,0.97551,0.968991,0.018194,3


* Tuned hyperparameters

In [32]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_svc = Pipeline(steps=[('scaler', StandardScaler()),
                ('svc',SVC(C=10, decision_function_shape='ovo', gamma=0.01,
                     random_state=13))])

score = cross_val_score(clf_svc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.963     0.986     0.974       212
   Malignant      0.986     0.962     0.974       212

    accuracy                          0.974       424
   macro avg      0.974     0.974     0.974       424
weighted avg      0.974     0.974     0.974       424



<a id='11'></a>
## 11) [Stochastic Gradient Descent Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html>)

* Default hyperparameters

In [33]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_sgd = Pipeline([('scaler', StandardScaler()), ('sgd', SGDClassifier(random_state=13))])

score = cross_val_score(clf_sgd, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.966     0.948     0.957       212
   Malignant      0.949     0.967     0.958       212

    accuracy                          0.958       424
   macro avg      0.958     0.958     0.958       424
weighted avg      0.958     0.958     0.958       424



* Hyperparameter tuning using Grid Search

In [34]:
param_grid = {
    'sgd__average': [True, False],
    'sgd__l1_ratio': np.linspace(0, 1, num=10),
    'sgd__alpha': np.power(10, np.arange(-2, 1, dtype=float)),
    'sgd__random_state' : [13]
}

grid_search = GridSearchCV(clf_sgd, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 60 candidates, totalling 600 fits

Best hyperparameters :  {'sgd__alpha': 0.01, 'sgd__average': False, 'sgd__l1_ratio': 0.0, 'sgd__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('sgd',
                 SGDClassifier(alpha=0.01, l1_ratio=0.0, random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgd__alpha,param_sgd__average,param_sgd__l1_ratio,param_sgd__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
14,0.005685,0.001787,0.005087,0.006455,0.01,False,0.444444,13,"{'sgd__alpha': 0.01, 'sgd__average': False, 's...",0.976282,0.952851,0.952851,0.976541,0.976177,0.976068,0.974437,0.951945,1.0,0.97551,0.971266,0.014161,1
11,0.004288,0.000639,0.002493,0.001201,0.01,False,0.111111,13,"{'sgd__alpha': 0.01, 'sgd__average': False, 's...",0.976282,0.952851,0.952851,0.976541,0.976177,0.976068,0.974437,0.951945,1.0,0.97551,0.971266,0.014161,1
19,0.004289,0.001002,0.001995,0.000631,0.01,False,1.0,13,"{'sgd__alpha': 0.01, 'sgd__average': False, 's...",0.976282,0.952851,0.952851,0.976541,0.976177,0.976068,0.974437,0.951945,1.0,0.97551,0.971266,0.014161,1


* Tuned hyperparameters

In [35]:
originalclass = []
predictedclass = []

# Cross validate
clf_sgd = Pipeline(steps=[('scaler', StandardScaler()),
                ('sgd',SGDClassifier(alpha=0.01, l1_ratio=0.0, random_state=13))])

score = cross_val_score(clf_sgd, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.963     0.981     0.972       212
   Malignant      0.981     0.962     0.971       212

    accuracy                          0.972       424
   macro avg      0.972     0.972     0.972       424
weighted avg      0.972     0.972     0.972       424



<a id='12'></a>
## 12) [eXtreme Gradient Boosting](<https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters>)

* Default hyperparameters

In [48]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_xgb = Pipeline([('scaler', StandardScaler()), ('xgb', XGBClassifier(random_state=13))])

score = cross_val_score(clf_xgb, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.952     0.939     0.945       212
   Malignant      0.940     0.953     0.946       212

    accuracy                          0.946       424
   macro avg      0.946     0.946     0.946       424
weighted avg      0.946     0.946     0.946       424



* Hyperparameter tuning using Grid Search

In [53]:
# https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
# https://www.cs.cornell.edu/courses/cs4780/2018sp/lectures/lecturenote19.html
# https://medium.com/data-design/xgboost-hi-im-gamma-what-can-i-do-for-you-and-the-tuning-of-regularization-a42ea17e6ab6

param_grid = {
        'xgb__booster' : ['gbtree'],
        'xgb__validate_parameters' : [True],
        'xgb__learning_rate' : [0.05,0.1,0.3,0.5,1],
        'xgb__gamma' : [0,0.01,0.1,0.5,1],
        'xgb__max_depth' : [2,6,10],
        'xgb__min_child_weight' : [1,3,5],
        'xgb__max_delta_step' : [0,2,4],
        'xgb__subsample' : [0.5],
        'xgb__colsample_bylevel' : [1],
        'xgb__colsample_bynode' : [1],
        'xgb__colsample_bytree' : [1],
        'xgb__reg_lambda' : [0,1],
        'xgb__reg_alpha' : [0],
        'xgb__tree_method' : ['exact'],
        'xgb__scale_pos_weight' : [1],
        'xgb__objective' : ['binary:logistic'], # 'multi:softmax' -> same scores as 'binary:logistic'
        #'num_class' : [2],
        'xgb__n_estimators' : [50,100,200,500],
        'xgb__random_state' : [13]
    }

grid_search = GridSearchCV(clf_xgb, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y_le)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 5400 candidates, totalling 54000 fits

Best hyperparameters :  {'xgb__booster': 'gbtree', 'xgb__colsample_bylevel': 1, 'xgb__colsample_bynode': 1, 'xgb__colsample_bytree': 1, 'xgb__gamma': 0.01, 'xgb__learning_rate': 0.05, 'xgb__max_delta_step': 0, 'xgb__max_depth': 10, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 100, 'xgb__objective': 'binary:logistic', 'xgb__random_state': 13, 'xgb__reg_alpha': 0, 'xgb__reg_lambda': 0, 'xgb__scale_pos_weight': 1, 'xgb__subsample': 0.5, 'xgb__tree_method': 'exact', 'xgb__validate_parameters': True}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__booster,param_xgb__colsample_bylevel,param_xgb__colsample_bynode,param_xgb__colsample_bytree,param_xgb__gamma,param_xgb__learning_rate,param_xgb__max_delta_step,param_xgb__max_depth,param_xgb__min_child_weight,param_xgb__n_estimators,param_xgb__objective,param_xgb__random_state,param_xgb__reg_alpha,param_xgb__reg_lambda,param_xgb__scale_pos_weight,param_xgb__subsample,param_xgb__tree_method,param_xgb__validate_parameters,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1178,0.091209,0.006738,0.003191,0.000399,gbtree,1,1,1,0.01,0.05,2,6,1,100,binary:logistic,13,0,0,1,0.5,exact,True,"{'xgb__booster': 'gbtree', 'xgb__colsample_byl...",0.976282,0.906522,0.952851,0.952851,0.952381,0.952273,1.0,0.951945,1.0,1.0,0.96451,0.028296,1
1202,0.119441,0.021978,0.004288,0.001948,gbtree,1,1,1,0.01,0.05,2,10,1,100,binary:logistic,13,0,0,1,0.5,exact,True,"{'xgb__booster': 'gbtree', 'xgb__colsample_byl...",0.976282,0.906522,0.952851,0.952851,0.952381,0.952273,1.0,0.951945,1.0,1.0,0.96451,0.028296,1
1344,0.048571,0.0037,0.002943,0.00035,gbtree,1,1,1,0.01,0.1,0,10,1,50,binary:logistic,13,0,0,1,0.5,exact,True,"{'xgb__booster': 'gbtree', 'xgb__colsample_byl...",0.976282,0.906522,0.952851,0.952851,0.952381,0.952273,1.0,0.951945,1.0,1.0,0.96451,0.028296,1


* Tuned hyperparameters

In [58]:
originalclass = []
predictedclass = []

# Cross validate
clf_xgb = Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',XGBClassifier(booster='gbtree',gamma=0.01,learning_rate=0.05,max_delta_step=2,max_depth=6,min_child_weight=1,
                                    n_estimators=100,objective='binary:logistic',reg_alpha=0,reg_lambda=0,scale_pos_weight=1,subsample=0.5,
                                    tree_method='exact',validate_parameters=True,random_state=13))])

score = cross_val_score(clf_xgb, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.962     0.965       212
   Malignant      0.962     0.967     0.965       212

    accuracy                          0.965       424
   macro avg      0.965     0.965     0.965       424
weighted avg      0.965     0.965     0.965       424



<a id='13'></a>
## 13) [Light Gradient Boosting Machine](<https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>)

* Default hyperparameters

In [43]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lgbm = Pipeline([('scaler', StandardScaler()), ('lgbm', lgbm.LGBMClassifier(random_state=13))])

score = cross_val_score(clf_lgbm, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.953     0.962     0.958       212
   Malignant      0.962     0.953     0.957       212

    accuracy                          0.958       424
   macro avg      0.958     0.958     0.958       424
weighted avg      0.958     0.958     0.958       424



* Hyperparameter tuning using Grid Search

In [44]:
# https://neptune.ai/blog/lightgbm-parameters-guide
# https://www.youtube.com/watch?v=5CWwwtEM2TA&ab_channel=PyData & https://github.com/MSusik/newgradientboosting/blob/master/pydata.pdf

param_grid = {
        'lgbm__boosting_type' : ['gbdt','dart'],
        'lgbm__num_leaves' : [10,20,30,40,50],
        'lgbm__max_depth' : [3,6,9,-1],
        'lgbm__learning_rate' : [0.05,0.1,0.3,0.5,1],
        'lgbm__n_estimators' : [50,100,200,500],
        'lgbm__objective' : ['binary'],
        'lgbm__min_child_samples' : [10,20,30],
        'lgbm__subsample' : [0.5],
        'lgbm__reg_lambda' : [0,1],
        'lgbm__reg_alpha' : [0],
        'lgbm__subsample' : [0.5],
        'lgbm__colsample_bytree' : [1],
        'lgbm__scale_pos_weight' : [1],
        'lgbm__random_state' : [13]
    }

grid_search = GridSearchCV(clf_lgbm, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y_le)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 4800 candidates, totalling 48000 fits

Best hyperparameters :  {'lgbm__boosting_type': 'gbdt', 'lgbm__colsample_bytree': 1, 'lgbm__learning_rate': 0.5, 'lgbm__max_depth': 6, 'lgbm__min_child_samples': 20, 'lgbm__n_estimators': 500, 'lgbm__num_leaves': 10, 'lgbm__objective': 'binary', 'lgbm__random_state': 13, 'lgbm__reg_alpha': 0, 'lgbm__reg_lambda': 0, 'lgbm__scale_pos_weight': 1, 'lgbm__subsample': 0.5}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('lgbm',
                 LGBMClassifier(colsample_bytree=1, learning_rate=0.5,
                                max_depth=6, n_estimators=500, num_leaves=10,
                                objective='binary', random_state=13,
                                reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                                subsample=0.5))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbm__boosting_type,param_lgbm__colsample_bytree,param_lgbm__learning_rate,param_lgbm__max_depth,param_lgbm__min_child_samples,param_lgbm__n_estimators,param_lgbm__num_leaves,param_lgbm__objective,param_lgbm__random_state,param_lgbm__reg_alpha,param_lgbm__reg_lambda,param_lgbm__scale_pos_weight,param_lgbm__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1630,0.074002,0.032326,0.00758,0.0122,gbdt,1,0.5,6,20,500,10,binary,13,0,0,1,0.5,"{'lgbm__boosting_type': 'gbdt', 'lgbm__colsamp...",0.976282,0.952851,0.952851,0.952851,0.976177,0.952273,1.0,0.951945,1.0,1.0,0.971523,0.020681,1
1620,0.05246,0.022739,0.004687,0.004928,gbdt,1,0.5,6,20,200,10,binary,13,0,0,1,0.5,"{'lgbm__boosting_type': 'gbdt', 'lgbm__colsamp...",0.976282,0.929624,0.952851,0.952851,0.976177,0.952273,1.0,0.951945,1.0,1.0,0.9692,0.023727,2
1480,0.015758,0.0106,0.001995,0.000631,gbdt,1,0.5,3,20,50,10,binary,13,0,0,1,0.5,"{'lgbm__boosting_type': 'gbdt', 'lgbm__colsamp...",0.976282,0.952851,0.952851,0.928847,0.976177,0.952273,1.0,0.975848,1.0,0.97551,0.969064,0.021282,3


* Tuned hyperparameters

In [45]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lgbm = Pipeline(steps=[('scaler', StandardScaler()),
                ('lgbm',lgbm.LGBMClassifier(colsample_bytree=1, learning_rate=0.5,
                                max_depth=6, n_estimators=500, num_leaves=10,
                                objective='binary', random_state=13,
                                reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                                subsample=0.5))])

score = cross_val_score(clf_lgbm, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.976     0.972       212
   Malignant      0.976     0.967     0.972       212

    accuracy                          0.972       424
   macro avg      0.972     0.972     0.972       424
weighted avg      0.972     0.972     0.972       424



<a id='14'></a>
## 14) [K-Nearest Neighbors Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html>)

* Default hyperparameters

In [36]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_knn = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])

score = cross_val_score(clf_knn, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.928     0.967     0.947       212
   Malignant      0.966     0.925     0.945       212

    accuracy                          0.946       424
   macro avg      0.947     0.946     0.946       424
weighted avg      0.947     0.946     0.946       424



* Hyperparameter tuning using Grid Search

In [37]:
param_grid = {
    'knn__n_neighbors': list(range(2,10)),
    'knn__weights': ['uniform','distance'],
    'knn__algorithm' : ['ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10,20,30,40,50],
    'knn__p': [1,2],
    'knn__metric': ['minkowski','manhattan','chebyshev']
}

grid_search = GridSearchCV(clf_knn, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 1440 candidates, totalling 14400 fits

Best hyperparameters :  {'knn__algorithm': 'ball_tree', 'knn__leaf_size': 10, 'knn__metric': 'minkowski', 'knn__n_neighbors': 8, 'knn__p': 1, 'knn__weights': 'distance'}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(algorithm='ball_tree', leaf_size=10,
                                      n_neighbors=8, p=1,
                                      weights='distance'))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__algorithm,param_knn__leaf_size,param_knn__metric,param_knn__n_neighbors,param_knn__p,param_knn__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
153,0.006249,0.007721,0.002601,0.000663,ball_tree,20,manhattan,8,1,distance,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",0.952851,0.976282,0.952851,0.953261,0.976177,0.976068,0.949519,0.927545,0.950588,0.951389,0.956653,0.014651,1
633,0.00355,0.00106,0.003101,0.001301,kd_tree,20,manhattan,8,1,distance,"{'knn__algorithm': 'kd_tree', 'knn__leaf_size'...",0.952851,0.976282,0.952851,0.953261,0.976177,0.976068,0.949519,0.927545,0.950588,0.951389,0.956653,0.014651,1
217,0.003899,0.002071,0.003101,0.000951,ball_tree,30,minkowski,8,1,distance,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",0.952851,0.976282,0.952851,0.953261,0.976177,0.976068,0.949519,0.927545,0.950588,0.951389,0.956653,0.014651,1


* Tuned hyperparameters

In [38]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_knn = Pipeline(steps=[('scaler', StandardScaler()),
                ('knn',KNeighborsClassifier(algorithm='ball_tree', leaf_size=10,
                                      n_neighbors=8, p=1,
                                      weights='distance'))])

score = cross_val_score(clf_knn, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.941     0.976     0.958       212
   Malignant      0.975     0.939     0.957       212

    accuracy                          0.958       424
   macro avg      0.958     0.958     0.958       424
weighted avg      0.958     0.958     0.958       424



<a id='15'></a>
## 15) [Multi-layer Perceptron Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html>)

* Default hyperparameters

In [39]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_mlp =  Pipeline([('scaler', StandardScaler()), ('mlp', MLPClassifier(random_state=13))])

score = cross_val_score(clf_mlp, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.976     0.972       212
   Malignant      0.976     0.967     0.972       212

    accuracy                          0.972       424
   macro avg      0.972     0.972     0.972       424
weighted avg      0.972     0.972     0.972       424



* Hyperparameter tuning using Grid Search

In [40]:
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
param_grid = {
    'mlp__hidden_layer_sizes' : [(14,28,)],
    'mlp__activation' : ['tanh','relu'],
    'mlp__solver' : ['sgd','adam'],
    'mlp__alpha' : [0.01,0,2],
    'mlp__batch_size' : [40,80,'auto'],
    'mlp__learning_rate' : ['invscaling','adaptive'],
    'mlp__learning_rate_init' : np.power(10, np.arange(-3, 0, dtype=float)),
    'mlp__power_t' : [0.5],
    'mlp__max_iter' : [50,100,200,500],
    'mlp__shuffle' : [True],
    'mlp__random_state' : [13]
}

grid_search = GridSearchCV(clf_mlp, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='rank_test_score').head(3)

Fitting 10 folds for each of 864 candidates, totalling 8640 fits

Best hyperparameters :  {'mlp__activation': 'relu', 'mlp__alpha': 2, 'mlp__batch_size': 'auto', 'mlp__hidden_layer_sizes': (14, 28), 'mlp__learning_rate': 'invscaling', 'mlp__learning_rate_init': 0.01, 'mlp__max_iter': 200, 'mlp__power_t': 0.5, 'mlp__random_state': 13, 'mlp__shuffle': True, 'mlp__solver': 'adam'}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()),
                ('mlp',
                 MLPClassifier(alpha=2, hidden_layer_sizes=(14, 28),
                               learning_rate='invscaling',
                               learning_rate_init=0.01, random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__batch_size,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__learning_rate_init,param_mlp__max_iter,param_mlp__power_t,param_mlp__random_state,param_mlp__shuffle,param_mlp__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
831,0.267397,0.042346,0.002094,0.000537,relu,2,auto,"(14, 28)",invscaling,0.01,500,0.5,13,True,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 2, '...",0.976282,0.952851,0.952851,0.976541,1.0,0.952273,0.974437,0.951945,1.0,1.0,0.973718,0.019728,1
829,0.271388,0.05685,0.001993,0.000445,relu,2,auto,"(14, 28)",invscaling,0.01,200,0.5,13,True,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 2, '...",0.976282,0.952851,0.952851,0.976541,1.0,0.952273,0.974437,0.951945,1.0,1.0,0.973718,0.019728,1
855,0.221814,0.037168,0.001995,0.000446,relu,2,auto,"(14, 28)",adaptive,0.01,500,0.5,13,True,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 2, '...",0.976282,0.952851,0.952851,0.976541,1.0,0.952273,0.974437,0.951945,1.0,1.0,0.973718,0.019728,1


* Tuned hyperparameters

In [41]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_mlp =  Pipeline(steps=[('scaler', StandardScaler()),
                ('mlp',MLPClassifier(alpha=2, hidden_layer_sizes=(14, 28),
                               learning_rate='invscaling',
                               learning_rate_init=0.01, random_state=13))])


score = cross_val_score(clf_mlp, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.972     0.976     0.974       212
   Malignant      0.976     0.972     0.974       212

    accuracy                          0.974       424
   macro avg      0.974     0.974     0.974       424
weighted avg      0.974     0.974     0.974       424



* Tried a larger range of hyperparameters for testing at first, but was too time consuming. The worst attempts were then found with the following code and the hyperparameters corresponding to those results were removed.

In [42]:
# print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=True).head(5) # worst 5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__batch_size,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__learning_rate_init,param_mlp__max_iter,param_mlp__power_t,param_mlp__random_state,param_mlp__shuffle,param_mlp__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
384,0.075797,0.019655,0.002194,0.000746,tanh,2.0,auto,"(14, 28)",invscaling,0.001,50,0.5,13,True,sgd,"{'mlp__activation': 'tanh', 'mlp__alpha': 2, '...",0.221719,0.334524,0.349667,0.319062,0.25812,0.276292,0.412051,0.279176,0.368056,0.25812,0.307679,0.055903,853
96,0.064181,0.015236,0.001895,0.000299,tanh,0.01,auto,"(14, 28)",invscaling,0.001,50,0.5,13,True,sgd,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",0.221719,0.334524,0.349667,0.319062,0.25812,0.276292,0.412051,0.279176,0.368056,0.25812,0.307679,0.055903,853
98,0.113352,0.022165,0.001794,0.000398,tanh,0.01,auto,"(14, 28)",invscaling,0.001,100,0.5,13,True,sgd,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",0.221719,0.334524,0.349667,0.319062,0.25812,0.276292,0.412051,0.279176,0.368056,0.25812,0.307679,0.055903,853
100,0.161526,0.035263,0.002743,0.000927,tanh,0.01,auto,"(14, 28)",invscaling,0.001,200,0.5,13,True,sgd,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",0.221719,0.334524,0.349667,0.319062,0.25812,0.276292,0.412051,0.279176,0.368056,0.25812,0.307679,0.055903,853
102,0.198387,0.037169,0.002893,0.000941,tanh,0.01,auto,"(14, 28)",invscaling,0.001,500,0.5,13,True,sgd,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",0.221719,0.334524,0.349667,0.319062,0.25812,0.276292,0.412051,0.279176,0.368056,0.25812,0.307679,0.055903,853


<a id='16'></a>
## 16) Summary

* Below are the tables of the specific feature selection method.
* The performance of the algorithms is in descending order.
* All the results are the average values of a 10-fold cross validation.
* The columns contain the accuracy and the average values of precision, recall and f1 score.
* It is observed that the number of samples of Βenign and Μalignant cancer are equal (212 respectively), so the weighted average and the macro average are equal.

<table>
    <tr>
        <th colspan="5"> MRMR : Default algorithms</th>
    </tr>
    <tr>
        <th></th>
        <th>precision </th>
        <th>recall</th>
        <th>f1 score</th>
        <th>accuracy</th>  
    </tr>
    <tr>
        <th>MLP</th>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
    </tr>
    <tr>
        <th>SVC</th>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
    </tr>
    <tr>
        <th>Random Forest</th>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
    </tr>
    <tr>
        <th>SGD</th>
        <td>0.958</td>
        <td>0.958</td>
        <td>0.958</td>
        <td>0.958</td>
    </tr>
    <tr>
        <th>LGBM</th>
        <td>0.958</td>
        <td>0.958</td>
        <td>0.958</td>
        <td>0.958</td>
    </tr>
    <tr>
        <th>AdaBoost</th>
        <td>0.953</td>
        <td>0.953</td>
        <td>0.953</td>
        <td>0.953</td>
    </tr>
    <tr>
        <th>KNN</th>
        <td>0.947</td>
        <td>0.946</td>
        <td>0.946</td>
        <td>0.946</td>
    </tr>
    <tr>
        <th>XGBoost</th>
        <td>0.946</td>
        <td>0.946</td>
        <td>0.946</td>
        <td>0.946</td>
    </tr>
    <tr>
        <th>Ridge</th>
        <td>0.945</td>
        <td>0.941</td>
        <td>0.941</td>
        <td>0.941</td>
    </tr>
    <tr>
        <th>LDA</th>
        <td>0.945</td>
        <td>0.941</td>
        <td>0.941</td>
        <td>0.941</td>
    </tr>
    <tr>
        <th>QDA</th>
        <td>0.932</td>
        <td>0.932</td>
        <td>0.932</td>
        <td>0.932</td>
    </tr>
    <tr>
        <th>GNB</th>
        <td>0.921</td>
        <td>0.920</td>
        <td>0.920</td>
        <td>0.920</td>
    </tr>
    <tr>
        <th>Decision Tree</th>
        <td>0.915</td>
        <td>0.915</td>
        <td>0.915</td>
        <td>0.915</td>
    </tr>

</table>

<table>
    <tr>
        <th colspan="5"> MRMR : Tuned algorithms</th>
    </tr>
    <tr>
        <th></th>
        <th>precision </th>
        <th>recall</th>
        <th>f1 score</th>
        <th>accuracy</th>  
    </tr>
    <tr>
        <th>MLP</th>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
    </tr>
    <tr>
        <th>SVC</th>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
    </tr>
    <tr>
        <th>AdaBoost</th>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
    </tr>
    <tr>
        <th>LGBM</th>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
    </tr>
    <tr>
        <th>SGD</th>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
    </tr>
    <tr>
        <th>XGBoost</th>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
    </tr>
    <tr>
        <th>Random Forest</th>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
    </tr>
    <tr>
        <th>KNN</th>
        <td>0.958</td>
        <td>0.958</td>
        <td>0.958</td>
        <td>0.958</td>
    </tr>
    <tr>
        <th>Ridge</th>
        <td>0.947</td>
        <td>0.943</td>
        <td>0.943</td>
        <td>0.943</td>
    </tr>
    <tr>
        <th>LDA</th>
        <td>0.945</td>
        <td>0.941</td>
        <td>0.941</td>
        <td>0.941</td>
    </tr>
    <tr>
        <th>QDA</th>
        <td>0.943</td>
        <td>0.941</td>
        <td>0.941</td>
        <td>0.941</td>
    </tr>
    <tr>
        <th>Decision Tree</th>
        <td>0.939</td>
        <td>0.939</td>
        <td>0.939</td>
        <td>0.939</td>
    </tr>
    <tr>
        <th>GNB</th>
        <td>0.921</td>
        <td>0.920</td>
        <td>0.920</td>
        <td>0.920</td>
    </tr>

</table>