<!-- # Διπλωματική Εργασία
## Ταξινόμηση του καρκίνου του μαστού με μεθόδους μηχανικής μάθησης
### Εξαγωγή χαρακτηριστικών με PCA

> Λάζαρος Πανιτσίδης<br />
> Τμήμα Μηχανικών Παραγωγής και Διοίκησης <br />
> Διεθνές Πανεπιστήμιο της Ελλάδος <br />
> lazarospanitsidis@outlook.com -->

# Diploma thesis
## Breast cancer classification using machine learning methods
### Feature extraction with PCA

> Lazaros Panitsidis<br />
> Department of Production and Management Engineering <br />
> International Hellenic University <br />
> lazarospanitsidis@outlook.com

## Contents
1. [Useful Python Libraries](#1)
1. [Data Processing](#2)
1. [Gaussian Naive Bayes](#3)
1. [Linear Discriminant Analysis](#4)
1. [Quadratic Discriminant Analysis](#5)
1. [Ridge Classifier](#6)
1. [Decision Tree Classifier](#7)
1. [Random Forest Classifier](#8)
1. [ADA Boost Classifier (Adaptive Boosting)](#9)
1. [C-Support Vector Classification](#10)
1. [Stochastic Gradient Descent Classifier](#11)
1. [eXtreme Gradient Boosting](#12)
1. [Light Gradient Boosting Machine](#13)
1. [K-Nearest Neighbors Classifier](#14)
1. [Multi-layer Perceptron Classifier](#15)
1. [Summary](#16)

<a id='1'></a>
## 1) Useful Python Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import scipy.stats as stats
import matplotlib.pyplot as plt
import time
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
#import warnings library
import warnings
# ignore all warnings
warnings.filterwarnings('ignore')
# Any results you write to the current directory are saved as output.

# some of them are not used in this file
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE, RFECV , mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV , LeaveOneOut,KFold,RandomizedSearchCV
from skopt import BayesSearchCV # https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV , https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score , make_scorer , classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
from sklearn.preprocessing import StandardScaler , LabelEncoder
from xgboost import XGBClassifier , plot_importance
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier , RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm
from sklearn.neural_network import MLPClassifier
import pygad

<a id='2'></a>
## 2) Data Processing

In [2]:
dataWISC = pd.read_csv('dataWisc.csv')
dataWISC.drop(["id", "Unnamed: 32"], axis = 1, inplace = True)

# Undersampling function
def make_undersample(_df, column):
  dfs_r = {}
  dfs_c = {}
  smaller = 1e1000
  ignore = ""
  for c in _df[column].unique():
    dfs_c[c] = _df[_df[column] == c]
    if dfs_c[c].shape[0] < smaller:
      smaller = dfs_c[c].shape[0]
      ignore = c

  for c in dfs_c:
    if c == ignore:
      continue
    dfs_r[c] = resample(dfs_c[c], 
                        replace=False, # sample without replacement
                        n_samples=smaller,
                        random_state=0)
  return pd.concat([dfs_r[c] for c in dfs_r] + [dfs_c[ignore]])

dataWISC = make_undersample(dataWISC,'diagnosis')

#Description of the dataset

#how many cases are included in the dataset
length = len(dataWISC)
#how many features are in the dataset
features = dataWISC.shape[1]-1 # - diagnosis

# Number of malignant cases
malignant = len(dataWISC[dataWISC['diagnosis']=='M'])

#Number of benign cases
benign = len(dataWISC[dataWISC['diagnosis']=='B'])

#Rate of malignant tumors over all cases
rate = (float(malignant)/(length))*100

print ("There are "+ str(len(dataWISC))+" cases in this dataset")
print ("There are {}".format(features)+" features in this dataset")
print ("There are {}".format(malignant)+" cases diagnosed as malignant tumor")
print ("There are {}".format(benign)+" cases diagnosed as benign tumor")
print ("The percentage of malignant cases is: {:.2f}%".format(rate))

There are 424 cases in this dataset
There are 30 features in this dataset
There are 212 cases diagnosed as malignant tumor
There are 212 cases diagnosed as benign tumor
The percentage of malignant cases is: 50.00%


In [3]:
y = dataWISC.diagnosis                          # M or B 
x = dataWISC.drop('diagnosis',axis = 1 )
target_names=['Benign','Malignant']
le= LabelEncoder()
le.fit(y)
y_le = le.transform(y)

In [4]:
x_new = x
x_new.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
49,13.49,22.3,86.91,561.0,0.08752,0.07698,0.04751,0.03384,0.1809,0.05718,0.2338,1.353,1.735,20.2,0.004455,0.01382,0.02095,0.01184,0.01641,0.001956,15.15,31.82,99.0,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917
285,12.58,18.4,79.83,489.0,0.08393,0.04216,0.00186,0.002924,0.1697,0.05855,0.2719,1.35,1.721,22.45,0.006383,0.008008,0.00186,0.002924,0.02571,0.002015,13.5,23.08,85.56,564.1,0.1038,0.06624,0.005579,0.008772,0.2505,0.06431
495,14.87,20.21,96.12,680.9,0.09587,0.08345,0.06824,0.04951,0.1487,0.05748,0.2323,1.636,1.596,21.84,0.005415,0.01371,0.02153,0.01183,0.01959,0.001812,16.01,28.48,103.9,783.6,0.1216,0.1388,0.17,0.1017,0.2369,0.06599
391,8.734,16.84,55.27,234.3,0.1039,0.07428,0.0,0.0,0.1985,0.07098,0.5169,2.079,3.167,28.85,0.01582,0.01966,0.0,0.0,0.01865,0.006736,10.17,22.8,64.01,317.0,0.146,0.131,0.0,0.0,0.2445,0.08865
187,11.71,17.19,74.68,420.3,0.09774,0.06141,0.03809,0.03239,0.1516,0.06095,0.2451,0.7655,1.742,17.86,0.006905,0.008704,0.01978,0.01185,0.01897,0.001671,13.01,21.39,84.42,521.5,0.1323,0.104,0.1521,0.1099,0.2572,0.07097


In [5]:
# https://machinelearningmastery.com/loocv-for-evaluating-machine-learning-algorithms/#:~:text=Given%20the%20improved%20estimate%20of,biased%20estimates%20of%20model%20performance.
# cv = LeaveOneOut()

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
cv=KFold(n_splits=10, shuffle=True, random_state=13)

originalclass = []
predictedclass = []

def classification_report_with_accuracy_score(y_true, y_pred):
  originalclass.extend(y_true)
  predictedclass.extend(y_pred)
  #print(classification_report(y_true, y_pred, target_names=target_names)) 
  return accuracy_score(y_true, y_pred)

def print_best_params(grid_search):
    print("")
    print("Best hyperparameters : ", grid_search.best_params_)
    print("")
    print("Best estimator : ", grid_search.best_estimator_)
    print("")

<a id='3'></a>
## 3) [Gaussian Naive Bayes](<https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>)

* Default hyperparameters

In [6]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_gnb = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('gnb', GaussianNB())])
score = cross_val_score(clf_gnb, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.919     0.962     0.940       212
   Malignant      0.960     0.915     0.937       212

    accuracy                          0.939       424
   macro avg      0.940     0.939     0.939       424
weighted avg      0.940     0.939     0.939       424



* Hyperparameter tuning using Grid Search

In [7]:
param_grid = { 'gnb__var_smoothing': np.logspace(0,-10, num=100) }

grid_search = GridSearchCV(clf_gnb, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits

Best hyperparameters :  {'gnb__var_smoothing': 0.002364489412645407}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('gnb', GaussianNB(var_smoothing=0.002364489412645407))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gnb__var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
50,0.005385,0.0008,0.002792,0.001074,9e-06,{'gnb__var_smoothing': 8.902150854450392e-06},0.952851,0.976282,0.904444,0.930081,0.976177,0.952273,0.923311,0.855835,0.902778,1.0,0.937403,0.040723,1
75,0.005537,0.002212,0.002094,0.000536,0.0,{'gnb__var_smoothing': 2.6560877829466893e-08},0.952851,0.976282,0.904444,0.930081,0.976177,0.952273,0.923311,0.855835,0.902778,1.0,0.937403,0.040723,1
73,0.004488,0.00092,0.001895,0.000299,0.0,{'gnb__var_smoothing': 4.229242874389499e-08},0.952851,0.976282,0.904444,0.930081,0.976177,0.952273,0.923311,0.855835,0.902778,1.0,0.937403,0.040723,1


* Tuned hyperparameters

In [8]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_gnb = Pipeline(steps=[('scaler', StandardScaler()),('pca',PCA(n_components=7)),
                ('gnb', GaussianNB(var_smoothing=0.002364489412645407))])

score = cross_val_score(clf_gnb, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.919     0.962     0.940       212
   Malignant      0.960     0.915     0.937       212

    accuracy                          0.939       424
   macro avg      0.940     0.939     0.939       424
weighted avg      0.940     0.939     0.939       424



<a id='4'></a>
## 4) [Linear Discriminant Analysis](<https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html>)

* Default hyperparameters

In [9]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lda = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('lda', LinearDiscriminantAnalysis())])

score = cross_val_score(clf_lda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.925     0.991     0.957       212
   Malignant      0.990     0.920     0.954       212

    accuracy                          0.955       424
   macro avg      0.957     0.955     0.955       424
weighted avg      0.957     0.955     0.955       424



* Hyperparameter tuning using Grid Search

In [10]:
param_grid = {
    'lda__solver' : ['svd','lsqr','eigen'],
    'lda__shrinkage':[None,'auto'],
    'lda__tol': [0.0001,0.001,0.01,0.1]
}

grid_search = GridSearchCV(clf_lda, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 24 candidates, totalling 240 fits

Best hyperparameters :  {'lda__shrinkage': None, 'lda__solver': 'svd', 'lda__tol': 0.0001}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('lda', LinearDiscriminantAnalysis())])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lda__shrinkage,param_lda__solver,param_lda__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007979,0.001839,0.00369,0.001841,,svd,0.0001,"{'lda__shrinkage': None, 'lda__solver': 'svd',...",0.906522,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.954638,0.026161,1
7,0.00768,0.002404,0.002793,0.001397,,lsqr,0.1,"{'lda__shrinkage': None, 'lda__solver': 'lsqr'...",0.906522,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.954638,0.026161,1
11,0.005785,0.002554,0.005585,0.010462,,eigen,0.1,"{'lda__shrinkage': None, 'lda__solver': 'eigen...",0.906522,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.954638,0.026161,1


* Tuned hyperparameters

In [11]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lda = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('lda', LinearDiscriminantAnalysis(shrinkage=None,solver='svd',tol=0.0001))])

score = cross_val_score(clf_lda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.925     0.991     0.957       212
   Malignant      0.990     0.920     0.954       212

    accuracy                          0.955       424
   macro avg      0.957     0.955     0.955       424
weighted avg      0.957     0.955     0.955       424



<a id='5'></a>
## 5) [Quadratic Discriminant Analysis](<https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html>)

* Default hyperparameters

In [12]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_qda = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('qda', QuadraticDiscriminantAnalysis())])

score = cross_val_score(clf_qda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.954     0.981     0.967       212
   Malignant      0.981     0.953     0.967       212

    accuracy                          0.967       424
   macro avg      0.967     0.967     0.967       424
weighted avg      0.967     0.967     0.967       424



* Hyperparameter tuning using Grid Search

In [13]:
param_grid = {
    'qda__reg_param': np.linspace(0, 1, num=10),
    'qda__tol': [0.0001,0.001,0.01]
}

grid_search = GridSearchCV(clf_qda, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 30 candidates, totalling 300 fits

Best hyperparameters :  {'qda__reg_param': 0.5555555555555556, 'qda__tol': 0.0001}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('qda',
                 QuadraticDiscriminantAnalysis(reg_param=0.5555555555555556))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_qda__reg_param,param_qda__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
15,0.005196,0.00264,0.002204,0.000394,0.555556,0.0001,"{'qda__reg_param': 0.5555555555555556, 'qda__t...",0.976282,0.952851,0.952851,0.930081,1.0,0.976068,0.974437,0.951389,0.951389,1.0,0.966535,0.021565,1
17,0.0058,0.003857,0.0026,0.001121,0.555556,0.01,"{'qda__reg_param': 0.5555555555555556, 'qda__t...",0.976282,0.952851,0.952851,0.930081,1.0,0.976068,0.974437,0.951389,0.951389,1.0,0.966535,0.021565,1
16,0.004195,0.000401,0.002402,0.000799,0.555556,0.001,"{'qda__reg_param': 0.5555555555555556, 'qda__t...",0.976282,0.952851,0.952851,0.930081,1.0,0.976068,0.974437,0.951389,0.951389,1.0,0.966535,0.021565,1


* Tuned hyperparameters

In [14]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_qda = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('qda', QuadraticDiscriminantAnalysis(reg_param=0.5555555555555556,tol=0.0001))])

score = cross_val_score(clf_qda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.950     0.986     0.968       212
   Malignant      0.985     0.948     0.966       212

    accuracy                          0.967       424
   macro avg      0.968     0.967     0.967       424
weighted avg      0.968     0.967     0.967       424



<a id='6'></a>
## 6) [Ridge Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier>)

* Default hyperparameters

In [15]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rc = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('rg', RidgeClassifier())])

score = cross_val_score(clf_rc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.925     0.991     0.957       212
   Malignant      0.990     0.920     0.954       212

    accuracy                          0.955       424
   macro avg      0.957     0.955     0.955       424
weighted avg      0.957     0.955     0.955       424



* Hyperparameter tuning using Grid Search

In [17]:
param_grid = {
    'rg__alpha' : np.linspace(0, 1, num=10),
    'rg__fit_intercept' : [True,False],
    'rg__copy_X' : [True,False],
    'rg__max_iter' : [None],
    'rg__tol' : [0.001],
    'rg__class_weight' : [None,'balanced'],
    'rg__solver' : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
    'rg__positive' : [False]
}

grid_search = GridSearchCV(clf_rc, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 560 candidates, totalling 5600 fits

Best hyperparameters :  {'rg__alpha': 0.0, 'rg__class_weight': None, 'rg__copy_X': True, 'rg__fit_intercept': False, 'rg__max_iter': None, 'rg__positive': False, 'rg__solver': 'svd', 'rg__tol': 0.001}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('rg',
                 RidgeClassifier(alpha=0.0, fit_intercept=False,
                                 solver='svd'))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rg__alpha,param_rg__class_weight,param_rg__copy_X,param_rg__fit_intercept,param_rg__max_iter,param_rg__positive,param_rg__solver,param_rg__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
558,0.005541,0.000787,0.003247,0.002704,1.0,balanced,False,False,,False,saga,0.001,"{'rg__alpha': 1.0, 'rg__class_weight': 'balanc...",0.906522,0.976282,0.928847,0.906522,0.976177,0.976068,0.973668,0.951389,0.97551,0.97551,0.95465,0.028162,1
445,0.014708,0.007209,0.004202,0.002084,0.777778,balanced,False,False,,False,sag,0.001,"{'rg__alpha': 0.7777777777777777, 'rg__class_w...",0.906522,0.976282,0.928847,0.906522,0.976177,0.976068,0.973668,0.951389,0.97551,0.97551,0.95465,0.028162,1
233,0.005098,0.000295,0.002151,0.000707,0.444444,,True,False,,False,lsqr,0.001,"{'rg__alpha': 0.4444444444444444, 'rg__class_w...",0.906522,0.976282,0.928847,0.906522,0.976177,0.976068,0.973668,0.951389,0.97551,0.97551,0.95465,0.028162,1


* Tuned hyperparameters

In [18]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rc = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('rg',RidgeClassifier(alpha=0.0, fit_intercept=False,
                                 solver='svd'))])

score = cross_val_score(clf_rc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.925     0.991     0.957       212
   Malignant      0.990     0.920     0.954       212

    accuracy                          0.955       424
   macro avg      0.957     0.955     0.955       424
weighted avg      0.957     0.955     0.955       424



<a id='7'></a>
## 7) [Decision Tree Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>)

* Default hyperparameters

In [19]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_tree = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('tree', DecisionTreeClassifier(random_state=13))])

score = cross_val_score(clf_tree, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.947     0.934     0.941       212
   Malignant      0.935     0.948     0.941       212

    accuracy                          0.941       424
   macro avg      0.941     0.941     0.941       424
weighted avg      0.941     0.941     0.941       424



* Hyperparameter tuning using Grid Search

In [20]:
param_grid = {
    'tree__criterion' :['gini','entropy'],
    'tree__splitter' : ['best','random'],
    'tree__max_depth': [2,6,10,None],
    'tree__min_samples_split': list(range(2, 4)),
    'tree__min_samples_leaf': [3,5],
    'tree__min_weight_fraction_leaf' : [0.0],
    'tree__max_features': [None, 'sqrt', 'log2'],
    'tree__max_leaf_nodes' : [None,10,50],
    'tree__min_impurity_decrease' : [0.0],
    'tree__class_weight' : [None,'balanced'],
    'tree__ccp_alpha' : [0.0],
    'tree__random_state' : [13]
}

grid_search = GridSearchCV(clf_tree, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 1152 candidates, totalling 11520 fits

Best hyperparameters :  {'tree__ccp_alpha': 0.0, 'tree__class_weight': None, 'tree__criterion': 'gini', 'tree__max_depth': 6, 'tree__max_features': None, 'tree__max_leaf_nodes': 10, 'tree__min_impurity_decrease': 0.0, 'tree__min_samples_leaf': 5, 'tree__min_samples_split': 2, 'tree__min_weight_fraction_leaf': 0.0, 'tree__random_state': 13, 'tree__splitter': 'best'}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('tree',
                 DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10,
                                        min_samples_leaf=5, random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tree__ccp_alpha,param_tree__class_weight,param_tree__criterion,param_tree__max_depth,param_tree__max_features,param_tree__max_leaf_nodes,param_tree__min_impurity_decrease,param_tree__min_samples_leaf,param_tree__min_samples_split,param_tree__min_weight_fraction_leaf,param_tree__random_state,param_tree__splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
662,0.005448,0.000651,0.002248,0.000601,0.0,balanced,gini,6.0,,10,0.0,5,3,0.0,13,best,"{'tree__ccp_alpha': 0.0, 'tree__class_weight':...",0.976282,0.952851,0.882706,0.953261,0.976177,0.904762,0.974437,0.926531,1.0,0.950588,0.949759,0.034084,1
660,0.006101,0.002667,0.0019,0.000301,0.0,balanced,gini,6.0,,10,0.0,5,2,0.0,13,best,"{'tree__ccp_alpha': 0.0, 'tree__class_weight':...",0.976282,0.952851,0.882706,0.953261,0.976177,0.904762,0.974437,0.926531,1.0,0.950588,0.949759,0.034084,1
806,0.005252,0.000416,0.00205,0.000566,0.0,balanced,gini,,,10,0.0,5,3,0.0,13,best,"{'tree__ccp_alpha': 0.0, 'tree__class_weight':...",0.976282,0.952851,0.882706,0.953261,0.976177,0.904762,0.974437,0.926531,1.0,0.950588,0.949759,0.034084,1


* Tuned hyperparameters

In [21]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_tree = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('tree',DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10,
                                        min_samples_leaf=5, random_state=13))])

score = cross_val_score(clf_tree, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.948     0.953     0.951       212
   Malignant      0.953     0.948     0.950       212

    accuracy                          0.950       424
   macro avg      0.950     0.950     0.950       424
weighted avg      0.950     0.950     0.950       424



<a id='8'></a>
## 8) [Random Forest Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html>)

* Default hyperparameters

In [22]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rf = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('rf', RandomForestClassifier(random_state=13))])
                       
score = cross_val_score(clf_rf, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.958     0.972     0.965       212
   Malignant      0.971     0.958     0.964       212

    accuracy                          0.965       424
   macro avg      0.965     0.965     0.965       424
weighted avg      0.965     0.965     0.965       424



* Hyperparameter tuning using Grid Search

In [43]:
param_grid = {
    'rf__bootstrap': [True,False],
    'rf__max_depth': [5, 10 , None],
    'rf__n_estimators' : [10,50,100,200,500],
    'rf__max_features': [None, 'sqrt', 'log2'],
    'rf__max_leaf_nodes' : [None,5,10],
    'rf__min_samples_leaf': [1,3,5],
    'rf__min_samples_split': list(range(2, 6)),
    'rf__criterion' :['entropy','gini'],
    'rf__random_state' : [13]
}

grid_search = GridSearchCV(clf_rf, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 6480 candidates, totalling 64800 fits

Best hyperparameters :  {'rf__bootstrap': True, 'rf__criterion': 'entropy', 'rf__max_depth': 10, 'rf__max_features': None, 'rf__max_leaf_nodes': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 200, 'rf__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('rf',
                 RandomForestClassifier(criterion='entropy', max_depth=10,
                                        max_features=None, n_estimators=200,
                                        random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__bootstrap,param_rf__criterion,param_rf__max_depth,param_rf__max_features,param_rf__max_leaf_nodes,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,param_rf__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
558,0.361533,0.022356,0.021443,0.005357,True,entropy,10.0,,,1,5,200,13,"{'rf__bootstrap': True, 'rf__criterion': 'entr...",1.0,0.976282,0.952851,0.953261,0.976177,0.952273,0.974437,0.927545,0.97551,0.975045,0.966338,0.019,1
1093,0.381779,0.019271,0.021143,0.00321,True,entropy,,,,1,4,200,13,"{'rf__bootstrap': True, 'rf__criterion': 'entr...",1.0,0.976282,0.952851,0.953261,0.976177,0.952273,0.974437,0.927545,0.97551,0.975045,0.966338,0.019,1
1098,0.366021,0.027779,0.019448,0.002241,True,entropy,,,,1,5,200,13,"{'rf__bootstrap': True, 'rf__criterion': 'entr...",1.0,0.976282,0.952851,0.953261,0.976177,0.952273,0.974437,0.927545,0.97551,0.975045,0.966338,0.019,1


* Tuned hyperparameters

In [44]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rf = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('rf', RandomForestClassifier(criterion='entropy', max_depth=10,
                                        max_features=None, n_estimators=200,
                                        random_state=13))])
                       
score = cross_val_score(clf_rf, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.963     0.972     0.967       212
   Malignant      0.971     0.962     0.967       212

    accuracy                          0.967       424
   macro avg      0.967     0.967     0.967       424
weighted avg      0.967     0.967     0.967       424



<a id='9'></a>
## 9) [ADA Boost Classifier (Adaptive Boosting)](<https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#:~:text=An%20AdaBoost%20%5B1%5D%20classifier%20is,focus%20more%20on%20difficult%20cases.>)

* Default hyperparameters

In [34]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_adaboost = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('adab', AdaBoostClassifier(random_state=13))])

score = cross_val_score(clf_adaboost, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.958     0.962       212
   Malignant      0.958     0.967     0.962       212

    accuracy                          0.962       424
   macro avg      0.962     0.962     0.962       424
weighted avg      0.962     0.962     0.962       424



* Hyperparameter tuning using Grid Search

In [35]:
param_grid = {
    'adab__base_estimator' : [DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10,min_samples_leaf=5, random_state=13)],
    'adab__n_estimators' : [10,50,100,500],
    'adab__learning_rate' : np.power(10, np.arange(-3, 1, dtype=float)),
    'adab__algorithm' : ['SAMME', 'SAMME.R'],
    'adab__random_state' : [13],
}

grid_search = GridSearchCV(clf_adaboost, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits

Best hyperparameters :  {'adab__algorithm': 'SAMME', 'adab__base_estimator': DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10, min_samples_leaf=5,
                       random_state=13), 'adab__learning_rate': 1.0, 'adab__n_estimators': 100, 'adab__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('adab',
                 AdaBoostClassifier(algorithm='SAMME',
                                    base_estimator=DecisionTreeClassifier(max_depth=6,
                                                                          max_leaf_nodes=10,
                                                                          min_samples_leaf=5,
                                                                          random_state=13),
                                    n_estimators=100, random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adab__algorithm,param_adab__base_estimator,param_adab__learning_rate,param_adab__n_estimators,param_adab__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
14,0.192684,0.006864,0.008976,0.000997,SAMME,"DecisionTreeClassifier(max_depth=6, max_leaf_n...",1.0,100,13,"{'adab__algorithm': 'SAMME', 'adab__base_estim...",0.976282,0.976282,0.952851,0.953261,1.0,0.952273,0.974437,0.975848,1.0,1.0,0.976123,0.018293,1
15,0.947964,0.040145,0.039694,0.003051,SAMME,"DecisionTreeClassifier(max_depth=6, max_leaf_n...",1.0,500,13,"{'adab__algorithm': 'SAMME', 'adab__base_estim...",0.976282,0.976282,0.952851,0.953261,1.0,0.952273,0.949519,0.975848,1.0,1.0,0.973632,0.019973,2
13,0.102825,0.006331,0.006782,0.004083,SAMME,"DecisionTreeClassifier(max_depth=6, max_leaf_n...",1.0,50,13,"{'adab__algorithm': 'SAMME', 'adab__base_estim...",0.976282,0.976282,0.952851,0.930081,1.0,0.952273,0.974437,0.975848,0.97551,1.0,0.971357,0.020363,3


* Tuned hyperparameters

In [36]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_adaboost = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('adab',AdaBoostClassifier(algorithm='SAMME',
                                    base_estimator=DecisionTreeClassifier(max_depth=6,
                                                                          max_leaf_nodes=10,
                                                                          min_samples_leaf=5,
                                                                          random_state=13),
                                    n_estimators=100, random_state=13))])

score = cross_val_score(clf_adaboost, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.968     0.986     0.977       212
   Malignant      0.986     0.967     0.976       212

    accuracy                          0.976       424
   macro avg      0.977     0.976     0.976       424
weighted avg      0.977     0.976     0.976       424



<a id='10'></a>
## 10) [C-Support Vector Classification](<https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>)

* Default hyperparameters

In [23]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_svc = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('svc', SVC())])

score = cross_val_score(clf_svc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.954     0.976     0.965       212
   Malignant      0.976     0.953     0.964       212

    accuracy                          0.965       424
   macro avg      0.965     0.965     0.965       424
weighted avg      0.965     0.965     0.965       424



* Hyperparameter tuning using Grid Search

In [24]:
param_grid = [
    {
        'svc__kernel': ['rbf'], 
        'svc__gamma': [1e-2, 1e-3, 1e-4,'auto','scale'], 
        'svc__C': [1, 10, 100, 1000],
        'svc__decision_function_shape': ['ovo', 'ovr'],
        'svc__random_state' : [13]
    },
    {
        'svc__kernel': ['linear'], 
        'svc__C': [1, 10, 100, 1000],
        'svc__decision_function_shape': ['ovo', 'ovr'],
        'svc__random_state' : [13]
    },
]

grid_search = GridSearchCV(clf_svc, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 48 candidates, totalling 480 fits

Best hyperparameters :  {'svc__C': 10, 'svc__decision_function_shape': 'ovo', 'svc__gamma': 'scale', 'svc__kernel': 'rbf', 'svc__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('svc',
                 SVC(C=10, decision_function_shape='ovo', random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__decision_function_shape,param_svc__gamma,param_svc__kernel,param_svc__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
19,0.011094,0.008218,0.003203,0.00105,10,ovr,scale,rbf,13,"{'svc__C': 10, 'svc__decision_function_shape':...",0.976282,1.0,0.952851,0.929624,1.0,0.976177,0.949519,0.975848,1.0,1.0,0.97603,0.023858,1
14,0.006284,0.001997,0.002393,0.000488,10,ovo,scale,rbf,13,"{'svc__C': 10, 'svc__decision_function_shape':...",0.976282,1.0,0.952851,0.929624,1.0,0.976177,0.949519,0.975848,1.0,1.0,0.97603,0.023858,1
20,0.013904,0.008783,0.003294,0.001007,100,ovo,0.01,rbf,13,"{'svc__C': 100, 'svc__decision_function_shape'...",0.976282,0.952851,0.952851,0.929624,1.0,0.976177,0.949519,0.975848,1.0,1.0,0.971315,0.023308,3


* Tuned hyperparameters

In [25]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_svc = Pipeline(steps=[('scaler', StandardScaler()),('pca',PCA(n_components=7)),
                ('svc',SVC(C=10, decision_function_shape='ovo',random_state=13))])

score = cross_val_score(clf_svc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.976     0.976     0.976       212
   Malignant      0.976     0.976     0.976       212

    accuracy                          0.976       424
   macro avg      0.976     0.976     0.976       424
weighted avg      0.976     0.976     0.976       424



<a id='11'></a>
## 11) [Stochastic Gradient Descent Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html>)

* Default hyperparameters

In [26]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_sgd = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('sgd', SGDClassifier(random_state=13))])

score = cross_val_score(clf_sgd, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.953     0.948     0.950       212
   Malignant      0.948     0.953     0.951       212

    accuracy                          0.950       424
   macro avg      0.950     0.950     0.950       424
weighted avg      0.950     0.950     0.950       424



* Hyperparameter tuning using Grid Search

In [27]:
param_grid = {
    'sgd__average': [True, False],
    'sgd__l1_ratio': np.linspace(0, 1, num=10),
    'sgd__alpha': np.power(10, np.arange(-2, 1, dtype=float)),
    'sgd__random_state' : [13]
}

grid_search = GridSearchCV(clf_sgd, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 60 candidates, totalling 600 fits

Best hyperparameters :  {'sgd__alpha': 0.01, 'sgd__average': True, 'sgd__l1_ratio': 0.0, 'sgd__random_state': 13}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('sgd',
                 SGDClassifier(alpha=0.01, average=True, l1_ratio=0.0,
                               random_state=13))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgd__alpha,param_sgd__average,param_sgd__l1_ratio,param_sgd__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006781,0.001397,0.002694,0.00078,0.01,True,0.0,13,"{'sgd__alpha': 0.01, 'sgd__average': True, 'sg...",0.952851,0.976282,0.929624,0.952851,0.976177,0.952273,1.0,0.975848,1.0,0.975045,0.969095,0.021123,1
2,0.005585,0.000798,0.002294,0.000458,0.01,True,0.222222,13,"{'sgd__alpha': 0.01, 'sgd__average': True, 'sg...",0.952851,0.976282,0.929624,0.952851,0.976177,0.952273,1.0,0.975848,1.0,0.975045,0.969095,0.021123,1
3,0.007979,0.002992,0.006083,0.00835,0.01,True,0.333333,13,"{'sgd__alpha': 0.01, 'sgd__average': True, 'sg...",0.952851,0.976282,0.929624,0.952851,0.976177,0.952273,1.0,0.975848,1.0,0.975045,0.969095,0.021123,1


* Tuned hyperparameters

In [28]:
originalclass = []
predictedclass = []

# Cross validate
clf_sgd = Pipeline(steps=[('scaler', StandardScaler()),('pca',PCA(n_components=7)),
                ('sgd',SGDClassifier(alpha=0.01, average=True , l1_ratio=0.0, random_state=13))])

score = cross_val_score(clf_sgd, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.963     0.976     0.970       212
   Malignant      0.976     0.962     0.969       212

    accuracy                          0.969       424
   macro avg      0.969     0.969     0.969       424
weighted avg      0.969     0.969     0.969       424



<a id='12'></a>
## 12) [eXtreme Gradient Boosting](<https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters>)

* Default hyperparameters

In [37]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_xgb = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('xgb', XGBClassifier(random_state=13))])

score = cross_val_score(clf_xgb, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.963     0.976     0.970       212
   Malignant      0.976     0.962     0.969       212

    accuracy                          0.969       424
   macro avg      0.969     0.969     0.969       424
weighted avg      0.969     0.969     0.969       424



* Hyperparameter tuning using Grid Search

In [38]:
# https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
# https://www.cs.cornell.edu/courses/cs4780/2018sp/lectures/lecturenote19.html
# https://medium.com/data-design/xgboost-hi-im-gamma-what-can-i-do-for-you-and-the-tuning-of-regularization-a42ea17e6ab6

param_grid = {
        'xgb__booster' : ['gbtree'],
        'xgb__validate_parameters' : [True],
        'xgb__learning_rate' : [0.05,0.1,0.3,0.5,1],
        'xgb__gamma' : [0,0.01,0.1,0.5,1],
        'xgb__max_depth' : [2,6,10],
        'xgb__min_child_weight' : [1,3,5],
        'xgb__max_delta_step' : [0,2,4],
        'xgb__subsample' : [0.5],
        'xgb__colsample_bylevel' : [1],
        'xgb__colsample_bynode' : [1],
        'xgb__colsample_bytree' : [1],
        'xgb__reg_lambda' : [0,1],
        'xgb__reg_alpha' : [0],
        'xgb__tree_method' : ['exact'],
        'xgb__scale_pos_weight' : [1],
        'xgb__objective' : ['binary:logistic'], # 'multi:softmax' -> same scores as 'binary:logistic'
        #'num_class' : [2],
        'xgb__n_estimators' : [50,100,200,500],
        'xgb__random_state' : [13]
    }

grid_search = GridSearchCV(clf_xgb, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y_le)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 5400 candidates, totalling 54000 fits

Best hyperparameters :  {'xgb__booster': 'gbtree', 'xgb__colsample_bylevel': 1, 'xgb__colsample_bynode': 1, 'xgb__colsample_bytree': 1, 'xgb__gamma': 0, 'xgb__learning_rate': 0.5, 'xgb__max_delta_step': 2, 'xgb__max_depth': 10, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 50, 'xgb__objective': 'binary:logistic', 'xgb__random_state': 13, 'xgb__reg_alpha': 0, 'xgb__reg_lambda': 0, 'xgb__scale_pos_weight': 1, 'xgb__subsample': 0.5, 'xgb__tree_method': 'exact', 'xgb__validate_parameters': True}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
      

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__booster,param_xgb__colsample_bylevel,param_xgb__colsample_bynode,param_xgb__colsample_bytree,param_xgb__gamma,param_xgb__learning_rate,param_xgb__max_delta_step,param_xgb__max_depth,param_xgb__min_child_weight,param_xgb__n_estimators,param_xgb__objective,param_xgb__random_state,param_xgb__reg_alpha,param_xgb__reg_lambda,param_xgb__scale_pos_weight,param_xgb__subsample,param_xgb__tree_method,param_xgb__validate_parameters,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
770,0.109208,0.030242,0.007679,0.00858,gbtree,1,1,1,0,0.5,2,10,1,100,binary:logistic,13,0,0,1,0.5,exact,True,"{'xgb__booster': 'gbtree', 'xgb__colsample_byl...",1.0,1.0,0.952851,0.953261,1.0,0.928531,0.974437,0.975848,0.97551,1.0,0.976044,0.023692,1
768,0.072606,0.02765,0.004587,0.001425,gbtree,1,1,1,0,0.5,2,10,1,50,binary:logistic,13,0,0,1,0.5,exact,True,"{'xgb__booster': 'gbtree', 'xgb__colsample_byl...",1.0,1.0,0.952851,0.953261,1.0,0.928531,0.974437,0.975848,0.97551,1.0,0.976044,0.023692,1
4444,0.152592,0.009366,0.003989,0.001727,gbtree,1,1,1,1,0.05,2,10,1,200,binary:logistic,13,0,0,1,0.5,exact,True,"{'xgb__booster': 'gbtree', 'xgb__colsample_byl...",1.0,0.976282,0.929624,0.953261,1.0,0.952273,0.974437,0.975848,1.0,0.975045,0.973677,0.022155,3


* Tuned hyperparameters

In [42]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_xgb = Pipeline(steps=[('scaler', StandardScaler()), ('pca',PCA(n_components=7)),
                ('xgb',XGBClassifier(booster='gbtree',gamma=0,learning_rate=0.5,max_delta_step=2,max_depth=10,min_child_weight=1,
                                    n_estimators=100,objective='binary:logistic',reg_alpha=0,reg_lambda=0,scale_pos_weight=1,subsample=0.5,
                                    tree_method='exact',validate_parameters=True,random_state=13))])

score = cross_val_score(clf_xgb, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.972     0.981     0.977       212
   Malignant      0.981     0.972     0.976       212

    accuracy                          0.976       424
   macro avg      0.976     0.976     0.976       424
weighted avg      0.976     0.976     0.976       424



<a id='13'></a>
## 13) [Light Gradient Boosting Machine](<https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>)

* Default hyperparameters

In [30]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lgbm = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('lgbm', lgbm.LGBMClassifier(random_state=13))])

score = cross_val_score(clf_lgbm, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.972     0.972     0.972       212
   Malignant      0.972     0.972     0.972       212

    accuracy                          0.972       424
   macro avg      0.972     0.972     0.972       424
weighted avg      0.972     0.972     0.972       424



* Hyperparameter tuning using Grid Search

In [31]:
# https://neptune.ai/blog/lightgbm-parameters-guide
# https://www.youtube.com/watch?v=5CWwwtEM2TA&ab_channel=PyData & https://github.com/MSusik/newgradientboosting/blob/master/pydata.pdf

param_grid = {
        'lgbm__boosting_type' : ['gbdt','dart'],
        'lgbm__num_leaves' : [10,20,30,40,50],
        'lgbm__max_depth' : [3,6,9,-1],
        'lgbm__learning_rate' : [0.05,0.1,0.3,0.5,1],
        'lgbm__n_estimators' : [50,100,200,500],
        'lgbm__objective' : ['binary'],
        'lgbm__min_child_samples' : [10,20,30],
        'lgbm__subsample' : [0.5],
        'lgbm__reg_lambda' : [0,1],
        'lgbm__reg_alpha' : [0],
        'lgbm__subsample' : [0.5],
        'lgbm__colsample_bytree' : [1],
        'lgbm__scale_pos_weight' : [1],
        'lgbm__random_state' : [13]
    }

grid_search = GridSearchCV(clf_lgbm, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y_le)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 4800 candidates, totalling 48000 fits

Best hyperparameters :  {'lgbm__boosting_type': 'dart', 'lgbm__colsample_bytree': 1, 'lgbm__learning_rate': 1, 'lgbm__max_depth': 9, 'lgbm__min_child_samples': 20, 'lgbm__n_estimators': 200, 'lgbm__num_leaves': 20, 'lgbm__objective': 'binary', 'lgbm__random_state': 13, 'lgbm__reg_alpha': 0, 'lgbm__reg_lambda': 1, 'lgbm__scale_pos_weight': 1, 'lgbm__subsample': 0.5}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('lgbm',
                 LGBMClassifier(boosting_type='dart', colsample_bytree=1,
                                learning_rate=1, max_depth=9, n_estimators=200,
                                num_leaves=20, objective='binary',
                                random_state=13, reg_alpha=0, reg_lambda=1,
                                scale_pos_weight=1, subsample=0.5))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbm__boosting_type,param_lgbm__colsample_bytree,param_lgbm__learning_rate,param_lgbm__max_depth,param_lgbm__min_child_samples,param_lgbm__n_estimators,param_lgbm__num_leaves,param_lgbm__objective,param_lgbm__random_state,param_lgbm__reg_alpha,param_lgbm__reg_lambda,param_lgbm__scale_pos_weight,param_lgbm__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
4627,0.06393,0.00226,0.002194,0.000399,dart,1,1,9,20,200,40,binary,13,0,1,1,0.5,"{'lgbm__boosting_type': 'dart', 'lgbm__colsamp...",1.0,1.0,0.952851,0.953261,1.0,0.976177,0.974437,0.975848,1.0,0.975045,0.980762,0.017702,1
4747,0.086723,0.026985,0.002494,0.000499,dart,1,1,-1,20,200,40,binary,13,0,1,1,0.5,"{'lgbm__boosting_type': 'dart', 'lgbm__colsamp...",1.0,1.0,0.952851,0.953261,1.0,0.976177,0.974437,0.975848,1.0,0.975045,0.980762,0.017702,1
4625,0.062989,0.001897,0.003391,0.002054,dart,1,1,9,20,200,30,binary,13,0,1,1,0.5,"{'lgbm__boosting_type': 'dart', 'lgbm__colsamp...",1.0,1.0,0.952851,0.953261,1.0,0.976177,0.974437,0.975848,1.0,0.975045,0.980762,0.017702,1


* Tuned hyperparameters

In [32]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lgbm = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('lgbm',lgbm.LGBMClassifier(boosting_type='dart', colsample_bytree=1,
                                learning_rate=1, max_depth=9, n_estimators=200,
                                num_leaves=20, objective='binary',
                                random_state=13, reg_alpha=0, reg_lambda=1,
                                scale_pos_weight=1, subsample=0.5))])

score = cross_val_score(clf_lgbm, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.981     0.981     0.981       212
   Malignant      0.981     0.981     0.981       212

    accuracy                          0.981       424
   macro avg      0.981     0.981     0.981       424
weighted avg      0.981     0.981     0.981       424



<a id='14'></a>
## 14) [K-Nearest Neighbors Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html>)

* Default hyperparameters

In [24]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_knn = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('knn', KNeighborsClassifier())])

score = cross_val_score(clf_knn, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.945     0.981     0.963       212
   Malignant      0.980     0.943     0.962       212

    accuracy                          0.962       424
   macro avg      0.963     0.962     0.962       424
weighted avg      0.963     0.962     0.962       424



* Hyperparameter tuning using Grid Search

In [25]:
param_grid = {
    'knn__n_neighbors': list(range(2,10)),
    'knn__weights': ['uniform','distance'],
    'knn__algorithm' : ['ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10,20,30,40,50],
    'knn__p': [1,2],
    'knn__metric': ['minkowski','manhattan','chebyshev']
}

grid_search = GridSearchCV(clf_knn, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 1440 candidates, totalling 14400 fits

Best hyperparameters :  {'knn__algorithm': 'ball_tree', 'knn__leaf_size': 10, 'knn__metric': 'minkowski', 'knn__n_neighbors': 2, 'knn__p': 1, 'knn__weights': 'distance'}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('knn',
                 KNeighborsClassifier(algorithm='ball_tree', leaf_size=10,
                                      n_neighbors=2, p=1,
                                      weights='distance'))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__algorithm,param_knn__leaf_size,param_knn__metric,param_knn__n_neighbors,param_knn__p,param_knn__weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
801,0.004601,0.000439,0.005051,0.006853,kd_tree,40,manhattan,2,1,distance,"{'knn__algorithm': 'kd_tree', 'knn__leaf_size'...",1.0,0.976282,0.952851,0.953261,0.976177,0.976177,1.0,0.975848,1.0,1.0,0.98106,0.01761,1
131,0.004489,0.000804,0.002692,0.000897,ball_tree,20,manhattan,2,2,distance,"{'knn__algorithm': 'ball_tree', 'knn__leaf_siz...",1.0,0.976282,0.952851,0.953261,0.976177,0.976177,1.0,0.975848,1.0,1.0,0.98106,0.01761,1
1345,0.004049,0.000478,0.002951,0.0012,brute,50,minkowski,2,1,distance,"{'knn__algorithm': 'brute', 'knn__leaf_size': ...",1.0,0.976282,0.952851,0.953261,0.976177,0.976177,1.0,0.975848,1.0,1.0,0.98106,0.01761,1


* Tuned hyperparameters

In [26]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_knn = Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('knn',KNeighborsClassifier(algorithm='ball_tree', leaf_size=10,
                                      n_neighbors=2, p=1,
                                      weights='distance'))])

score = cross_val_score(clf_knn, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.977     0.986     0.981       212
   Malignant      0.986     0.976     0.981       212

    accuracy                          0.981       424
   macro avg      0.981     0.981     0.981       424
weighted avg      0.981     0.981     0.981       424



<a id='15'></a>
## 15) [Multi-layer Perceptron Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html>)

* Default hyperparameters

In [17]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_mlp =  Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('mlp', MLPClassifier(random_state=13))])

score = cross_val_score(clf_mlp, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.972     0.976     0.974       212
   Malignant      0.976     0.972     0.974       212

    accuracy                          0.974       424
   macro avg      0.974     0.974     0.974       424
weighted avg      0.974     0.974     0.974       424



* Hyperparameter tuning using Grid Search

In [18]:
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
param_grid = {
    'mlp__hidden_layer_sizes' : [(14,28,)],
    'mlp__activation' : ['tanh','relu'],
    'mlp__solver' : ['sgd','adam'],
    'mlp__alpha' : [0.01,0,2],
    'mlp__batch_size' : [40,80,'auto'],
    'mlp__learning_rate' : ['invscaling','adaptive'],
    'mlp__learning_rate_init' : np.power(10, np.arange(-3, 0, dtype=float)),
    'mlp__power_t' : [0.5],
    'mlp__max_iter' : [50,100,200,500],
    'mlp__shuffle' : [True],
    'mlp__random_state' : [13]
}

grid_search = GridSearchCV(clf_mlp, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='rank_test_score').head(3)

Fitting 10 folds for each of 864 candidates, totalling 8640 fits

Best hyperparameters :  {'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__batch_size': 40, 'mlp__hidden_layer_sizes': (14, 28), 'mlp__learning_rate': 'adaptive', 'mlp__learning_rate_init': 0.01, 'mlp__max_iter': 100, 'mlp__power_t': 0.5, 'mlp__random_state': 13, 'mlp__shuffle': True, 'mlp__solver': 'sgd'}

Best estimator :  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('mlp',
                 MLPClassifier(alpha=0.01, batch_size=40,
                               hidden_layer_sizes=(14, 28),
                               learning_rate='adaptive',
                               learning_rate_init=0.01, max_iter=100,
                               random_state=13, solver='sgd'))])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__batch_size,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__learning_rate_init,param_mlp__max_iter,param_mlp__power_t,param_mlp__random_state,param_mlp__shuffle,param_mlp__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
466,0.333707,0.005853,0.001995,0.0004459018,relu,0.01,40,"(14, 28)",adaptive,0.01,100,0.5,13,True,sgd,"{'mlp__activation': 'relu', 'mlp__alpha': 0.01...",0.976282,1.0,0.952851,0.976541,1.0,0.976177,1.0,0.951389,1.0,1.0,0.983324,0.01867,1
485,0.431945,0.012005,0.001995,2.611745e-07,relu,0.01,80,"(14, 28)",invscaling,0.001,200,0.5,13,True,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 0.01...",1.0,1.0,0.929624,0.953261,1.0,0.976177,1.0,0.951389,1.0,1.0,0.981045,0.025446,2
509,0.432643,0.017244,0.001995,1.907349e-07,relu,0.01,80,"(14, 28)",adaptive,0.001,200,0.5,13,True,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 0.01...",1.0,1.0,0.929624,0.953261,1.0,0.976177,1.0,0.951389,1.0,1.0,0.981045,0.025446,2


* Tuned hyperparameters

In [19]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_mlp =  Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=7)),
                ('mlp',MLPClassifier(alpha=0.01, batch_size=40,
                               hidden_layer_sizes=(14, 28),
                               learning_rate='adaptive',
                               learning_rate_init=0.01, max_iter=100,
                               random_state=13, solver='sgd'))])

score = cross_val_score(clf_mlp, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.977     0.991     0.984       212
   Malignant      0.990     0.976     0.983       212

    accuracy                          0.983       424
   macro avg      0.984     0.983     0.983       424
weighted avg      0.984     0.983     0.983       424



* Tried a larger range of hyperparameters for testing at first, but was too time consuming. The worst attempts were then found with the following code and the hyperparameters corresponding to those results were removed.

In [20]:
# print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=True).head(5) # worst 5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__batch_size,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__learning_rate_init,param_mlp__max_iter,param_mlp__power_t,param_mlp__random_state,param_mlp__shuffle,param_mlp__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
528,0.056947,0.001753,0.002095,0.000299112,relu,0.01,auto,"(14, 28)",invscaling,0.001,50,0.5,13,True,sgd,"{'mlp__activation': 'relu', 'mlp__alpha': 0.01...",0.425766,0.417488,0.39359,0.450284,0.384164,0.458983,0.325926,0.402034,0.475,0.373134,0.410637,0.042398,862
672,0.06413,0.005404,0.002095,0.0005371095,relu,0.0,auto,"(14, 28)",invscaling,0.001,50,0.5,13,True,sgd,"{'mlp__activation': 'relu', 'mlp__alpha': 0, '...",0.425766,0.417488,0.39359,0.450284,0.384164,0.458983,0.325926,0.402034,0.475,0.373134,0.410637,0.042398,862
816,0.069965,0.005504,0.002194,0.0005982797,relu,2.0,auto,"(14, 28)",invscaling,0.001,50,0.5,13,True,sgd,"{'mlp__activation': 'relu', 'mlp__alpha': 2, '...",0.425766,0.417488,0.39359,0.450284,0.384164,0.458983,0.325926,0.402034,0.475,0.373134,0.410637,0.042398,862
532,0.093051,0.011809,0.001995,2.249236e-07,relu,0.01,auto,"(14, 28)",invscaling,0.001,200,0.5,13,True,sgd,"{'mlp__activation': 'relu', 'mlp__alpha': 0.01...",0.425766,0.453721,0.39359,0.450284,0.384164,0.458983,0.325926,0.402034,0.475,0.373134,0.41426,0.044332,853
674,0.111955,0.024028,0.003491,0.004488095,relu,0.0,auto,"(14, 28)",invscaling,0.001,100,0.5,13,True,sgd,"{'mlp__activation': 'relu', 'mlp__alpha': 0, '...",0.425766,0.453721,0.39359,0.450284,0.384164,0.458983,0.325926,0.402034,0.475,0.373134,0.41426,0.044332,853


<a id='16'></a>
## 16) Summary

* Below are the tables of the specific feature selection method.
* The performance of the algorithms is in descending order.
* All the results are the average values of a 10-fold cross validation.
* The columns contain the accuracy and the average values of precision, recall and f1 score.
* It is observed that the number of samples of Βenign and Μalignant cancer are equal (212 respectively), so the weighted average and the macro average are equal.

<table>
    <tr>
        <th colspan="5"> PCA : Default algorithms</th>
    </tr>
    <tr>
        <th></th>
        <th>precision </th>
        <th>recall</th>
        <th>f1 score</th>
        <th>accuracy</th>  
    </tr>
    <tr>
        <th>MLP</th>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
        <td>0.974</td>
    </tr>
    <tr>
        <th>LGBM</th>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
        <td>0.972</td>
    </tr>
    <tr>
        <th>XGBoost</th>
        <td>0.969</td>
        <td>0.969</td>
        <td>0.969</td>
        <td>0.969</td>
    </tr>
    <tr>
        <th>QDA</th>
        <td>0.967</td>
        <td>0.967</td>
        <td>0.967</td>
        <td>0.967</td>
    </tr>
    <tr>
        <th>Random Forest</th>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
    </tr>
    <tr>
        <th>SVC</th>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
        <td>0.965</td>
    </tr>
    <tr>
        <th>KNN</th>
        <td>0.963</td>
        <td>0.962</td>
        <td>0.962</td>
        <td>0.962</td>
    </tr>
    <tr>
        <th>AdaBoost</th>
        <td>0.962</td>
        <td>0.962</td>
        <td>0.962</td>
        <td>0.962</td>
    </tr>
    <tr>
        <th>Ridge</th>
        <td>0.957</td>
        <td>0.955</td>
        <td>0.955</td>
        <td>0.955</td>
    </tr>
    <tr>
        <th>LDA</th>
        <td>0.957</td>
        <td>0.955</td>
        <td>0.955</td>
        <td>0.955</td>
    </tr>
    <tr>
        <th>SGD</th>
        <td>0.950</td>
        <td>0.950</td>
        <td>0.950</td>
        <td>0.950</td>
    </tr>
    <tr>
        <th>Decision Tree</th>
        <td>0.941</td>
        <td>0.941</td>
        <td>0.941</td>
        <td>0.941</td>
    </tr>
    <tr>
        <th>GNB</th>
        <td>0.940</td>
        <td>0.939</td>
        <td>0.939</td>
        <td>0.939</td>
    </tr>

</table>

<table>
    <tr>
        <th colspan="5"> PCA : Tuned algorithms</th>
    </tr>
    <tr>
        <th></th>
        <th>precision </th>
        <th>recall</th>
        <th>f1 score</th>
        <th>accuracy</th>  
    </tr>
    <tr>
        <th>MLP</th>
        <td>0.984</td>
        <td>0.983</td>
        <td>0.983</td>
        <td>0.983</td>
    </tr>
    <tr>
        <th>LGBM</th>
        <td>0.981</td>
        <td>0.981</td>
        <td>0.981</td>
        <td>0.981</td>
    </tr>
    <tr>
        <th>KNN</th>
        <td>0.981</td>
        <td>0.981</td>
        <td>0.981</td>
        <td>0.981</td>
    </tr>
    <tr>
        <th>AdaBoost</th>
        <td>0.977</td>
        <td>0.976</td>
        <td>0.976</td>
        <td>0.976</td>
    </tr>
    <tr>
        <th>SVC</th>
        <td>0.976</td>
        <td>0.976</td>
        <td>0.976</td>
        <td>0.976</td>
    </tr>
    <tr>
        <th>XGBoost</th>
        <td>0.976</td>
        <td>0.976</td>
        <td>0.976</td>
        <td>0.976</td>
    </tr>
    <tr>
        <th>SGD</th>
        <td>0.969</td>
        <td>0.969</td>
        <td>0.969</td>
        <td>0.969</td>
    </tr>
    <tr>
        <th>QDA</th>
        <td>0.968</td>
        <td>0.967</td>
        <td>0.967</td>
        <td>0.967</td>
    </tr>
    <tr>
        <th>Random Forest</th>
        <td>0.967</td>
        <td>0.967</td>
        <td>0.967</td>
        <td>0.967</td>
    </tr>
    <tr>
        <th>LDA</th>
        <td>0.957</td>
        <td>0.955</td>
        <td>0.955</td>
        <td>0.955</td>
    </tr>
    <tr>
        <th>Ridge</th>
        <td>0.957</td>
        <td>0.955</td>
        <td>0.955</td>
        <td>0.955</td>
    </tr>
    <tr>
        <th>Decision Tree</th>
        <td>0.950</td>
        <td>0.950</td>
        <td>0.950</td>
        <td>0.950</td>
    </tr>
    <tr>
        <th>GNB</th>
        <td>0.940</td>
        <td>0.939</td>
        <td>0.939</td>
        <td>0.939</td>
    </tr>

</table>