<!-- # Διπλωματική Εργασία
## Ταξινόμηση του καρκίνου του μαστού με μεθόδους μηχανικής μάθησης
### Εξαγωγή χαρακτηριστικών με PCA

> Λάζαρος Πανιτσίδης<br />
> Τμήμα Μηχανικών Παραγωγής και Διοίκησης <br />
> Διεθνές Πανεπιστήμιο της Ελλάδος <br />
> lazarospanitsidis@outlook.com -->

# Diploma thesis
## Breast cancer classification using machine learning methods
### Feature extraction with PCA

> Lazaros Panitsidis<br />
> Department of Production and Management Engineering <br />
> International Hellenic University <br />
> lazarospanitsidis@outlook.com

## Contents
1. [Usefull Python Libraries](#1)
1. [Data Processing](#2)
1. [Gaussian Naive Bayes](#3)
1. [Linear Discriminant Analysis](#4)
1. [Quadratic Discriminant Analysis](#5)
1. [Ridge Classifier](#6)
1. [Decision Tree Classifier](#7)
1. [Random Forest Classifier](#8)
1. [ADA Boost Classifier (Adaptive Boosting)](#9)
1. [C-Support Vector Classification](#10)
1. [Stochastic Gradient Descent Classifier](#11)
1. [eXtreme Gradient Boosting](#12)
1. [Light Gradient Boosting Machine](#13)
1. [K-Nearest Neighbors Classifier](#14)
1. [Multi-layer Perceptron Classifier](#15)

<a id='1'></a>
## 1) Usefull Python Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import scipy.stats as stats
import matplotlib.pyplot as plt
import time
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
#import warnings library
import warnings
# ignore all warnings
warnings.filterwarnings('ignore')
# Any results you write to the current directory are saved as output.

# some of them are not used in this file
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE, RFECV , mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV , LeaveOneOut,KFold,RandomizedSearchCV
from skopt import BayesSearchCV # https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV , https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score , make_scorer , classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
from sklearn.preprocessing import StandardScaler , LabelEncoder
from xgboost import XGBClassifier , plot_importance
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier , RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm
from sklearn.neural_network import MLPClassifier
import pygad

<a id='2'></a>
## 2) Data Processing

In [2]:
dataWISC = pd.read_csv('dataWisc.csv')
dataWISC.drop(["id", "Unnamed: 32"], axis = 1, inplace = True)

# Undersampling function
def make_undersample(_df, column):
  dfs_r = {}
  dfs_c = {}
  smaller = 1000000000000000000000000000000000000000000000000000000000000000000000000000000
  ignore = ""
  for c in _df[column].unique():
    dfs_c[c] = _df[_df[column] == c]
    if dfs_c[c].shape[0] < smaller:
      smaller = dfs_c[c].shape[0]
      ignore = c

  for c in dfs_c:
    if c == ignore:
      continue
    dfs_r[c] = resample(dfs_c[c], 
                        replace=False, # sample without replacement
                        n_samples=smaller,
                        random_state=0)
  return pd.concat([dfs_r[c] for c in dfs_r] + [dfs_c[ignore]])

dataWISC = make_undersample(dataWISC,'diagnosis')

#Description of the dataset

#how many cases are included in the dataset
length = len(dataWISC)
#how many features are in the dataset
features = dataWISC.shape[1]-1 # - diagnosis

# Number of malignant cases
malignant = len(dataWISC[dataWISC['diagnosis']=='M'])

#Number of benign cases
benign = len(dataWISC[dataWISC['diagnosis']=='B'])

#Rate of malignant tumors over all cases
rate = (float(malignant)/(length))*100

print ("There are "+ str(len(dataWISC))+" cases in this dataset")
print ("There are {}".format(features)+" features in this dataset")
print ("There are {}".format(malignant)+" cases diagnosed as malignant tumor")
print ("There are {}".format(benign)+" cases diagnosed as benign tumor")
print ("The percentage of malignant cases is: {:.2f}%".format(rate))

There are 424 cases in this dataset
There are 30 features in this dataset
There are 212 cases diagnosed as malignant tumor
There are 212 cases diagnosed as benign tumor
The percentage of malignant cases is: 50.00%


In [3]:
y = dataWISC.diagnosis                          # M or B 
x = dataWISC.drop('diagnosis',axis = 1 )
target_names=['Benign','Malignant']
x_scaled = (x - x.mean())/x.std()
le= LabelEncoder()
le.fit(y)
y_le = le.transform(y)

In [4]:
pca = PCA(n_components=7)
pca.fit(x_scaled)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())

In [5]:
columns = ['pca_%i' % i for i in range(7)]
x_new = pd.DataFrame(pca.transform(x_scaled), columns=columns, index=x_scaled.index)
x_new.head()

In [6]:
# https://machinelearningmastery.com/loocv-for-evaluating-machine-learning-algorithms/#:~:text=Given%20the%20improved%20estimate%20of,biased%20estimates%20of%20model%20performance.
# cv = LeaveOneOut()

# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
cv=KFold(n_splits=10, shuffle=True, random_state=13)

originalclass = []
predictedclass = []

def classification_report_with_accuracy_score(y_true, y_pred):
  originalclass.extend(y_true)
  predictedclass.extend(y_pred)
  #print(classification_report(y_true, y_pred, target_names=target_names)) 
  return accuracy_score(y_true, y_pred)

def print_best_params(grid_search):
    print("")
    print("Best hyperparameters : ", grid_search.best_params_)
    print("")
    print("Best estimator : ", grid_search.best_estimator_)
    print("")

<a id='3'></a>
## 3) [Gaussian Naive Bayes](<https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>)

* Default hyperparameters

In [15]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_gnb = GaussianNB()

score = cross_val_score(clf_gnb, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.919     0.962     0.940       212
   Malignant      0.960     0.915     0.937       212

    accuracy                          0.939       424
   macro avg      0.940     0.939     0.939       424
weighted avg      0.940     0.939     0.939       424



* Hyperparameter tuning using Grid Search

In [87]:
param_grid = { 'var_smoothing': np.logspace(0,-10, num=100) }

grid_search = GridSearchCV(clf_gnb, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits

Best hyperparameters :  {'var_smoothing': 0.0037649358067924675}

Best estimator :  GaussianNB(var_smoothing=0.0037649358067924675)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
50,0.002394,0.000798,0.002393,0.001904,9e-06,{'var_smoothing': 8.902150854450392e-06},0.952851,0.976282,0.904444,0.906926,0.976177,0.952273,0.949519,0.855835,0.902778,1.0,0.937709,0.041721,1
75,0.001995,0.000446,0.001995,0.000892,0.0,{'var_smoothing': 2.6560877829466893e-08},0.952851,0.976282,0.904444,0.906926,0.976177,0.952273,0.949519,0.855835,0.902778,1.0,0.937709,0.041721,1
73,0.001894,0.000537,0.001895,0.000699,0.0,{'var_smoothing': 4.229242874389499e-08},0.952851,0.976282,0.904444,0.906926,0.976177,0.952273,0.949519,0.855835,0.902778,1.0,0.937709,0.041721,1


* Tuned hyperparameters

In [89]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_gnb = GaussianNB(var_smoothing=0.0037649358067924675)

score = cross_val_score(clf_gnb, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.919     0.962     0.940       212
   Malignant      0.960     0.915     0.937       212

    accuracy                          0.939       424
   macro avg      0.940     0.939     0.939       424
weighted avg      0.940     0.939     0.939       424



<a id='4'></a>
## 4) [Linear Discriminant Analysis](<https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html>)

* Default hyperparameters

In [18]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lda = LinearDiscriminantAnalysis()

score = cross_val_score(clf_lda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.925     0.991     0.957       212
   Malignant      0.990     0.920     0.954       212

    accuracy                          0.955       424
   macro avg      0.957     0.955     0.955       424
weighted avg      0.957     0.955     0.955       424



* Hyperparameter tuning using Grid Search

In [19]:
param_grid = {
    'solver' : ['svd','lsqr','eigen'],
    'shrinkage':[None,'auto'],
    'tol': [0.0001,0.001,0.01,0.1]
}

grid_search = GridSearchCV(clf_lda, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 24 candidates, totalling 240 fits

Best hyperparameters :  {'shrinkage': None, 'solver': 'svd', 'tol': 0.0001}

Best estimator :  LinearDiscriminantAnalysis()



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_shrinkage,param_solver,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006383,0.002102,0.004488,0.003603,,svd,0.0001,"{'shrinkage': None, 'solver': 'svd', 'tol': 0....",0.906522,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.954638,0.026161,1
7,0.005186,0.00321,0.003092,0.001299,,lsqr,0.1,"{'shrinkage': None, 'solver': 'lsqr', 'tol': 0.1}",0.906522,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.954638,0.026161,1
11,0.002992,0.000773,0.002992,0.003025,,eigen,0.1,"{'shrinkage': None, 'solver': 'eigen', 'tol': ...",0.906522,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.954638,0.026161,1


* Tuned hyperparameters

In [20]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lda = LinearDiscriminantAnalysis(solver='svd',shrinkage=None,tol=1e-04) # = Default parameters

score = cross_val_score(clf_lda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.925     0.991     0.957       212
   Malignant      0.990     0.920     0.954       212

    accuracy                          0.955       424
   macro avg      0.957     0.955     0.955       424
weighted avg      0.957     0.955     0.955       424



<a id='5'></a>
## 5) [Quadratic Discriminant Analysis](<https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html>)

* Default hyperparameters

In [21]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_qda = QuadraticDiscriminantAnalysis()

score = cross_val_score(clf_qda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.954     0.981     0.967       212
   Malignant      0.981     0.953     0.967       212

    accuracy                          0.967       424
   macro avg      0.967     0.967     0.967       424
weighted avg      0.967     0.967     0.967       424



* Hyperparameter tuning using Grid Search

In [22]:
param_grid = {
    'reg_param': np.linspace(0, 1, num=10),
    'tol': [0.0001,0.001,0.01]
}

grid_search = GridSearchCV(clf_qda, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 30 candidates, totalling 300 fits

Best hyperparameters :  {'reg_param': 0.5555555555555556, 'tol': 0.0001}

Best estimator :  QuadraticDiscriminantAnalysis(reg_param=0.5555555555555556)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_param,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
15,0.003342,0.002093,0.001746,0.000748,0.555556,0.0001,"{'reg_param': 0.5555555555555556, 'tol': 0.0001}",0.976282,0.952851,0.952851,0.930081,1.0,0.976068,0.974437,0.951389,0.951389,1.0,0.966535,0.021565,1
17,0.001895,0.000538,0.001496,0.000499,0.555556,0.01,"{'reg_param': 0.5555555555555556, 'tol': 0.01}",0.976282,0.952851,0.952851,0.930081,1.0,0.976068,0.974437,0.951389,0.951389,1.0,0.966535,0.021565,1
16,0.001695,0.000457,0.002294,0.001672,0.555556,0.001,"{'reg_param': 0.5555555555555556, 'tol': 0.001}",0.976282,0.952851,0.952851,0.930081,1.0,0.976068,0.974437,0.951389,0.951389,1.0,0.966535,0.021565,1


* Tuned hyperparameters

In [23]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_qda = QuadraticDiscriminantAnalysis(reg_param=0.5555555555555556)

score = cross_val_score(clf_qda, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.950     0.986     0.968       212
   Malignant      0.985     0.948     0.966       212

    accuracy                          0.967       424
   macro avg      0.968     0.967     0.967       424
weighted avg      0.968     0.967     0.967       424



<a id='6'></a>
## 6) [Ridge Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html#sklearn.linear_model.RidgeClassifier>)

* Default hyperparameters

In [24]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rc = RidgeClassifier()

score = cross_val_score(clf_rc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.925     0.991     0.957       212
   Malignant      0.990     0.920     0.954       212

    accuracy                          0.955       424
   macro avg      0.957     0.955     0.955       424
weighted avg      0.957     0.955     0.955       424



* Hyperparameter tuning using Grid Search

In [25]:
param_grid = {
    'alpha' : np.linspace(0, 1, num=10),
    'fit_intercept' : [True,False],
    'copy_X' : [True,False],
    'max_iter' : [None],
    'tol' : [0.001],
    'class_weight' : [None,'balanced'],
    'solver' : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
    'positive' : [False]
}

grid_search = GridSearchCV(clf_rc, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 560 candidates, totalling 5600 fits

Best hyperparameters :  {'alpha': 0.0, 'class_weight': None, 'copy_X': True, 'fit_intercept': False, 'max_iter': None, 'positive': False, 'solver': 'svd', 'tol': 0.001}

Best estimator :  RidgeClassifier(alpha=0.0, fit_intercept=False, solver='svd')



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_class_weight,param_copy_X,param_fit_intercept,param_max_iter,param_positive,param_solver,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
558,0.003095,0.000491,0.001602,0.000542,1.0,balanced,False,False,,False,saga,0.001,"{'alpha': 1.0, 'class_weight': 'balanced', 'co...",0.929624,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.956948,0.022586,1
193,0.002992,0.000446,0.003491,0.005503,0.333333,,False,False,,False,sag,0.001,"{'alpha': 0.3333333333333333, 'class_weight': ...",0.929624,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.956948,0.022586,1
175,0.002893,0.000537,0.001695,0.000638,0.333333,,True,False,,False,svd,0.001,"{'alpha': 0.3333333333333333, 'class_weight': ...",0.929624,0.976282,0.952851,0.906522,0.976177,0.976068,0.973668,0.951389,0.951389,0.97551,0.956948,0.022586,1


* Tuned hyperparameters

In [26]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rc = RidgeClassifier(alpha=0.0, fit_intercept=False, solver='svd')

score = cross_val_score(clf_rc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.929     0.991     0.959       212
   Malignant      0.990     0.925     0.956       212

    accuracy                          0.958       424
   macro avg      0.960     0.958     0.958       424
weighted avg      0.960     0.958     0.958       424



<a id='7'></a>
## 7) [Decision Tree Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>)

* Default hyperparameters

In [27]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_tree = DecisionTreeClassifier(random_state=13)

score = cross_val_score(clf_tree, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.929     0.929     0.929       212
   Malignant      0.929     0.929     0.929       212

    accuracy                          0.929       424
   macro avg      0.929     0.929     0.929       424
weighted avg      0.929     0.929     0.929       424



* Hyperparameter tuning using Grid Search

In [28]:
param_grid = {
    'criterion' :['gini','entropy'],
    'splitter' : ['best','random'],
    'max_depth': [2,6,10,None],
    'min_samples_split': list(range(2, 4)),
    'min_samples_leaf': [3,5],
    'min_weight_fraction_leaf' : [0.0],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes' : [None,10,50],
    'min_impurity_decrease' : [0.0],
    'class_weight' : [None,'balanced'],
    'ccp_alpha' : [0.0],
    'random_state' : [13]
}

grid_search = GridSearchCV(clf_tree, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 1152 candidates, totalling 11520 fits

Best hyperparameters :  {'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': 50, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 13, 'splitter': 'best'}

Best estimator :  DecisionTreeClassifier(class_weight='balanced', max_depth=10, max_leaf_nodes=50,
                       min_samples_leaf=5, random_state=13)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_impurity_decrease,param_min_samples_leaf,param_min_samples_split,param_min_weight_fraction_leaf,param_random_state,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
814,0.003092,0.000537,0.001795,0.000399,0.0,balanced,gini,,,50,0.0,5,3,0.0,13,best,"{'ccp_alpha': 0.0, 'class_weight': 'balanced',...",0.929624,0.952851,0.858553,0.930081,0.976177,0.928531,0.974437,0.926531,1.0,0.975045,0.945183,0.03791,1
812,0.003092,0.000537,0.001496,0.000669,0.0,balanced,gini,,,50,0.0,5,2,0.0,13,best,"{'ccp_alpha': 0.0, 'class_weight': 'balanced',...",0.929624,0.952851,0.858553,0.930081,0.976177,0.928531,0.974437,0.926531,1.0,0.975045,0.945183,0.03791,1
742,0.002792,0.000599,0.001696,0.000639,0.0,balanced,gini,10.0,,50,0.0,5,3,0.0,13,best,"{'ccp_alpha': 0.0, 'class_weight': 'balanced',...",0.929624,0.952851,0.858553,0.930081,0.976177,0.928531,0.974437,0.926531,1.0,0.975045,0.945183,0.03791,1


* Tuned hyperparameters

In [29]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_tree = DecisionTreeClassifier(class_weight='balanced', max_depth=10, max_leaf_nodes=50,
                       min_samples_leaf=5, random_state=13)

score = cross_val_score(clf_tree, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.935     0.958     0.946       212
   Malignant      0.957     0.934     0.945       212

    accuracy                          0.946       424
   macro avg      0.946     0.946     0.946       424
weighted avg      0.946     0.946     0.946       424



<a id='8'></a>
## 8) [Random Forest Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html>)

* Default hyperparameters

In [30]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rf = RandomForestClassifier(random_state=13)
                       
score = cross_val_score(clf_rf, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.953     0.967     0.960       212
   Malignant      0.967     0.953     0.960       212

    accuracy                          0.960       424
   macro avg      0.960     0.960     0.960       424
weighted avg      0.960     0.960     0.960       424



* Hyperparameter tuning using Grid Search

In [81]:
param_grid = { # in the comments are the rest of the hyperparameters tested
    'bootstrap': [True], # ,False
    'max_depth': [5, 10 , None],
    'n_estimators' : [50,100], # 10 ,200, 500,1000
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes' : [None,5,10],
    'min_samples_leaf': [1,3,5],
    'min_samples_split': list(range(2, 6)),
    'criterion' :['entropy'], # 'gini' same scores
    'random_state' : [13]
}

grid_search = GridSearchCV(clf_rf, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 648 candidates, totalling 6480 fits

Best hyperparameters :  {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 50, 'random_state': 13}

Best estimator :  RandomForestClassifier(criterion='entropy', max_depth=5, max_features=None,
                       min_samples_split=3, n_estimators=50, random_state=13)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,param_min_samples_split,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,0.086768,0.004699,0.006582,0.001903,True,entropy,5,,,1,3,50,13,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.976282,0.976282,0.952851,0.953261,0.952381,0.952273,0.974437,0.927545,1.0,0.975045,0.964036,0.019157,1
4,0.085636,0.003162,0.006843,0.002284,True,entropy,5,,,1,4,50,13,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.976282,0.976282,0.952851,0.953261,0.952381,0.952273,0.974437,0.927545,1.0,0.975045,0.964036,0.019157,1
6,0.086671,0.004671,0.006109,0.001156,True,entropy,5,,,1,5,50,13,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.976282,0.976282,0.952851,0.953261,0.952381,0.952273,0.974437,0.927545,1.0,0.975045,0.964036,0.019157,1


* Tuned hyperparameters

In [71]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_rf = RandomForestClassifier(criterion='entropy', max_depth=5, max_features=None,
                       min_samples_split=3, n_estimators=50, random_state=13)
                       
score = cross_val_score(clf_rf, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.958     0.972     0.965       212
   Malignant      0.971     0.958     0.964       212

    accuracy                          0.965       424
   macro avg      0.965     0.965     0.965       424
weighted avg      0.965     0.965     0.965       424



<a id='9'></a>
## 9) [ADA Boost Classifier (Adaptive Boosting)](<https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#:~:text=An%20AdaBoost%20%5B1%5D%20classifier%20is,focus%20more%20on%20difficult%20cases.>)

* Default hyperparameters

In [40]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_adaboost = AdaBoostClassifier(random_state=13)

score = cross_val_score(clf_adaboost, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.953     0.948     0.950       212
   Malignant      0.948     0.953     0.951       212

    accuracy                          0.950       424
   macro avg      0.950     0.950     0.950       424
weighted avg      0.950     0.950     0.950       424



* Hyperparameter tuning using Grid Search

In [41]:
param_grid = {
    'base_estimator' : [DecisionTreeClassifier(class_weight='balanced', max_depth=10, max_leaf_nodes=50,min_samples_leaf=5, random_state=13)],
    'n_estimators' : [10,50,100,500],
    'learning_rate' : np.power(10, np.arange(-3, 1, dtype=float)),
    'algorithm' : ['SAMME', 'SAMME.R'],
    'random_state' : [13],
}

grid_search = GridSearchCV(clf_adaboost, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 32 candidates, totalling 320 fits

Best hyperparameters :  {'algorithm': 'SAMME', 'base_estimator': DecisionTreeClassifier(class_weight='balanced', max_depth=10, max_leaf_nodes=50,
                       min_samples_leaf=5, random_state=13), 'learning_rate': 1.0, 'n_estimators': 500, 'random_state': 13}

Best estimator :  AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                         max_depth=10,
                                                         max_leaf_nodes=50,
                                                         min_samples_leaf=5,
                                                         random_state=13),
                   n_estimators=500, random_state=13)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_base_estimator,param_learning_rate,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
15,2.299311,0.383601,0.086721,0.043515,SAMME,DecisionTreeClassifier(class_weight='balanced'...,1.0,500,13,"{'algorithm': 'SAMME', 'base_estimator': Decis...",0.976282,0.976282,0.952851,0.953261,1.0,0.952273,0.949519,0.975848,1.0,1.0,0.973632,0.019973,1
10,0.416702,0.079054,0.014563,0.009328,SAMME,DecisionTreeClassifier(class_weight='balanced'...,0.1,100,13,"{'algorithm': 'SAMME', 'base_estimator': Decis...",0.952851,0.976282,0.952851,0.953261,1.0,0.928531,0.974437,0.975848,1.0,1.0,0.971406,0.023155,2
11,2.267109,0.343976,0.055854,0.014127,SAMME,DecisionTreeClassifier(class_weight='balanced'...,0.1,500,13,"{'algorithm': 'SAMME', 'base_estimator': Decis...",0.952851,0.976282,0.929624,0.976541,0.976177,0.976177,0.949519,0.975848,1.0,1.0,0.971302,0.020784,3


* Tuned hyperparameters

In [42]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_adaboost = AdaBoostClassifier(DecisionTreeClassifier(class_weight='balanced', max_depth=10, max_leaf_nodes=50,min_samples_leaf=5, random_state=13),
                    n_estimators=500,learning_rate=1.0,algorithm='SAMME', random_state=13)

score = cross_val_score(clf_adaboost, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.967     0.981     0.974       212
   Malignant      0.981     0.967     0.974       212

    accuracy                          0.974       424
   macro avg      0.974     0.974     0.974       424
weighted avg      0.974     0.974     0.974       424



<a id='10'></a>
## 10) [C-Support Vector Classification](<https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>)

* Default hyperparameters

In [43]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_svc = SVC()

score = cross_val_score(clf_svc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.954     0.981     0.967       212
   Malignant      0.981     0.953     0.967       212

    accuracy                          0.967       424
   macro avg      0.967     0.967     0.967       424
weighted avg      0.967     0.967     0.967       424



* Hyperparameter tuning using Grid Search

In [44]:
param_grid = [
    {
        'kernel': ['rbf'], 
        'gamma': [1e-2, 1e-3, 1e-4,'auto','scale'], 
        'C': [1, 10, 100, 1000],
        'decision_function_shape': ['ovo', 'ovr'],
        'random_state' : [13]
    },
    {
        'kernel': ['linear'], 
        'C': [1, 10, 100, 1000],
        'decision_function_shape': ['ovo', 'ovr'],
        'random_state' : [13]
    },
]

grid_search = GridSearchCV(clf_svc, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 48 candidates, totalling 480 fits

Best hyperparameters :  {'C': 10, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'rbf', 'random_state': 13}

Best estimator :  SVC(C=10, decision_function_shape='ovo', random_state=13)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_decision_function_shape,param_gamma,param_kernel,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
14,0.002992,3.277714e-07,0.001895,0.000299,10,ovo,scale,rbf,13,"{'C': 10, 'decision_function_shape': 'ovo', 'g...",0.976282,1.0,0.952851,0.929624,1.0,0.976177,0.949519,0.975848,1.0,1.0,0.97603,0.023858,1
19,0.004289,0.003889759,0.001895,0.000299,10,ovr,scale,rbf,13,"{'C': 10, 'decision_function_shape': 'ovr', 'g...",0.976282,1.0,0.952851,0.929624,1.0,0.976177,0.949519,0.975848,1.0,1.0,0.97603,0.023858,1
25,0.00399,0.001339287,0.001496,0.000498,100,ovr,0.01,rbf,13,"{'C': 100, 'decision_function_shape': 'ovr', '...",0.976282,0.952851,0.952851,0.929624,1.0,0.976177,0.949519,0.975848,1.0,1.0,0.971315,0.023308,3


* Tuned hyperparameters

In [45]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_svc = SVC(C=10, decision_function_shape='ovo', gamma='scale' ,kernel='rbf',random_state=13)

# 0.974 loocv: C=10, decision_function_shape='ovo',gamma=0.01,kernel='rbf'
# 0.976 k=10 : C=1000, decision_function_shape='ovo', gamma=0.01 ,kernel='rbf'
# * 0.976 k=10 : C=10, decision_function_shape='ovo', gamma='scale' ,kernel='rbf'

score = cross_val_score(clf_svc, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.976     0.976     0.976       212
   Malignant      0.976     0.976     0.976       212

    accuracy                          0.976       424
   macro avg      0.976     0.976     0.976       424
weighted avg      0.976     0.976     0.976       424



<a id='11'></a>
## 11) [Stochastic Gradient Descent Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html>)

* Default hyperparameters

In [46]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_sgd = SGDClassifier(random_state=13)

score = cross_val_score(clf_sgd, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.962     0.962     0.962       212
   Malignant      0.962     0.962     0.962       212

    accuracy                          0.962       424
   macro avg      0.962     0.962     0.962       424
weighted avg      0.962     0.962     0.962       424



* Hyperparameter tuning using Grid Search

In [47]:
param_grid = {
    'average': [True, False],
    'l1_ratio': np.linspace(0, 1, num=10),
    'alpha': np.power(10, np.arange(-2, 1, dtype=float)),
    'random_state' : [13]
}

grid_search = GridSearchCV(clf_sgd, param_grid=param_grid, n_jobs=-1, cv=cv,verbose=4,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 60 candidates, totalling 600 fits

Best hyperparameters :  {'alpha': 0.1, 'average': False, 'l1_ratio': 0.0, 'random_state': 13}

Best estimator :  SGDClassifier(alpha=0.1, l1_ratio=0.0, random_state=13)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_average,param_l1_ratio,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
30,0.002294,0.000779,0.001596,0.000489,0.1,False,0.0,13,"{'alpha': 0.1, 'average': False, 'l1_ratio': 0...",0.929624,0.976282,0.929624,0.952851,0.976177,0.976068,1.0,0.975848,0.97551,0.975045,0.966703,0.021332,1
39,0.002001,0.000625,0.002794,0.003504,0.1,False,1.0,13,"{'alpha': 0.1, 'average': False, 'l1_ratio': 1...",0.929624,0.976282,0.929624,0.952851,0.976177,0.976068,1.0,0.975848,0.97551,0.975045,0.966703,0.021332,1
38,0.001895,0.000299,0.001596,0.000489,0.1,False,0.888889,13,"{'alpha': 0.1, 'average': False, 'l1_ratio': 0...",0.929624,0.976282,0.929624,0.952851,0.976177,0.976068,1.0,0.975848,0.97551,0.975045,0.966703,0.021332,1


* Tuned hyperparameters

In [48]:
originalclass = []
predictedclass = []

# Cross validate
clf_sgd = SGDClassifier(alpha=0.1, average=False, l1_ratio=0.0,random_state=13) #0.974 loocv: alpha=0.01, average=False,l1_ratio=0.0

score = cross_val_score(clf_sgd, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.954     0.981     0.967       212
   Malignant      0.981     0.953     0.967       212

    accuracy                          0.967       424
   macro avg      0.967     0.967     0.967       424
weighted avg      0.967     0.967     0.967       424



<a id='12'></a>
## 12) [eXtreme Gradient Boosting](<https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters>)

* Default hyperparameters

In [49]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_xgb = XGBClassifier(random_state=13)

score = cross_val_score(clf_xgb, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.972     0.976     0.974       212
   Malignant      0.976     0.972     0.974       212

    accuracy                          0.974       424
   macro avg      0.974     0.974     0.974       424
weighted avg      0.974     0.974     0.974       424



* Hyperparameter tuning using Grid Search

In [50]:
# https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
# https://www.cs.cornell.edu/courses/cs4780/2018sp/lectures/lecturenote19.html
# https://medium.com/data-design/xgboost-hi-im-gamma-what-can-i-do-for-you-and-the-tuning-of-regularization-a42ea17e6ab6

param_grid = {
        'booster' : ['gbtree'],
        'validate_parameters' : [True],
        'learning_rate' : [0.05,0.1,0.3,0.5,1],
        'gamma' : [0,0.01,0.1,0.5,1],
        'max_depth' : [2,6,10],
        'min_child_weight' : [1,3,5],
        'max_delta_step' : [0,2,4],
        'subsample' : [0.5],
        'colsample_bylevel' : [1],
        'colsample_bynode' : [1],
        'colsample_bytree' : [1],
        'reg_lambda' : [0,1],
        'reg_alpha' : [0],
        'tree_method' : ['exact'],
        'scale_pos_weight' : [1],
        'objective' : ['binary:logistic'], # 'multi:softmax' -> same scores as 'binary:logistic'
        #'num_class' : [2],
        'n_estimators' : [50,100,200,500],
        'random_state' : [13]
    }

grid_search = GridSearchCV(clf_xgb, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y_le)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 5400 candidates, totalling 54000 fits

Best hyperparameters :  {'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 1, 'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200, 'objective': 'binary:logistic', 'random_state': 13, 'reg_alpha': 0, 'reg_lambda': 0, 'scale_pos_weight': 1, 'subsample': 0.5, 'tree_method': 'exact', 'validate_parameters': True}

Best estimator :  XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=1, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
       

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_colsample_bylevel,param_colsample_bynode,param_colsample_bytree,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,param_objective,param_random_state,param_reg_alpha,param_reg_lambda,param_scale_pos_weight,param_subsample,param_tree_method,param_validate_parameters,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
4372,0.121176,0.00485,0.002693,0.000457,gbtree,1,1,1,1,0.05,0,10,1,200,binary:logistic,13,0,0,1,0.5,exact,True,"{'booster': 'gbtree', 'colsample_bylevel': 1, ...",1.0,0.976282,0.952851,0.953261,1.0,0.952273,0.974437,0.975848,1.0,0.975045,0.976,0.018296,1
4492,0.139427,0.022527,0.002992,0.000892,gbtree,1,1,1,1,0.05,4,6,1,200,binary:logistic,13,0,0,1,0.5,exact,True,"{'booster': 'gbtree', 'colsample_bylevel': 1, ...",1.0,0.976282,0.952851,0.953261,1.0,0.952273,0.974437,0.975848,1.0,0.975045,0.976,0.018296,1
4348,0.118583,0.004358,0.002294,0.000457,gbtree,1,1,1,1,0.05,0,6,1,200,binary:logistic,13,0,0,1,0.5,exact,True,"{'booster': 'gbtree', 'colsample_bylevel': 1, ...",1.0,0.976282,0.952851,0.953261,1.0,0.952273,0.974437,0.975848,1.0,0.975045,0.976,0.018296,1


* Tuned hyperparameters

In [51]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_xgb = XGBClassifier(gamma=1,learning_rate=0.1,max_delta_step = 2,max_depth=2,min_child_weight=1,tree_method='exact',subsample=0.5,reg_alpha=0,reg_lambda=0,scale_pos_weight=1,
              objective='binary:logistic', predictor='auto',n_estimators=100, random_state=13)

score = cross_val_score(clf_xgb, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.972     0.981     0.977       212
   Malignant      0.981     0.972     0.976       212

    accuracy                          0.976       424
   macro avg      0.976     0.976     0.976       424
weighted avg      0.976     0.976     0.976       424



<a id='13'></a>
## 13) [Light Gradient Boosting Machine](<https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html>)

* Default hyperparameters

In [52]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lgbm = lgbm.LGBMClassifier(random_state=13)

score = cross_val_score(clf_lgbm, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.958     0.976     0.967       212
   Malignant      0.976     0.958     0.967       212

    accuracy                          0.967       424
   macro avg      0.967     0.967     0.967       424
weighted avg      0.967     0.967     0.967       424



* Hyperparameter tuning using Grid Search

In [53]:
# https://neptune.ai/blog/lightgbm-parameters-guide
# https://www.youtube.com/watch?v=5CWwwtEM2TA&ab_channel=PyData & https://github.com/MSusik/newgradientboosting/blob/master/pydata.pdf

param_grid = {
        'boosting_type' : ['gbdt','dart'],
        'num_leaves' : [10,20,30,40,50],
        'max_depth' : [3,6,9,-1],
        'learning_rate' : [0.05,0.1,0.3,0.5,1],
        'n_estimators' : [50,100,200,500],
        'objective' : ['binary'],
        'min_child_samples' : [10,20,30],
        'subsample' : [0.5],
        'reg_lambda' : [0,1],
        'reg_alpha' : [0],
        'subsample' : [0.5],
        'colsample_bytree' : [1],
        'scale_pos_weight' : [1],
        'random_state' : [13]
    }

grid_search = GridSearchCV(clf_lgbm, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y_le)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 4800 candidates, totalling 48000 fits

Best hyperparameters :  {'boosting_type': 'dart', 'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_samples': 10, 'n_estimators': 500, 'num_leaves': 20, 'objective': 'binary', 'random_state': 13, 'reg_alpha': 0, 'reg_lambda': 0, 'scale_pos_weight': 1, 'subsample': 0.5}

Best estimator :  LGBMClassifier(boosting_type='dart', colsample_bytree=1, max_depth=6,
               min_child_samples=10, n_estimators=500, num_leaves=20,
               objective='binary', random_state=13, reg_alpha=0, reg_lambda=0,
               scale_pos_weight=1, subsample=0.5)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_colsample_bytree,param_learning_rate,param_max_depth,param_min_child_samples,param_n_estimators,param_num_leaves,param_objective,param_random_state,param_reg_alpha,param_reg_lambda,param_scale_pos_weight,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
3032,0.480285,0.062549,0.003092,0.000829,dart,1,0.1,6,10,500,20,binary,13,0,0,1,0.5,"{'boosting_type': 'dart', 'colsample_bytree': ...",0.976282,0.976282,0.952851,0.953261,1.0,0.976177,0.974437,0.951945,1.0,1.0,0.976124,0.018336,1
3152,0.470758,0.045808,0.003442,0.000849,dart,1,0.1,9,10,500,20,binary,13,0,0,1,0.5,"{'boosting_type': 'dart', 'colsample_bytree': ...",0.976282,0.976282,0.952851,0.953261,1.0,0.976177,0.974437,0.951945,1.0,1.0,0.976124,0.018336,1
3966,0.025133,0.001657,0.001695,0.000639,dart,1,0.5,6,10,50,40,binary,13,0,0,1,0.5,"{'boosting_type': 'dart', 'colsample_bytree': ...",1.0,0.976282,0.952851,0.953261,1.0,0.952273,0.974437,0.951945,1.0,1.0,0.976105,0.021216,3


* Tuned hyperparameters

In [54]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_lgbm = lgbm.LGBMClassifier(boosting_type='dart', colsample_bytree=1, max_depth=6,
               min_child_samples=10, n_estimators=500, num_leaves=20,
               objective='binary', random_state=13, reg_alpha=0, reg_lambda=0,
               scale_pos_weight=1, subsample=0.5)

score = cross_val_score(clf_lgbm, x_new, y_le, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.972     0.981     0.977       212
   Malignant      0.981     0.972     0.976       212

    accuracy                          0.976       424
   macro avg      0.976     0.976     0.976       424
weighted avg      0.976     0.976     0.976       424



<a id='14'></a>
## 14) [K-Nearest Neighbors Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html>)

* Default hyperparameters

In [55]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_knn = KNeighborsClassifier()

score = cross_val_score(clf_knn, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.945     0.981     0.963       212
   Malignant      0.980     0.943     0.962       212

    accuracy                          0.962       424
   macro avg      0.963     0.962     0.962       424
weighted avg      0.963     0.962     0.962       424



* Hyperparameter tuning using Grid Search

In [56]:
param_grid = {
    'n_neighbors': list(range(2,8)),
    'weights': ['uniform','distance'],
    'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10,20,30,40,50],
    'p': [1,2],
    'metric': ['minkowski','manhattan','chebyshev']
}

grid_search = GridSearchCV(clf_knn, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=False).head(3)

Fitting 10 folds for each of 1080 candidates, totalling 10800 fits

Best hyperparameters :  {'algorithm': 'ball_tree', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 2, 'p': 1, 'weights': 'distance'}

Best estimator :  KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=2, p=1,
                     weights='distance')



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_metric,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
961,0.001396,0.0004886555,0.001895,0.0005371582,brute,40,manhattan,2,1,distance,"{'algorithm': 'brute', 'leaf_size': 40, 'metri...",1.0,1.0,0.952851,0.929624,0.976177,0.976177,1.0,0.975848,1.0,1.0,0.981068,0.023043,1
457,0.001995,2.780415e-07,0.001995,3.576279e-07,kd_tree,20,manhattan,2,1,distance,"{'algorithm': 'kd_tree', 'leaf_size': 20, 'met...",1.0,1.0,0.952851,0.929624,0.976177,0.976177,1.0,0.975848,1.0,1.0,0.981068,0.023043,1
505,0.001496,0.0004985811,0.002394,0.0004886167,kd_tree,30,minkowski,2,1,distance,"{'algorithm': 'kd_tree', 'leaf_size': 30, 'met...",1.0,1.0,0.952851,0.929624,0.976177,0.976177,1.0,0.975848,1.0,1.0,0.981068,0.023043,1


* Tuned hyperparameters

In [57]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_knn = KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=2, p=1,weights='distance', metric_params=None, n_jobs=-1)

score = cross_val_score(clf_knn, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.981     0.981     0.981       212
   Malignant      0.981     0.981     0.981       212

    accuracy                          0.981       424
   macro avg      0.981     0.981     0.981       424
weighted avg      0.981     0.981     0.981       424



<a id='15'></a>
## 15) [Multi-layer Perceptron Classifier](<https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html>)

* Default hyperparameters

In [58]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_mlp =  MLPClassifier()

score = cross_val_score(clf_mlp, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.977     0.981     0.979       212
   Malignant      0.981     0.976     0.979       212

    accuracy                          0.979       424
   macro avg      0.979     0.979     0.979       424
weighted avg      0.979     0.979     0.979       424



* Hyperparameter tuning using Grid Search

In [59]:
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
param_grid = {
    'hidden_layer_sizes' : [(14,28,)],
    'activation' : ['tanh','relu'],
    'solver' : ['sgd','adam'],
    'alpha' : [0.01,0,2],
    'batch_size' : [40,80,'auto'],
    'learning_rate' : ['invscaling','adaptive'],
    'learning_rate_init' : np.power(10, np.arange(-3, 0, dtype=float)),
    'power_t' : [0.5],
    'max_iter' : [100,200,500],
    'shuffle' : [True],
    'random_state' : [13]
}

grid_search = GridSearchCV(clf_mlp, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=1,scoring='f1_macro')
grid_search.fit(x_new, y)

print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='rank_test_score').head(3)

Fitting 10 folds for each of 648 candidates, totalling 6480 fits

Best hyperparameters :  {'activation': 'tanh', 'alpha': 0.01, 'batch_size': 80, 'hidden_layer_sizes': (14, 28), 'learning_rate': 'adaptive', 'learning_rate_init': 0.01, 'max_iter': 500, 'power_t': 0.5, 'random_state': 13, 'shuffle': True, 'solver': 'sgd'}

Best estimator :  MLPClassifier(activation='tanh', alpha=0.01, batch_size=80,
              hidden_layer_sizes=(14, 28), learning_rate='adaptive',
              learning_rate_init=0.01, max_iter=500, random_state=13,
              solver='sgd')



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,param_batch_size,param_hidden_layer_sizes,param_learning_rate,param_learning_rate_init,param_max_iter,param_power_t,param_random_state,param_shuffle,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
172,0.989251,0.09404,0.001696,0.000639,tanh,0.0,80,"(14, 28)",adaptive,0.01,500,0.5,13,True,sgd,"{'activation': 'tanh', 'alpha': 0, 'batch_size...",0.976282,1.0,0.952851,0.976541,1.0,1.0,1.0,0.975848,1.0,1.0,0.988152,0.01586,1
64,0.941085,0.085751,0.001596,0.000489,tanh,0.01,80,"(14, 28)",adaptive,0.01,500,0.5,13,True,sgd,"{'activation': 'tanh', 'alpha': 0.01, 'batch_s...",0.976282,1.0,0.952851,0.976541,1.0,1.0,1.0,0.975848,1.0,1.0,0.988152,0.01586,1
134,0.669311,0.011471,0.001596,0.000488,tanh,0.0,40,"(14, 28)",adaptive,0.01,200,0.5,13,True,sgd,"{'activation': 'tanh', 'alpha': 0, 'batch_size...",0.976282,1.0,0.952851,0.976541,0.976177,1.0,1.0,0.975848,1.0,1.0,0.98577,0.01569,3


* Tuned hyperparameters

In [63]:
originalclass = []
predictedclass = []
  
# Cross validate
clf_mlp =  MLPClassifier(activation='tanh', alpha=0.01, batch_size=80,
              hidden_layer_sizes=(14, 28), learning_rate='adaptive',
              learning_rate_init=0.01, max_iter=500, random_state=13,
              solver='sgd',shuffle=True)

score = cross_val_score(clf_mlp, x_new, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.981     0.995     0.988       212
   Malignant      0.995     0.981     0.988       212

    accuracy                          0.988       424
   macro avg      0.988     0.988     0.988       424
weighted avg      0.988     0.988     0.988       424



* Tried a larger range of hyperparameters for testing at first, but was too time consuming. The worst attempts were then found with the following code and the hyperparameters corresponding to those results were removed.

In [67]:
# print_best_params(grid_search)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.sort_values(by='mean_test_score',ascending=True).head(5) # worst 5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,param_batch_size,param_hidden_layer_sizes,param_learning_rate,param_learning_rate_init,param_max_iter,param_power_t,param_random_state,param_shuffle,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
506,0.090362,0.007103,0.001895,0.000698,relu,0.0,auto,"(14, 28)",invscaling,0.001,200,0.5,13,True,sgd,"{'activation': 'relu', 'alpha': 0, 'batch_size...",0.389205,0.453721,0.351475,0.436163,0.384164,0.458983,0.289028,0.402034,0.489879,0.373134,0.402779,0.056068,640
614,0.123435,0.052425,0.001596,0.000489,relu,2.0,auto,"(14, 28)",invscaling,0.001,200,0.5,13,True,sgd,"{'activation': 'relu', 'alpha': 2, 'batch_size...",0.389205,0.453721,0.351475,0.436163,0.384164,0.458983,0.289028,0.402034,0.489879,0.373134,0.402779,0.056068,640
612,0.096995,0.022331,0.001796,0.000599,relu,2.0,auto,"(14, 28)",invscaling,0.001,100,0.5,13,True,sgd,"{'activation': 'relu', 'alpha': 2, 'batch_size...",0.389205,0.453721,0.351475,0.436163,0.384164,0.458983,0.289028,0.402034,0.489879,0.373134,0.402779,0.056068,640
616,0.092456,0.010642,0.001746,0.000402,relu,2.0,auto,"(14, 28)",invscaling,0.001,500,0.5,13,True,sgd,"{'activation': 'relu', 'alpha': 2, 'batch_size...",0.389205,0.453721,0.351475,0.436163,0.384164,0.458983,0.289028,0.402034,0.489879,0.373134,0.402779,0.056068,640
398,0.086272,0.009214,0.001396,0.000489,relu,0.01,auto,"(14, 28)",invscaling,0.001,200,0.5,13,True,sgd,"{'activation': 'relu', 'alpha': 0.01, 'batch_s...",0.389205,0.453721,0.351475,0.436163,0.384164,0.458983,0.289028,0.402034,0.489879,0.373134,0.402779,0.056068,640
