# Diploma thesis
## Breast cancer classification using machine learning methods
### Best algorithms

> Lazaros Panitsidis<br />
> Department of Production and Management Engineering <br />
> International Hellenic University <br />
> lazarospanitsidis@outlook.com

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import scipy.stats as stats
import matplotlib.pyplot as plt
import time
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
#import warnings library
import warnings
# ignore all warnings
warnings.filterwarnings('ignore')
# Any results you write to the current directory are saved as output.

# some of them are not used in this file
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE, RFECV , mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV , LeaveOneOut,KFold,RandomizedSearchCV,StratifiedKFold,HalvingGridSearchCV
from skopt import BayesSearchCV # https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV , https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score , make_scorer , classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
from sklearn.preprocessing import StandardScaler , LabelEncoder
from xgboost import XGBClassifier , plot_importance
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier , RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgbm
from sklearn.neural_network import MLPClassifier
import pygad

In [5]:
# Read the raw CSV and drop the 'id' and 'Unnamed: 32' columns in one chained
# expression, so re-running the cell always starts again from the file on disk.
dataWISC = pd.read_csv('dataWisc.csv').drop(columns=["id", "Unnamed: 32"])

# Undersampling function
def make_undersample(_df, column, random_state=0):
  """Balance a data frame by undersampling every class down to the smallest one.

  The frame is split into one sub-frame per distinct value of ``column``.
  The smallest class (the first one encountered, when several share the
  minimum size) is kept untouched; every other class is sampled without
  replacement down to that size.

  Parameters
  ----------
  _df : pandas.DataFrame
      Input data; it is not modified.
  column : str
      Name of the column holding the class labels.
  random_state : int, default 0
      Seed forwarded to ``resample`` so the selection is repeatable.

  Returns
  -------
  pandas.DataFrame
      The resampled majority classes followed by the untouched smallest
      class. Original index labels are preserved. An empty input yields an
      empty copy instead of raising ``KeyError``.
  """
  # One sub-frame per class, keyed in order of first appearance.
  dfs_c = {c: _df[_df[column] == c] for c in _df[column].unique()}
  if not dfs_c:
    # No classes at all (empty frame): nothing to balance.
    return _df.copy()

  # min() returns the first key with the minimal size, which matches a
  # strict "<" scan over the classes in order of appearance.
  ignore = min(dfs_c, key=lambda c: dfs_c[c].shape[0])
  smaller = dfs_c[ignore].shape[0]

  dfs_r = {
    c: resample(frame,
                replace=False, # sample without replacement
                n_samples=smaller,
                random_state=random_state)
    for c, frame in dfs_c.items()
    if c != ignore
  }
  return pd.concat(list(dfs_r.values()) + [dfs_c[ignore]])

dataWISC = make_undersample(dataWISC,'diagnosis')

# Description of the dataset

# Number of cases (rows) in the balanced dataset
length = len(dataWISC)
# Number of features: every column except 'diagnosis'
features = len(dataWISC.columns) - 1

# Cases per diagnosis label, counted through boolean masks
malignant = int((dataWISC['diagnosis'] == 'M').sum())
benign = int((dataWISC['diagnosis'] == 'B').sum())

# Share of malignant tumors over all cases, in percent
rate = (malignant / length) * 100

print(f"There are {length} cases in this dataset")
print(f"There are {features} features in this dataset")
print(f"There are {malignant} cases diagnosed as malignant tumor")
print(f"There are {benign} cases diagnosed as benign tumor")
print(f"The percentage of malignant cases is: {rate:.2f}%")

There are 424 cases in this dataset
There are 30 features in this dataset
There are 212 cases diagnosed as malignant tumor
There are 212 cases diagnosed as benign tumor
The percentage of malignant cases is: 50.00%


In [6]:
# Feature matrix: every column except the target.
x = dataWISC.drop(columns='diagnosis')
# Target labels: 'M' or 'B'.
y = dataWISC['diagnosis']
target_names = ['Benign', 'Malignant']

# Integer-encoded copy of the labels; fit and transform in a single call.
le = LabelEncoder()
y_le = le.fit_transform(y)

# Fixed subset of nine features.
# NOTE(review): the name suggests it was selected with RF/XGBoost - confirm.
x_rf_xgb = x[[
    'area_se',
    'texture_mean',
    'area_mean',
    'smoothness_worst',
    'concavity_worst',
    'symmetry_worst',
    'symmetry_se',
    'concave points_se',
    'concavity_se',
]]

In [7]:
# Leave-one-out was considered as an alternative splitter:
# https://machinelearningmastery.com/loocv-for-evaluating-machine-learning-algorithms/#:~:text=Given%20the%20improved%20estimate%20of,biased%20estimates%20of%20model%20performance.
# cv = LeaveOneOut()

# Seeded, shuffled, stratified 10-fold splitter used by the searches below.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
cv = StratifiedKFold(shuffle=True, n_splits=10, random_state=13)

# Module-level accumulators filled by classification_report_with_accuracy_score.
originalclass, predictedclass = [], []

def classification_report_with_accuracy_score(y_true, y_pred):
  """Record one fold's labels and return that fold's accuracy.

  The true and predicted labels are appended to the module-level lists
  ``originalclass`` and ``predictedclass`` so a single classification
  report can be built over all folds afterwards. The lists are looked up
  at call time, so rebinding them to ``[]`` before a run starts a fresh
  accumulation.
  """
  # Per-fold report, useful while debugging:
  # print(classification_report(y_true, y_pred, target_names=target_names))
  for store, labels in ((originalclass, y_true), (predictedclass, y_pred)):
    store.extend(labels)
  fold_accuracy = accuracy_score(y_true, y_pred)
  return fold_accuracy

def print_best_params(grid_search):
    """Print the best hyperparameters and best estimator of a fitted search.

    ``grid_search`` must expose ``best_params_`` and ``best_estimator_``.
    Each entry is printed on its own line, with blank lines before, between
    and after the entries.
    """
    entries = (
        ("Best hyperparameters : ", "best_params_"),
        ("Best estimator : ", "best_estimator_"),
    )
    print("")
    for label, attribute in entries:
        print(label, getattr(grid_search, attribute))
        print("")

In [5]:
# split data train 90 % and test 10 %
# random_state pins the stratified split; without it every run draws a
# different test set and the hold-out report below cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, stratify=y, random_state=13
)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# PCA: components are likewise learned from the training part only.
pca = PCA(n_components=7)
pca.fit(x_train_scaled)
x_train_pca = pca.transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

# MLP with fixed hyperparameters, evaluated on the hold-out set.
mlp = MLPClassifier(
    alpha=0.01,
    batch_size=40,
    hidden_layer_sizes=(14, 28),
    learning_rate='adaptive',
    learning_rate_init=0.01,
    max_iter=100,
    random_state=13,
    solver='sgd',
)

In [6]:
# Train on the PCA-projected training set and predict the hold-out set;
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = mlp.fit(x_train_pca, y_train).predict(x_test_pca)

holdout_report = classification_report(
    y_test,
    y_pred,
    target_names=target_names,
    digits=3,
)
print(holdout_report)

# Confusion matrix, kept for the optional heatmap below.
cm = confusion_matrix(y_test, y_pred)

# sns.set(font_scale = 1.4)
# sns.heatmap(cm,annot=True,fmt="d")

              precision    recall  f1-score   support

      Benign      1.000     1.000     1.000        22
   Malignant      1.000     1.000     1.000        21

    accuracy                          1.000        43
   macro avg      1.000     1.000     1.000        43
weighted avg      1.000     1.000     1.000        43



### NESTED CROSS VALIDATION MLP

In [8]:
# Scaling and PCA are pipeline steps, so they are re-fitted on the training
# part of every fold.
clf_mlp = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=7)),
    ('mlp', MLPClassifier(random_state=13)),
])
param_grid = {
    'mlp__hidden_layer_sizes' : [(14,28,)],
    'mlp__activation' : ['tanh','relu'],
    'mlp__solver' : ['sgd','adam'],
    # Was [0.01,0,2]: a decimal comma turned 0.2 into the two values 0 and 2.
    'mlp__alpha' : [0.01, 0.2],
    'mlp__batch_size' : [40,80,'auto'],
    'mlp__learning_rate' : ['invscaling','adaptive'],
    # 1e-3, 1e-2, 1e-1
    'mlp__learning_rate_init' : np.power(10, np.arange(-3, 0, dtype=float)),
    'mlp__power_t' : [0.5],
    'mlp__max_iter' : [50,100,200,500],
    'mlp__shuffle' : [True],
    'mlp__random_state' : [13]
}

# Inner loop: hyperparameter selection by macro-F1.
grid_search = GridSearchCV(clf_mlp, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=0,scoring='f1_macro')

# Reset the accumulators so the report only covers this run.
originalclass = []
predictedclass = []

# Nested cross-validation: the outer loop scores the tuned model on each
# held-out fold; the scorer also collects the labels for the pooled report.
score = cross_val_score(grid_search, x, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

In [10]:
# Re-print the pooled report from the labels accumulated by the nested CV run.
print(
    classification_report(
        originalclass,
        predictedclass,
        target_names=target_names,
        digits=3,
    )
)

              precision    recall  f1-score   support

      Benign      0.958     0.962     0.960       212
   Malignant      0.962     0.958     0.960       212

    accuracy                          0.960       424
   macro avg      0.960     0.960     0.960       424
weighted avg      0.960     0.960     0.960       424



### NESTED CROSS VALIDATION KNN

In [9]:
clf_knn = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('knn', KNeighborsClassifier())])

# The grid is written as a list of sub-grids so that settings with no effect
# are not multiplied in:
#   * 'p' only matters for the 'minkowski' metric, and 'manhattan' is the same
#     distance as minkowski with p=1, so the distances searched are
#     minkowski(p=1), minkowski(p=2) and chebyshev;
#   * 'leaf_size' is only used by the tree-based algorithms, not by 'brute'.
# This covers the same distinct models with 528 candidates instead of 1440.
knn_common = {
    'knn__n_neighbors': list(range(2,10)),
    'knn__weights': ['uniform','distance'],
}
knn_algorithms = [
    {'knn__algorithm': ['ball_tree', 'kd_tree'], 'knn__leaf_size': [10,20,30,40,50]},
    {'knn__algorithm': ['brute']},
]
knn_distances = [
    {'knn__metric': ['minkowski'], 'knn__p': [1,2]},
    {'knn__metric': ['chebyshev']},
]
param_grid = [
    {**knn_common, **algorithm, **distance}
    for algorithm in knn_algorithms
    for distance in knn_distances
]

# Inner loop: hyperparameter selection by macro-F1.
grid_search = GridSearchCV(clf_knn, param_grid=param_grid, n_jobs=-1,cv=cv,verbose=0,scoring='f1_macro')

# Reset the accumulators so the report only covers this run.
originalclass = []
predictedclass = []

# Outer loop: score the tuned model on each held-out fold and pool the labels.
score = cross_val_score(grid_search, x, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

              precision    recall  f1-score   support

      Benign      0.959     0.981     0.970       212
   Malignant      0.981     0.958     0.969       212

    accuracy                          0.969       424
   macro avg      0.970     0.969     0.969       424
weighted avg      0.970     0.969     0.969       424



In [8]:
clf_knn = Pipeline([('scaler', StandardScaler()),('pca',PCA(n_components=7)), ('knn', KNeighborsClassifier())])

# NOTE(review): 'knn__p' only affects the 'minkowski' metric and
# 'knn__leaf_size' is unused by the 'brute' algorithm, so many of these
# candidates are equivalent. The grid is kept as is because the number of
# candidates determines the halving schedule.
param_grid = {
    'knn__n_neighbors': list(range(2,10)),
    'knn__weights': ['uniform','distance'],
    'knn__algorithm' : ['ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10,20,30,40,50],
    'knn__p': [1,2],
    'knn__metric': ['minkowski','manhattan','chebyshev']
}

# Successive halving evaluates candidates on growing subsamples of the data.
# random_state is passed as an int so the search gives reproducible output
# across runs, like the other seeded components of this notebook.
grid_search = HalvingGridSearchCV(
    clf_knn,
    param_grid=param_grid,
    n_jobs=-1,
    cv=cv,
    verbose=1,
    scoring='f1_macro',
    random_state=13,
)

# Reset the accumulators so the report only covers this run.
originalclass = []
predictedclass = []

# Outer loop: score the tuned model on each held-out fold and pool the labels.
score = cross_val_score(grid_search, x, y, scoring=make_scorer(classification_report_with_accuracy_score),cv=cv)
print(classification_report(originalclass, predictedclass, target_names=target_names, digits=3))

n_iterations: 3
n_required_iterations: 7
n_possible_iterations: 3
min_resources_: 40
max_resources_: 381
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1440
n_resources: 40
Fitting 10 folds for each of 1440 candidates, totalling 14400 fits
----------
iter: 1
n_candidates: 480
n_resources: 120
Fitting 10 folds for each of 480 candidates, totalling 4800 fits
----------
iter: 2
n_candidates: 160
n_resources: 360
Fitting 10 folds for each of 160 candidates, totalling 1600 fits
n_iterations: 3
n_required_iterations: 7
n_possible_iterations: 3
min_resources_: 40
max_resources_: 381
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1440
n_resources: 40
Fitting 10 folds for each of 1440 candidates, totalling 14400 fits
----------
iter: 1
n_candidates: 480
n_resources: 120
Fitting 10 folds for each of 480 candidates, totalling 4800 fits
----------
iter: 2
n_candidates: 160
n_resources: 360
Fitting 10 folds for each of 160 candidates, totalling 16