In [1]:
#import necessary libraries
import os
import time
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd
import seaborn as sns
from matplotlib import pyplot


#import libraries from sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
#from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score,accuracy_score, roc_auc_score, recall_score, precision_score

##### new import try
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA

In [None]:
 # load dataset
 df = pd.read_csv("breast_cancer_data.csv")
 df = df.drop(['id','Unnamed: 32'], axis =1)
  
 diag_map = {'M':1, 'B':0}
 df['diagnosis'] = df['diagnosis'].map(diag_map)  
 
# get data
def get_dataset():
    # set X,y
    y = df['diagnosis']
    X = df.drop(['diagnosis'], axis = 1)
    
    # Scaler dataset without target
    sc = StandardScaler()
    X = sc.fit_transform(X)
  
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)
    
    return X_train, X_test, Y_train, Y_test

# define PCA
def pca_component(X):
    
    pca_breast = PCA(n_components=8)
    pca_breast.fit(X)
    pca_breastX = pca_breast.fit_transform(X)
   
    final_data = pca_breastX
    return final_data

#feature scaling
def get_scaler(X, y):
    
    sc = StandardScaler()
    X = sc.fit_transform(X)
    y = sc.transform(y)
   # print("after normalization: ",X_train.head())
    return X, y

# get models
def get_basemodels():
    
    models= dict()
    #level 1 models
    
    models['svm'] = SVC(kernel='rbf', C=100, gamma=0.0001, probability=True)
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier() 
    models['rf'] = RandomForestClassifier()
    models['bays'] = GaussianNB()
       
    #level stacking
    models['stacking_one'] = get_stacking()
    #models['stacking_two'] = get_stacking_two()
   
    return models

# get a stacking ensemble of models first time
def get_stacking():

    level0 = list()
    level0.append(('svm', SVC(kernel='rbf', C=100, gamma=0.0001,probability=True) ) )
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', DecisionTreeClassifier()))
   
    level1 = get_stacking_two()
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=7, passthrough=True)
  
    return model


# get a stacking ensemble of models second time
def get_stacking_two():

    level2 = list()
   
    level2.append(('rf' , RandomForestClassifier()))
    level2.append(('bays', GaussianNB()))   
   
    level3 = LogisticRegression(penalty='l2')
   
    model = StackingClassifier(estimators=level2, final_estimator=level3, cv=7, passthrough=True)
    
    return model

# draw confusion matrix
 def plot_confusion_matrix(y_predict):
        
    con_matrix = confusion_matrix(Y_test, y_predict, labels=[0, 1])
 
    df_cm = pd.DataFrame(con_matrix, index = [i for i in ["0","1"]],
                         columns = [i for i in ["Predict benign","Predict Malignant"]])
   
    pyplot.title(eva_model)
    pyplot.figure(figsize = (8,6))
    sns.heatmap(df_cm, annot=True)

# evaluate a given model using cross-validation
def evaluate_model(eva_model, X, y):
       
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    scores = cross_val_score(eva_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    
    start_time = time.time()
    #fit model
    model_fit = eva_model.fit(X,y)
    fit_time = time.time() - start_time
    
    y_predict = model_fit.predict(X_test)
    print("model fit time: ",eva_model," : ", fit_time*1000)
   
    return scores, y_predict

#Calling function
# define dataset
X_train, X_test, Y_train, Y_test = get_dataset()

# get the models to evaluate
models = get_basemodels()

# evaluate the models and store results
results, names = list(), list()

for name, eva_model in models.items():
    print("base model :", eva_model)
    scores, y_predict = evaluate_model(eva_model, X_train, Y_train)
    results.append(scores)
    names.append(name)
    
    print('model name: ',names)
    
    print('Accuracy:  \n ' + str(accuracy_score(Y_test, y_predict)*100) )
   

base model : SVC(C=100, gamma=0.0001, probability=True)
model fit time:  SVC(C=100, gamma=0.0001, probability=True)  :  16.95537567138672
model name:  ['svm']
Accuracy:  
 98.83040935672514
base model : KNeighborsClassifier()
model fit time:  KNeighborsClassifier()  :  1.9915103912353516
model name:  ['svm', 'knn']
Accuracy:  
 95.90643274853801
base model : DecisionTreeClassifier()
model fit time:  DecisionTreeClassifier()  :  4.985332489013672
model name:  ['svm', 'knn', 'cart']
Accuracy:  
 91.22807017543859
base model : RandomForestClassifier()
model fit time:  RandomForestClassifier()  :  221.4510440826416
model name:  ['svm', 'knn', 'cart', 'rf']
Accuracy:  
 97.07602339181285
base model : GaussianNB()
model fit time:  GaussianNB()  :  0.9949207305908203
model name:  ['svm', 'knn', 'cart', 'rf', 'bays']
Accuracy:  
 93.56725146198829
base model : StackingClassifier(cv=7,
                   estimators=[('svm',
                                SVC(C=100, gamma=0.0001, probability=Tr