In [1]:
import sklearn.datasets as ds
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [3]:
cancers = ds.load_breast_cancer()
data_ = (list(i)+[cancers['target_names'][j]] for i,j in zip(cancers['data'],cancers['target']))
cancers_pd = pd.DataFrame(data_, columns=list(cancers['feature_names'])+['cancer type'])

print(cancers_pd)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0         17.990         10.38          122.80     1001.0          0.11840   
1         20.570         17.77          132.90     1326.0          0.08474   
2         19.690         21.25          130.00     1203.0          0.10960   
3         11.420         20.38           77.58      386.1          0.14250   
4         20.290         14.34          135.10     1297.0          0.10030   
5         12.450         15.70           82.57      477.1          0.12780   
6         18.250         19.98          119.60     1040.0          0.09463   
7         13.710         20.83           90.20      577.9          0.11890   
8         13.000         21.82           87.50      519.8          0.12730   
9         12.460         24.04           83.97      475.9          0.11860   
10        16.020         23.24          102.70      797.8          0.08206   
11        15.780         17.89          103.60      781.0       

In [10]:
data, target, target_names = cancers['data'], cancers['target'], cancers['target_names']
print(target_names)

['malignant' 'benign']


In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size = 0.3, random_state=7)
kflod = StratifiedKFold(n_splits=10, random_state=1, shuffle=False)

In [11]:
"""Decesion Tree"""
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

max_depth = [10, 15, 20, 30, None]
min_samples_split = [2, 3, 5, 8, 10]
min_samples_leaf = [1, 2, 3, 5, 8]
param_grid = [{'criterion': criterion, 'max_depth':max_depth, 'min_samples_split':min_samples_split, 'min_samples_leaf':min_samples_leaf}]
grid_search = GridSearchCV(clf, param_grid=param_grid, scoring="accuracy", cv=kflod)
grid_search.fit(X_train, Y_train)
best_model = grid_search.best_estimator_
print("Decesion Tree")
print(grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_)
print("Accuracy:", best_model.score(X_test, Y_test))
print(confusion_matrix(Y_test, best_model.predict(X_test)))

TP = np.logical_and(best_model.predict(X_test)==1, Y_test==1).sum()
FP = np.logical_and(best_model.predict(X_test)==1, Y_test==0).sum()
FN = np.logical_and(best_model.predict(X_test)==0, Y_test==1).sum()
TN = np.logical_and(best_model.predict(X_test)==0, Y_test==0).sum()
print("TP:%d, FP:%d, FN:%d, TN:%d, Precesion:%f, Recall:%f" %(TP, FP, FN, TN, TP/(TP+FP), TP/(TP+FN)))

f1_score_value = f1_score(Y_test, best_model.predict(X_test))
print("F1_score:", f1_score_value)

Decesion Tree
0.9371859296482412 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 3}
Accuracy: 0.9415204678362573
[[ 50   5]
 [  5 111]]
TP:111, FP:5, FN:5, TN:50, Precesion:0.956897, Recall:0.956897
F1_score: 0.9568965517241379


In [12]:
"""SVM"""
from sklearn.svm import SVC
clf = SVC()

C = [0.1, 0.5, 1, 5, 10]
kernel = ['linear']
param_grid = [{'C': C, 'kernel':kernel}]
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=kflod)
grid_search.fit(X_train, Y_train)
best_model = grid_search.best_estimator_
print("\n SVM")
print(grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_)
print("Accuracy:", best_model.score(X_test, Y_test))
print(confusion_matrix(Y_test, best_model.predict(X_test)))

TP = np.logical_and(best_model.predict(X_test)==1, Y_test==1).sum()
FP = np.logical_and(best_model.predict(X_test)==1, Y_test==0).sum()
FN = np.logical_and(best_model.predict(X_test)==0, Y_test==1).sum()
TN = np.logical_and(best_model.predict(X_test)==0, Y_test==0).sum()
print("TP:%d, FP:%d, FN:%d, TN:%d, Precesion:%f, Recall:%f" %(TP, FP, FN, TN, TP/(TP+FP), TP/(TP+FN)))

f1_score_value = f1_score(Y_test, best_model.predict(X_test))
print("F1_score:", f1_score_value)


 SVM
0.9623115577889447 SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) {'C': 1, 'kernel': 'linear'}
Accuracy: 0.9532163742690059
[[ 48   7]
 [  1 115]]
TP:115, FP:7, FN:1, TN:48, Precesion:0.942623, Recall:0.991379
F1_score: 0.9663865546218487


In [13]:
"""Bayes"""
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.01)

clf.fit(X_train,Y_train)
print("\n MultinomialNB")
print("Accuracy:", clf.score(X_test,Y_test))
print(confusion_matrix(Y_test, clf.predict(X_test)))

TP = np.logical_and(clf.predict(X_test)==1, Y_test==1).sum()
FP = np.logical_and(clf.predict(X_test)==1, Y_test==0).sum()
FN = np.logical_and(clf.predict(X_test)==0, Y_test==1).sum()
TN = np.logical_and(clf.predict(X_test)==0, Y_test==0).sum()
print("TP:%d, FP:%d, FN:%d, TN:%d, Precesion:%f, Recall:%f" %(TP, FP, FN, TN, TP/(TP+FP), TP/(TP+FN)))

f1_score_value = f1_score(Y_test, clf.predict(X_test))
print("F1_score:", f1_score_value)


 MultinomialNB
Accuracy: 0.9064327485380117
[[ 40  15]
 [  1 115]]
TP:115, FP:15, FN:1, TN:40, Precesion:0.884615, Recall:0.991379
F1_score: 0.9349593495934959


In [14]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()

n_neighbors = [1,2,3,5,8,10,15,20,25,30,35,40]
weights = ['uniform','distance']
param_grid = [{'n_neighbors': n_neighbors, 'weights': weights}]
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=kflod)
grid_search.fit(X_train, Y_train)
best_model = grid_search.best_estimator_
print("\n KNN")
print(grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_)
print("Accuracy:", best_model.score(X_test, Y_test))
print(confusion_matrix(Y_test, best_model.predict(X_test)))

TP = np.logical_and(best_model.predict(X_test)==1, Y_test==1).sum()
FP = np.logical_and(best_model.predict(X_test)==1, Y_test==0).sum()
FN = np.logical_and(best_model.predict(X_test)==0, Y_test==1).sum()
TN = np.logical_and(best_model.predict(X_test)==0, Y_test==0).sum()
print("TP:%d, FP:%d, FN:%d, TN:%d, Precesion:%f, Recall:%f" %(TP, FP, FN, TN, TP/(TP+FP), TP/(TP+FN)))

f1_score_value = f1_score(Y_test, best_model.predict(X_test))
print("F1_score:", f1_score_value)


 KNN
0.9346733668341709 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='distance') {'n_neighbors': 10, 'weights': 'distance'}
Accuracy: 0.9532163742690059
[[ 50   5]
 [  3 113]]
TP:113, FP:5, FN:3, TN:50, Precesion:0.957627, Recall:0.974138
F1_score: 0.9658119658119658


In [15]:
"""gaussian"""
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

clf.fit(X_train,Y_train)
print("\n GaussianNB")
print("Accuracy:", clf.score(X_test,Y_test))
print(confusion_matrix(Y_test, clf.predict(X_test)))

TP = np.logical_and(clf.predict(X_test)==1, Y_test==1).sum()
FP = np.logical_and(clf.predict(X_test)==1, Y_test==0).sum()
FN = np.logical_and(clf.predict(X_test)==0, Y_test==1).sum()
TN = np.logical_and(clf.predict(X_test)==0, Y_test==0).sum()
print("TP:%d, FP:%d, FN:%d, TN:%d, Precesion:%f, Recall:%f" %(TP, FP, FN, TN, TP/(TP+FP), TP/(TP+FN)))

f1_score_value = f1_score(Y_test, clf.predict(X_test))
print("F1_score:", f1_score_value)


 GaussianNB
Accuracy: 0.9532163742690059
[[ 49   6]
 [  2 114]]
TP:114, FP:6, FN:2, TN:49, Precesion:0.950000, Recall:0.982759
F1_score: 0.9661016949152542


In [19]:
"""Random Forest"""
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)

n_estimators = [10, 20, 35, 50, 80, 100, 120, 150, 200]
param_grid = [{'n_estimators': n_estimators}]
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=kflod)
grid_search.fit(X_train, Y_train)
best_model = grid_search.best_estimator_
print("\nRandomForest")
print(grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_)
print("Accuracy:", best_model.score(X_test, Y_test))
print(confusion_matrix(Y_test, best_model.predict(X_test)))

TP = np.logical_and(best_model.predict(X_test)==1, Y_test==1).sum()
FP = np.logical_and(best_model.predict(X_test)==1, Y_test==0).sum()
FN = np.logical_and(best_model.predict(X_test)==0, Y_test==1).sum()
TN = np.logical_and(best_model.predict(X_test)==0, Y_test==0).sum()
print("TP:%d, FP:%d, FN:%d, TN:%d, Precesion:%f, Recall:%f" %(TP, FP, FN, TN, TP/(TP+FP), TP/(TP+FN)))

f1_score_value = f1_score(Y_test, best_model.predict(X_test))
print("F1_score:", f1_score_value)


RandomForest
0.9547738693467337 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False) {'n_estimators': 50}
Accuracy: 0.9590643274853801
[[ 50   5]
 [  2 114]]
TP:114, FP:5, FN:2, TN:50, Precesion:0.957983, Recall:0.982759
F1_score: 0.9702127659574468


In [23]:
"""XGboost"""
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)

clf = XGBClassifier()
clf.fit(X_train,Y_train)
print("\nXGboost")
print("Accuracy:", clf.score(X_test,Y_test))
print(confusion_matrix(Y_test, clf.predict(X_test)))

TP = np.logical_and(clf.predict(X_test)==1, Y_test==1).sum()
FP = np.logical_and(clf.predict(X_test)==1, Y_test==0).sum()
FN = np.logical_and(clf.predict(X_test)==0, Y_test==1).sum()
TN = np.logical_and(clf.predict(X_test)==0, Y_test==0).sum()
print("TP:%d, FP:%d, FN:%d, TN:%d, Precesion:%f, Recall:%f" %(TP, FP, FN, TN, TP/(TP+FP), TP/(TP+FN)))

f1_score_value = f1_score(Y_test, clf.predict(X_test))
print("F1_score:", f1_score_value)


XGboost
Accuracy: 0.9766081871345029
[[ 51   4]
 [  0 116]]
TP:116, FP:4, FN:0, TN:51, Precesion:0.966667, Recall:1.000000
F1_score: 0.983050847457627
