## Import Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv(r'data/breast-cancer.csv')

In [3]:
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


## Pre-processing

#### Change Classification values (from two, one to one, zero each)

In [4]:
data['Classification'] = data['Classification'] - 1

In [5]:
data = data.reindex(np.random.permutation(data.index))

In [6]:
X = data.drop(['Classification'], axis = 1)

In [7]:
X = (X - X.mean()) / X.std()

In [8]:
y = data['Classification']

In [9]:
X.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
86,-0.577289,0.108142,-0.345973,-0.742179,-0.585141,-0.57772,0.006055,0.111726,3.364413
90,-1.07379,0.648273,1.341029,3.166731,2.891878,0.230587,-0.587415,0.228323,0.299538
40,1.160463,0.325947,-0.656737,-0.460488,-0.43776,0.10149,-0.410752,-0.539295,0.474519
39,1.222526,1.594741,-0.9675,-0.608982,-0.540199,-0.251718,-0.300339,0.204633,0.241752
79,-1.011727,-0.18114,-0.03521,1.194,0.707508,0.943051,0.484265,1.057777,0.720266


In [10]:
y.head()

86    1
90    1
40    0
39    0
79    1
Name: Classification, dtype: int64

## Classifiers

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

### Classifier List
0. OneR
1. Nearest Neighbors
2. Linear SVM (Kernel: Linear)
3. RBF SVM (Kernel: RBF)
4. Decision Tree
5. Random Forest
6. Neural Network
7. AdaBoost (Based on the Decision Tree classifier)
8. Gaussian Naive Bayes

In [12]:
names = ["OneR", "Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes"]

In [13]:
classifiers = [
    DecisionTreeClassifier(max_depth=1),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(kernel="rbf", C=0.025),
    DecisionTreeClassifier(max_depth=8),
    RandomForestClassifier(max_depth=8, n_estimators=50),
    MLPClassifier(alpha=1, hidden_layer_sizes = (50, 30)),
    AdaBoostClassifier(n_estimators = 100),
    GaussianNB()]

In [14]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_fscore_support

In [15]:
evaluations = []

##### ZeroR

In [16]:
max_index = data.groupby(['Classification'])['Classification'].count().idxmax()

In [17]:
evaluations.append(['ZeroR', data.groupby(['Classification'])['Classification'].count()[max_index] / len(y)])

#### Using Classifier List above

In [18]:
for name, clf in zip(names, classifiers):
    y_pred = cross_val_predict(clf, X, y, cv = 10)
    precision_recall_f1 = precision_recall_fscore_support(y, y_pred, average='weighted')
    scores = cross_validate(clf, X, y, cv = 10)
    evaluations.append([name, scores['test_score'].mean(), precision_recall_f1[:3]])

  'precision', 'predicted', average, warn_for)


In [19]:
evaluations.sort(key = lambda evaluations: evaluations[1])
evaluations.reverse()

In [20]:
for evaluation in evaluations:
    print("-----" * 5)
    print(evaluation[0])
    print("Accuracy: ", evaluation[1])
    if len(evaluation) > 2: print("Precision, Recall, F1", evaluation[2])
    print("-----" * 5, end = "\n\n")

-------------------------
Neural Net
Accuracy:  0.7469114219114219
Precision, Recall, F1 (0.7520119225037258, 0.75, 0.7505046728971962)
-------------------------

-------------------------
Linear SVM
Accuracy:  0.7203962703962704
Precision, Recall, F1 (0.7253944712231698, 0.7241379310344828, 0.7245510218834876)
-------------------------

-------------------------
Random Forest
Accuracy:  0.7058857808857809
Precision, Recall, F1 (0.7145469818282348, 0.7155172413793104, 0.7136807536081583)
-------------------------

-------------------------
Decision Tree
Accuracy:  0.6953962703962705
Precision, Recall, F1 (0.697789566755084, 0.6982758620689655, 0.6979806551905803)
-------------------------

-------------------------
OneR
Accuracy:  0.6891025641025641
Precision, Recall, F1 (0.6909941086804269, 0.6896551724137931, 0.6901198996189237)
-------------------------

-------------------------
Nearest Neighbors
Accuracy:  0.6820512820512821
Precision, Recall, F1 (0.6905390821727736, 0.68103448275