# Imports

In [1]:
import pandas as pd

# Import the classifiers we will be using
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import train/test split function
from sklearn.model_selection import train_test_split

# Import ROC AUC scoring function
from sklearn.metrics import roc_auc_score

# Read the data
#### This is a breast cancer diagnostic dataset: the features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.
#### "Class" is our target: 0 for benign, 1 for malignant.

In [2]:
# Load the diagnostic table, using each row's sample code number as the index
df = pd.read_csv(
    'data/breast-cancer-wisconsin.csv',
    index_col='Sample code number',
)

In [3]:
# Preview the first five rows to confirm the feature columns and the "Class" target loaded as expected
df.head()

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
Sample code number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,0
1002945,5,4,4,5,7,10,3,2,1,0
1015425,3,1,1,1,2,2,3,1,1,0
1016277,6,8,8,1,3,4,3,7,1,0
1017023,4,1,1,3,2,1,3,1,1,0


In [4]:
# Size of the loaded table as (rows, columns); the column count includes the "Class" target
df.shape

(683, 10)

# Dividing into two sets
#### To get an idea of how different classifiers perform on datasets of different sizes, let's create a smaller dataset with 100 random samples.

In [5]:
# The "complete" dataset is the loaded frame itself (same object, no copy is made)
df_complete = df

# The "small" dataset is a 100-row random subset; the fixed seed makes the draw repeatable
df_small = df_complete.sample(n=100, random_state=1)

In [6]:
# Separate the predictor columns from the label column for both datasets
target_column = 'Class'

features_small, target_small = df_small.drop(target_column, axis=1), df_small[target_column]

features_complete, target_complete = df_complete.drop(target_column, axis=1), df_complete[target_column]

# Train/test split

In [7]:
# Both datasets are split the same way: 30% held out for testing, with a fixed
# seed so the partition is reproducible between runs.
split_options = dict(test_size=0.3, random_state=1)

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    features_small, target_small, **split_options)

X_train_complete, X_test_complete, y_train_complete, y_test_complete = train_test_split(
    features_complete, target_complete, **split_options)

# Modelling

In [8]:
def evaluate_classifier(classifier, X_train, X_test, y_train, y_test, label):
    """Fit a classifier on one train/test split and print a short report.

    The estimator is (re)fitted on the training split, used to predict the
    test split, and the predictions are scored with ROC AUC. The heading,
    the score and the number of misclassified test samples are printed.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``
        Fitted in place; any previous fit is replaced.
    X_train, X_test : feature tables for the training and test splits
    y_train, y_test : targets matching ``X_train`` and ``X_test``
    label : str
        Heading printed above the metrics.

    Returns
    -------
    tuple
        ``(y_pred, score)``: the predictions for ``X_test`` and their ROC AUC.
    """
    # Fit the classifier
    classifier.fit(X_train, y_train)

    # Make the predictions
    y_pred = classifier.predict(X_test)

    # Score the predictions
    # NOTE(review): the AUC here is computed from the hard labels returned by
    # predict(), not from continuous scores such as predict_proba()[:, 1].
    # Kept as-is so the reported numbers stay comparable with the rest of the
    # notebook -- confirm whether score-based AUC is what is actually wanted.
    score = roc_auc_score(y_test, y_pred)

    print(label)
    print("ROC AUC: " + str(score))

    # Element-wise comparison of true vs. predicted labels; the sum counts the mismatches
    print("Number of mislabeled points out of a total %d points: %d" % (y_test.shape[0], (y_test != y_pred).sum()))

    return y_pred, score


# Choose the classifier
classifier = GaussianNB()

# Evaluate on the 100-sample dataset
y_pred_small, score_small = evaluate_classifier(
    classifier, X_train_small, X_test_small, y_train_small, y_test_small, "Small dataset")

# Blank line between the two reports
print("")

# Evaluate on the complete dataset (the same estimator object is re-fitted)
y_pred_complete, score_complete = evaluate_classifier(
    classifier, X_train_complete, X_test_complete, y_train_complete, y_test_complete, "Complete dataset")


Small dataset
ROC AUC: 0.947368421053
Number of mislabeled points out of a total 30 points: 2

Complete dataset
ROC AUC: 0.977443609023
Number of mislabeled points out of a total 205 points: 6
