In [3]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Import the classifiers we will be using
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import train/test split function
from sklearn.model_selection import train_test_split

# Import ROC AUC scoring function
from sklearn.metrics import roc_auc_score

[The Wisconsin breast cancer dataset](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data) is a breast cancer diagnostic dataset: these features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.

`diagnosis` is our target: 0 for benign, 1 for malignant.

First we'll read in our dataset and create the train/test split:

In [4]:
# Load the CSV, using its `id` column as the row index
dataset = pd.read_csv('data/breast_cancer.csv', index_col='id')

# Separate the target column from the remaining (predictor) columns
label = dataset['diagnosis']
features = dataset.drop('diagnosis', axis='columns')

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.33,
    random_state=0,
)

We will briefly go through all the algorithms that were taught in the Learning Notebook.

# [K-Nearest-Neighbors Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [5]:
# Create a k-nearest-neighbours classifier with default settings and train it
# (`fit` returns the estimator itself, so the two steps can be chained)
knn = KNeighborsClassifier().fit(X_train, y_train)

# Per-class probability estimates for the held-out rows
knn_preds = knn.predict_proba(X_test)

# ROC AUC is computed from column 1 of the probabilities,
# i.e. the estimate for the second class (label 1)
score = roc_auc_score(y_test, knn_preds[:, 1])

print("ROC AUC: %f" % score)

ROC AUC: 0.967497


# [Gaussian Naive Bayes Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)

In [6]:
# Instantiate Gaussian naive Bayes and train it on the training split in one step
naive_bayes = GaussianNB().fit(X_train, y_train)

# Predicted class probabilities for the test split (one column per class)
naive_bayes_preds = naive_bayes.predict_proba(X_test)

# Evaluate with ROC AUC, using the probability column at index 1 (label 1)
score = roc_auc_score(
    y_test,
    naive_bayes_preds[:, 1],
)

print("ROC AUC: %f" % score)

ROC AUC: 0.985568


# [Decision Tree Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [8]:
# Choose the model
# scikit-learn randomly permutes the candidate features at every split, so an
# unseeded tree can differ (and score differently) on each run. Fixing
# `random_state` makes the cell reproducible under Restart & Run All; 0 matches
# the seed already used for the train/test split.
decision_tree = DecisionTreeClassifier(random_state=0)

# Fit the model
decision_tree = decision_tree.fit(X_train, y_train)

# Make the predictions (one probability column per class)
decision_tree_preds = decision_tree.predict_proba(X_test)

# Score the predictions using the probability column at index 1 (label 1)
score = roc_auc_score(y_test, decision_tree_preds[:, 1])

print("ROC AUC: %f" % score)

ROC AUC: 0.913902


# [Random Forest Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [9]:
# Choose the model
# A random forest draws bootstrap samples and random feature subsets while
# building its trees, so without a seed the fitted model and its score change
# from run to run. Fixing `random_state` makes the cell reproducible under
# Restart & Run All; 0 matches the seed already used for the train/test split.
random_forest = RandomForestClassifier(random_state=0)

# Fit the model
random_forest = random_forest.fit(X_train, y_train)

# Make the predictions (one probability column per class)
random_forest_preds = random_forest.predict_proba(X_test)

# Score the predictions using the probability column at index 1 (label 1)
score = roc_auc_score(y_test, random_forest_preds[:, 1])

print("ROC AUC: %f" % score)

ROC AUC: 0.993462
