# Data
#### We'll be using the "mammographic masses" public dataset from the UCI repository (source: https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass)

The data is stored in the ./data/ directory

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

In [2]:
headers = ["BI-RADS", "Age", "Shape", "Margin", "Density", "Severity"]
dataset = pd.read_csv('./data/mammographic_masses.data', header=None, names=headers, na_values='?')
dataset.head()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [3]:
dataset.describe()

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [4]:
dataset.isna().sum()

BI-RADS      2
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

This is a binary classification problem, our goal is to correctly predict the severity of a cancer based on the other features. 

The data work-flow is inspired by this video: https://www.youtube.com/watch?v=MpFZUshKypk

In [6]:
def stage_details(f):
    def wrapper(dataframe, *args, **kwargs):
        result = f(dataframe, *args, **kwargs)
        print(f'<{f.__name__}> dataframe shape: {result.shape}')
        return result
    
    return wrapper

In [7]:
@stage_details
def start_pipeline(dataframe):
    return dataframe.copy()

@stage_details
def format_data(dataframe):
    dataframe = dataframe.drop(columns=["BI-RADS"])
    return dataframe

@stage_details
def clean_data(dataframe):
    imp = IterativeImputer(max_iter=10, random_state=0)
    clean = dataframe.dropna()
    imp.fit(clean)
    imputed = np.round(imp.transform(dataframe))
    
    imputed_dataframe = pd.DataFrame(data=imputed, columns=["Age", "Shape", "Margin", "Density", "Severity"])
    return imputed_dataframe

@stage_details
def normalise_data(dataframe):
    scaler = StandardScaler()
    
    labels = dataframe['Severity']
    dataframe = dataframe.drop(columns=['Severity'])
    
    scaler.fit(dataframe)
    normalised_data = scaler.transform(dataframe)
    normalised_dataframe = pd.DataFrame(data=normalised_data, columns=["Age", "Shape", "Margin", "Density"])
    normalised_dataframe['Severity'] = labels
    return normalised_dataframe

@stage_details
def shuffle_data(dataframe):
    return dataframe.sample(frac=1)

@stage_details
def visualise_data(dataframe):
    sns.pairplot(dataframe, hue='Severity', height=2.5)
    return dataframe


In [8]:
prepared_data = (dataset.pipe(start_pipeline)
        .pipe(format_data)
        .pipe(clean_data)
        .pipe(normalise_data)
        .pipe(shuffle_data))

X = prepared_data.drop(columns=['Severity'])
Y = prepared_data['Severity']

<start_pipeline> dataframe shape: (961, 6)
<format_data> dataframe shape: (961, 5)
<clean_data> dataframe shape: (961, 5)
<normalise_data> dataframe shape: (961, 5)
<shuffle_data> dataframe shape: (961, 5)


# Approach \#1, Decision Tree

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
TrainX, TestX, TrainY, TestY = train_test_split(X, Y, train_size=0.25, shuffle=True)

print(f'Original: X: {X.shape}, Y: {Y.shape}')
print(f'Train: X: {TrainX.shape}, Y: {TrainY.shape}.')
print(f'Test: X: {TestX.shape}, Y: {TestY.shape}.')

Original: X: (961, 4), Y: (961,)
Train: X: (240, 4), Y: (240,).
Test: X: (721, 4), Y: (721,).


In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
Tree = DecisionTreeClassifier()
s = Tree.fit(TrainX, TrainY)


In [13]:
s.score(TestX, TestY)

0.7226074895977809

Use cross validation score instead of naive train-test split 

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
scores = cross_val_score(s, TestX, y=TestY, cv=10, scoring='accuracy')
print(f"accuracy: {np.round(scores.mean(), decimals=3)} (+/- {np.round(scores.std() * 2, decimals=3)})")

accuracy: 0.742 (+/- 0.078)


In [16]:
from IPython.display import Image  
from sklearn import tree
from sklearn.externals.six import StringIO  
import pydotplus

dot_data = StringIO()  
tree.export_graphviz(s, out_file=dot_data,  
                         feature_names=["Age", "Shape", "Margin", "Density"])  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
with open("tree.png", "wb") as png:
    png.write(graph.create_png())

img



NameError: name 'img' is not defined

### Now to use a RandomForestClassifier instead

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
RFclassifier = RandomForestClassifier(n_estimators=100)
RFclassifier.fit(TrainX, TrainY)

RFscores = cross_val_score(RFclassifier, TestX, y=TestY, cv=10, scoring='accuracy')
print(f"accuracy: {np.round(RFscores.mean(), decimals=3)} (+/- {np.round(RFscores.std() * 2, decimals=3)})")

accuracy: 0.771 (+/- 0.102)


# Approach \#2, SVM classifier

In [19]:
from sklearn.svm import SVC

In [20]:
SVClassifier = SVC(C=1, kernel='rbf', gamma='auto')
SVClassifier.fit(TrainX, TrainY)
SVCscores = cross_val_score(SVClassifier, TestX, y=TestY, cv=10, scoring='accuracy')
print(f"accuracy: {np.round(SVCscores.mean(), decimals=3)} (+/- {np.round(SVCscores.std() * 2, decimals=3)})")

accuracy: 0.792 (+/- 0.088)


# Approach \#3, K-Nearest-Neighbor classifier

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [22]:
best_acc = 0.0
best_acc_i = None
for i in range(1, 50):
    KNNClassifier = KNeighborsClassifier(n_neighbors=i)
    KNNClassifier.fit(TrainX, TrainY)
    KNNScores = cross_val_score(KNNClassifier, TestX, y=TestY, cv=10, scoring='accuracy')
    if np.round(KNNScores.mean(), decimals=3) > best_acc:
        best_acc = np.round(KNNScores.mean(), decimals=3)
        best_acc_i = i
        print(f'i: {i}')
        print(f"accuracy: {np.round(KNNScores.mean(), decimals=3)} (+/- {np.round(KNNScores.std() * 2, decimals=3)})")

i: 1
accuracy: 0.725 (+/- 0.086)
i: 3
accuracy: 0.771 (+/- 0.082)
i: 5
accuracy: 0.784 (+/- 0.064)
i: 6
accuracy: 0.785 (+/- 0.082)
i: 7
accuracy: 0.797 (+/- 0.096)
i: 13
accuracy: 0.8 (+/- 0.067)
i: 15
accuracy: 0.802 (+/- 0.087)


# Approach \#4, Naive Bayes classifier

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

In [24]:
BayesClassifier = MultinomialNB()
scaler = MinMaxScaler()
scaler = scaler.fit(X)
scaledX = scaler.transform(X)

BayesTrainX, BayesTestX, BayesTrainY, BayesTestY = train_test_split(scaledX, Y, train_size=0.25, shuffle=True)

BayesClassifier = BayesClassifier.fit(BayesTrainX, BayesTrainY)

BayesScores = cross_val_score(BayesClassifier, BayesTestX, y=BayesTestY, cv=10, scoring='accuracy')
print(f"accuracy: {np.round(BayesScores.mean(), decimals=3)} (+/- {np.round(BayesScores.std() * 2, decimals=3)})")

accuracy: 0.786 (+/- 0.092)


# Approach \#5, Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
RegressionClassifier = LogisticRegression(solver='liblinear')
RegressionClassifier.fit(TrainX, TrainY)
RegressionScores = cross_val_score(RegressionClassifier, TestX, TestY, cv=10)
print(f"accuracy: {np.round(RegressionScores.mean(), decimals=3)} (+/- {np.round(RegressionScores.std() * 2, decimals=3)})")

accuracy: 0.799 (+/- 0.102)


# Approach \#6, Neural Network

In [27]:
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [28]:
def create_model():
    input_1 = Input(shape=(4,))
    
    dense_1 = Dense(256, activation='relu')(input_1)
    dropout_1 = Dropout(0.4)(dense_1)
    dense_2 = Dense(256, activation='relu')(dropout_1)
    dropout_2 = Dropout(0.4)(dense_2)
    
    output_1 = Dense(1, activation='sigmoid')(dropout_2)
    model = Model(inputs=[input_1], outputs=[output_1])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [32]:
type(np.array(TrainY))

numpy.ndarray

In [33]:
NNClassifier = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=2)
NNClassifier.fit(np.array(TrainX), np.array(TrainY))

Train on 240 samples
Epoch 1/100
240/240 - 1s - loss: 0.5740 - accuracy: 0.7375
Epoch 2/100
240/240 - 0s - loss: 0.4745 - accuracy: 0.7917
Epoch 3/100
240/240 - 0s - loss: 0.4660 - accuracy: 0.7875
Epoch 4/100
240/240 - 0s - loss: 0.4633 - accuracy: 0.8000
Epoch 5/100
240/240 - 0s - loss: 0.4559 - accuracy: 0.7958
Epoch 6/100
240/240 - 0s - loss: 0.4499 - accuracy: 0.8083
Epoch 7/100
240/240 - 0s - loss: 0.4296 - accuracy: 0.8250
Epoch 8/100
240/240 - 0s - loss: 0.4594 - accuracy: 0.8167
Epoch 9/100
240/240 - 0s - loss: 0.4352 - accuracy: 0.8208
Epoch 10/100
240/240 - 0s - loss: 0.4444 - accuracy: 0.8250
Epoch 11/100
240/240 - 0s - loss: 0.4509 - accuracy: 0.8292
Epoch 12/100
240/240 - 0s - loss: 0.4514 - accuracy: 0.8167
Epoch 13/100
240/240 - 0s - loss: 0.4363 - accuracy: 0.8250
Epoch 14/100
240/240 - 0s - loss: 0.4344 - accuracy: 0.8292
Epoch 15/100
240/240 - 0s - loss: 0.4261 - accuracy: 0.8458
Epoch 16/100
240/240 - 0s - loss: 0.4329 - accuracy: 0.8333
Epoch 17/100
240/240 - 0s - 

<tensorflow.python.keras.callbacks.History at 0x7f7130409d50>

In [None]:
NNScores = cross_val_score(NNClassifier, np.array(TestX), np.array(TestY), cv=10)
print(f"accuracy: {np.round(NNScores.mean(), decimals=3)} (+/- {np.round(NNScores.std() * 2, decimals=3)})")

Train on 648 samples
Epoch 1/100
648/648 - 1s - loss: 0.4881 - accuracy: 0.7886
Epoch 2/100
648/648 - 0s - loss: 0.4664 - accuracy: 0.7917
Epoch 3/100
648/648 - 0s - loss: 0.4594 - accuracy: 0.8009
Epoch 4/100
648/648 - 0s - loss: 0.4519 - accuracy: 0.8086
Epoch 5/100
648/648 - 0s - loss: 0.4460 - accuracy: 0.8133
Epoch 6/100
648/648 - 0s - loss: 0.4514 - accuracy: 0.8133
Epoch 7/100
648/648 - 0s - loss: 0.4504 - accuracy: 0.8071
Epoch 8/100
648/648 - 0s - loss: 0.4529 - accuracy: 0.8117
Epoch 9/100
648/648 - 0s - loss: 0.4501 - accuracy: 0.8056
Epoch 10/100
648/648 - 0s - loss: 0.4496 - accuracy: 0.8056
Epoch 11/100
648/648 - 0s - loss: 0.4542 - accuracy: 0.8056
Epoch 12/100
648/648 - 0s - loss: 0.4389 - accuracy: 0.8056
Epoch 13/100
648/648 - 0s - loss: 0.4421 - accuracy: 0.8102
Epoch 14/100
648/648 - 0s - loss: 0.4393 - accuracy: 0.8117
Epoch 15/100
648/648 - 0s - loss: 0.4375 - accuracy: 0.8102
Epoch 16/100
648/648 - 0s - loss: 0.4302 - accuracy: 0.8194
Epoch 17/100
648/648 - 0s - 