

# Predict whether a mammogram mass is benign or malignant

We'll be using the "mammographic masses" public dataset from the UCI repository (source: https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass)

This data contains 961 instances of masses detected in mammograms, and contains the following attributes:


   1. BI-RADS assessment: 1 to 5 (ordinal)  
   2. Age: patient's age in years (integer)
   3. Shape: mass shape: round=1 oval=2 lobular=3 irregular=4 (nominal)
   4. Margin: mass margin: circumscribed=1 microlobulated=2 obscured=3 ill-defined=4 spiculated=5 (nominal)
   5. Density: mass density high=1 iso=2 low=3 fat-containing=4 (ordinal)
   6. Severity: benign=0 or malignant=1 (binominal)
   
BI-RADS is an assesment of how confident the severity classification is; it is not a "predictive" attribute and so we will discard it. The age, shape, margin, and density attributes are the features that we will build our model with, and "severity" is the classification we will attempt to predict based on those attributes.

Although "shape" and "margin" are nominal data types, which sklearn typically doesn't deal with well, they are close enough to ordinal that we shouldn't just discard them. The "shape" for example is ordered increasingly from round to irregular.

A lot of unnecessary anguish and surgery arises from false positives arising from mammogram results. If we can build a better way to interpret them through supervised machine learning, it could improve a lot of lives.


Apply several different supervised machine learning techniques to this data set, and see which one yields the highest accuracy as measured with K-Fold cross validation (K=10). Apply:

* Decision tree
* Random forest
* KNN
* Naive Bayes
* SVM
* Logistic Regression
* And, as a bonus challenge, a neural network using Keras.



In [18]:
import pandas as pd
masses_data = pd.read_csv('mammographic_masses.data.txt')
masses_data.head()

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0


In [19]:
masses_data = pd.read_csv('mammographic_masses.data.txt', na_values = ['?'], names =['BI-RADS','age','shape','margin','density','severity'])
masses_data.head()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [20]:
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [21]:
masses_data.loc[(masses_data['age'].isnull())  |
               (masses_data['shape'].isnull()) |
               (masses_data['margin'].isnull())|
               (masses_data['density'].isnull())]

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
9,5.0,60.0,,5.0,1.0,1
12,4.0,64.0,1.0,,3.0,0
19,4.0,40.0,1.0,,,0
20,,66.0,,,1.0,1
22,4.0,43.0,1.0,,,0


In [22]:
masses_data.dropna(inplace =True)
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [23]:
all_features = masses_data[['age','shape','margin','density']].values

all_classes = masses_data['severity'].values
feature_name = ['age','shape','margin','density']
all_features

array([[ 67.,   3.,   5.,   3.],
       [ 58.,   4.,   5.,   3.],
       [ 28.,   1.,   1.,   3.],
       ..., 
       [ 64.,   4.,   5.,   3.],
       [ 66.,   4.,   5.,   3.],
       [ 62.,   3.,   3.,   3.]])

Some of our models require the input data to be normalized, so go ahead and normalize the attribute data. Hint: use preprocessing.StandardScaler().

In [24]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
all_features_scaled = scaler.fit_transform(all_features)
all_features_scaled

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ..., 
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

## Decision Trees



In [25]:
import numpy as np
from sklearn.model_selection import train_test_split

np.random.seed(1234)
(training_inputs,
testing_inputs,
training_classes,
testing_classes)= train_test_split(all_features_scaled, all_classes, train_size= 0.75, random_state =1,test_size=0.25)

Now create a DecisionTreeClassifier and fit it to your training data.

In [26]:
from sklearn.tree import DecisionTreeClassifier
clf =DecisionTreeClassifier(random_state=1)

clf.fit(training_inputs, training_classes)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

Display the resulting decision tree.

In [27]:
from IPython.display import Image  
from sklearn.externals.six import StringIO  
import pydotplus

dot_data = StringIO()  
tree.export_graphviz(clf, out_file=dot_data,  
                         feature_names=feature_name)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())  


NameError: name 'tree' is not defined

Measure the accuracy of the resulting decision tree model using your test data.

In [28]:
clf.score(testing_inputs, testing_classes)

0.73557692307692313

Now instead of a single train/test split, use K-Fold cross validation to get a better measure of your model's accuracy (K=10). Hint: use model_selection.cross_val_score

In [29]:
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(random_state=1)
cv_score=cross_val_score(clf, all_features_scaled, all_classes, cv=10)
cv_score.mean()

0.73735569455522443

Now try a RandomForestClassifier instead. Does it perform better?

In [30]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10, random_state=1)
cv_scores = cross_val_score(clf, all_features_scaled, all_classes, cv=10)

cv_scores.mean()

0.75404964806963037

## SVM



In [31]:
from sklearn import svm

C = 1.0
svc = svm.SVC(kernel='linear', C=C)

In [32]:
cv_scores = cross_val_score(svc,all_features_scaled, all_classes, cv=10)
cv_scores.mean()

0.79649888753620757

## KNN


In [33]:
from sklearn import neighbors

clf = neighbors.KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(clf, all_features_scaled, all_classes, cv=10)

cv_score.mean()

0.73735569455522443

In [34]:
for n in range(1,50):
    clf = neighbors.KNeighborsClassifier(n_neighbors=n)
    cv_scores = cross_val_score(clf, all_features_scaled, all_classes, cv=10)
    print(n,cv_scores.mean())

1 0.723912374236
2 0.688983809804
3 0.75410806991
4 0.730081300813
5 0.773546450611
6 0.762616318934
7 0.794059513315
8 0.774708240628
9 0.788020024348
10 0.785479548857
11 0.79153338091
12 0.779425716805
13 0.781908470117
14 0.791503995074
15 0.787874844325
16 0.779441109385
17 0.781807368848
18 0.775681121699
19 0.780514741894
20 0.782866658271
21 0.785392790675
22 0.78173425409
23 0.780558820648
24 0.780587506822
25 0.787817122147
26 0.786626995788
27 0.785436519598
28 0.790227110533
29 0.786597959783
30 0.787831465234
31 0.791417236892
32 0.787831465234
33 0.786597609952
34 0.786611953039
35 0.786626296125
36 0.785435819935
37 0.786684368135
38 0.78665533213
39 0.787889187412
40 0.785479199026
41 0.785464506108
42 0.781850048277
43 0.78306921064
44 0.783054867554
45 0.783054867554
46 0.785464855939
47 0.786684368135
48 0.789065320516
49 0.790299525629


## Naive Bayes



In [35]:
from sklearn.naive_bayes import MultinomialNB

scaler = preprocessing.MinMaxScaler()
all_features_minmax = scaler.fit_transform(all_features)

clf = MultinomialNB()
cv_scores = cross_val_score(clf, all_features_minmax, all_classes, cv=10)

cv_scores.mean()

0.78440556651693882

## Revisiting SVM



In [36]:
C = 1.0
svc = svm.SVC(kernel='rbf', C=C)
cv_score = cross_val_score(svc, all_features_scaled, all_classes, cv=10)
cv_score.mean()

0.80120237045743958

In [38]:
C = 1.0
svc = svm.SVC(kernel='sigmoid',C=C)
cv_score = cross_val_score(svc, all_features_scaled, all_classes, cv=10)
cv_score.mean()

0.73510557911086849

In [39]:
C=1.0
svc = svm.SVC(kernel='poly',C=C)
cv_score = cross_val_score(svc,all_features_scaled, all_classes, cv=10)
cv_score.mean()

0.79275394259966703

## Logistic Regression



In [40]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
cv_score=cross_val_score(clf, all_features_scaled, all_classes, cv=10)
cv_score.mean()

0.80735835327372207

## Neural Networks



In [47]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

def create_model():
    model = Sequential()
    model.add(Dense(6, input_dim=4, kernel_initializer='normal', activation='relu'))
    
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [48]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

estimator = KerasClassifier(build_fn = create_model, epochs=100, verbose=0)

cv_score = cross_val_score(estimator, all_features_scaled, all_classes, cv=10)
cv_score.mean()

0.81325301290994667