# 1. Import and data loading

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
data = pd.read_csv("data/breast_cancer_data.csv")
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [4]:
data.drop(columns=["Unnamed: 32", "id"], inplace=True, errors="ignore")
data.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [5]:
data.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [6]:
diagnosis = np.where(data["diagnosis"] == "M", 1, 0)
data.drop(columns="diagnosis", inplace=True, errors="ignore")

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    data, diagnosis, test_size=0.3, random_state=0
)

In [8]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_test = np_utils.to_categorical(y_test)
y_train = np_utils.to_categorical(y_train)

# Building the model

In [9]:
nb_classes = 2
input_dim = X_train.shape[1]
input_dim
model = Sequential()
model.add(Dense(128, input_dim=input_dim))
model.add(Activation("relu"))
model.add(Dropout(0.15))
model.add(Dense(nb_classes))
model.add(Activation("softmax"))

2023-02-04 20:54:32.417577: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
model.compile(
    loss="categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy", "binary_accuracy"],
)

In [11]:
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.1, verbose=2)

Epoch 1/10
23/23 - 0s - loss: 0.6161 - accuracy: 0.7318 - binary_accuracy: 0.7318 - val_loss: 0.5459 - val_accuracy: 0.7000 - val_binary_accuracy: 0.7000 - 402ms/epoch - 17ms/step
Epoch 2/10
23/23 - 0s - loss: 0.5027 - accuracy: 0.8492 - binary_accuracy: 0.8492 - val_loss: 0.4422 - val_accuracy: 0.8500 - val_binary_accuracy: 0.8500 - 29ms/epoch - 1ms/step
Epoch 3/10
23/23 - 0s - loss: 0.4191 - accuracy: 0.8994 - binary_accuracy: 0.8994 - val_loss: 0.3632 - val_accuracy: 0.9250 - val_binary_accuracy: 0.9250 - 48ms/epoch - 2ms/step
Epoch 4/10
23/23 - 0s - loss: 0.3559 - accuracy: 0.8994 - binary_accuracy: 0.8994 - val_loss: 0.3120 - val_accuracy: 0.9250 - val_binary_accuracy: 0.9250 - 49ms/epoch - 2ms/step
Epoch 5/10
23/23 - 0s - loss: 0.3094 - accuracy: 0.8966 - binary_accuracy: 0.8966 - val_loss: 0.2657 - val_accuracy: 0.9000 - val_binary_accuracy: 0.9000 - 39ms/epoch - 2ms/step
Epoch 6/10
23/23 - 0s - loss: 0.2680 - accuracy: 0.9106 - binary_accuracy: 0.9106 - val_loss: 0.2468 - val_a

<keras.callbacks.History at 0x7f18b1f5f790>

In [12]:
model.evaluate(X_test, y_test)



[0.20564697682857513, 0.9298245906829834, 0.9298245906829834]

In [13]:
import shap

In [14]:
def f(X):
    return model.predict(X)

In [16]:
explainer = shap.KernelExplainer(f, X_test)
shap_values = explainer.shap_values(X_test, n_samples=20)



Using 171 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/171 [00:00<?, ?it/s]



In [None]:
shap.force_plot(y_test, shap_values, X_test)

In [None]:
shap.summary_plot(shap_values, X_test)