In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
import pandas as pd

## 1. Data Preparation

In [2]:
# Import and load the dataset
# Load the data
data = load_breast_cancer()
X, y = load_breast_cancer(return_X_y=True)

In [3]:
# Have a look at the data
features_df = pd.DataFrame(X)
features_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
108,22.27,19.67,152.8,1509.0,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,...,28.4,28.01,206.8,2360.0,0.1701,0.6997,0.9608,0.291,0.4055,0.09789
321,20.16,19.66,131.1,1274.0,0.0802,0.08564,0.1155,0.07726,0.1928,0.05096,...,23.06,23.03,150.2,1657.0,0.1054,0.1537,0.2606,0.1425,0.3055,0.05933
354,11.14,14.07,71.24,384.6,0.07274,0.06064,0.04505,0.01471,0.169,0.06083,...,12.12,15.82,79.62,453.5,0.08864,0.1256,0.1201,0.03922,0.2576,0.07018
232,11.22,33.81,70.79,386.8,0.0778,0.03574,0.004967,0.006434,0.1845,0.05828,...,12.36,41.78,78.44,470.9,0.09994,0.06885,0.02318,0.03002,0.2911,0.07307
42,19.07,24.81,128.3,1104.0,0.09081,0.219,0.2107,0.09961,0.231,0.06343,...,24.09,33.17,177.4,1651.0,0.1247,0.7444,0.7242,0.2493,0.467,0.1038
451,19.59,25.0,127.7,1191.0,0.1032,0.09871,0.1655,0.09063,0.1663,0.05391,...,21.44,30.96,139.8,1421.0,0.1528,0.1845,0.3977,0.1466,0.2293,0.06091
339,23.51,24.27,155.1,1747.0,0.1069,0.1283,0.2308,0.141,0.1797,0.05506,...,30.67,30.73,202.4,2906.0,0.1515,0.2678,0.4819,0.2089,0.2593,0.07738
327,12.03,17.93,76.09,446.0,0.07683,0.03892,0.001546,0.005592,0.1382,0.0607,...,13.07,22.25,82.74,523.4,0.1013,0.0739,0.007732,0.02796,0.2171,0.07037
52,11.94,18.24,75.71,437.6,0.08261,0.04751,0.01972,0.01349,0.1868,0.0611,...,13.1,21.33,83.67,527.2,0.1144,0.08906,0.09203,0.06296,0.2785,0.07408
520,9.295,13.9,59.96,257.8,0.1371,0.1225,0.03332,0.02421,0.2197,0.07696,...,10.57,17.84,67.84,326.6,0.185,0.2097,0.09996,0.07262,0.3681,0.08982


In [4]:
target_df = pd.DataFrame(y)
target_df.sample(10)

Unnamed: 0,0
115,1
498,0
319,1
527,1
564,0
113,1
27,0
421,1
268,1
521,0


In [5]:
data['feature_names']

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [6]:
features_df.columns = data['feature_names']

In [7]:
features_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [10]:
target_df.columns = ['has_cancer']

In [11]:
target_df.head()

Unnamed: 0,has_cancer
0,0
1,0
2,0
3,0
4,0


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [13]:
# clean data ready to split, let's do that
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## 2. Modelling

Now we want to build a Neural Network to perform this regression task. We will build a Neural Networks with 5 hidden layers of 100 units each.

In [14]:
import tensorflow as tf

In [15]:
# we first import necessary layers and sequential model from keras API
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Sequential

In [17]:
# Build model architecture
def model(n_features, hl_activation):
    model = Sequential(
        [
            Input(shape=(n_features,)),
            Dense(100, activation=hl_activation[0]),
            Dense(100, activation=hl_activation[1]),
            Dense(100, activation=hl_activation[2]),
            Dense(100, activation=hl_activation[3]),
            Dense(100, activation=hl_activation[4]),
            Dense(1, activation='sigmoid')
        ]
    )
    return model

In [19]:
# some hidden-layers activation functions i'm testing
hl_activation = ['tanh', 'relu', 'tanh', 'relu', 'tanh']

In [20]:
n_features = X_train.shape[1]

# Initialize the model
first_dnn_model = model(n_features, hl_activation)

# Get model summary to check defined layers and number of parameters
first_dnn_model.summary()

2024-08-29 00:22:30.968585: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-08-29 00:22:30.968610: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-29 00:22:30.968619: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-29 00:22:30.968634: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-29 00:22:30.968647: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Now let's compile the model and then fit it

In [21]:
# Important metrics for classification task
from tensorflow.keras.metrics import Precision, Recall, AUC

In [22]:
# Compile the model with binary cross-entropy (classification) and some metrics
first_dnn_model.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['accuracy', Precision(), Recall(), AUC()])

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Now fit the model on 500 epoches with a batch size of 64
history = first_dnn_model.fit(x=X_train, y=y_train, validation_split=.3, epochs=500, batch_size=64)

# You can add the test/validation set into the fit: it will give insights on this dataset too

Epoch 1/500


2024-08-29 00:25:33.139957: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 552ms/step - accuracy: 0.4272 - auc: 0.6207 - loss: 0.7119 - precision: 0.8710 - recall: 0.1020 - val_accuracy: 0.7153 - val_auc: 0.9416 - val_loss: 0.6315 - val_precision: 0.9600 - val_recall: 0.5647
Epoch 2/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.7674 - auc: 0.8937 - loss: 0.6264 - precision: 0.8896 - recall: 0.7005 - val_accuracy: 0.9635 - val_auc: 0.9814 - val_loss: 0.5634 - val_precision: 0.9545 - val_recall: 0.9882
Epoch 3/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.9168 - auc: 0.9731 - loss: 0.5628 - precision: 0.9033 - recall: 0.9734 - val_accuracy: 0.9562 - val_auc: 0.9872 - val_loss: 0.5091 - val_precision: 0.9438 - val_recall: 0.9882
Epoch 4/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.9162 - auc: 0.9860 - loss: 0.5171 - precision: 0.8913 - recall: 0.9880 - val_accuracy: 0.94

Once the model's ready you can get training results and evaluate it on test set.

In [23]:
# Evaluation
loss, accuracy, precision, recall, AUC_coeff = first_dnn_model.evaluate(X_test, y_test, verbose=0)
print('loss is:', loss)
print('accuracy is:', accuracy)
print('Precision is:', precision)
print('Recall is:', recall)
print('AUC is:', AUC_coeff)

loss is: 0.12888556718826294
accuracy is: 0.9736841917037964
Precision is: 0.9599999785423279
Recall is: 1.0
AUC is: 0.9745370149612427


Great performance ! But could we just use simple machine learning algorithms such as logistic regression to perform this task ? Let's see that :

In [24]:
# Initialize a logistic regression model (you can add a random state if you want)
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [25]:
# Now quickly we can fit and evaluate the model
from sklearn.metrics import accuracy_score, classification_report
lr_model.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)

print("LR accuracy:", accuracy_score(lr_pred, y_test))
print("LR classification report:\n", classification_report(lr_pred, y_test))

LR accuracy: 0.9736842105263158
LR classification report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96        41
           1       0.99      0.97      0.98        73

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



The deep neural net is little bit more powerful than the simple logistic regression classifier. Keep in mind that the choice of a suitable model depends on the scale of the project, the dataset, the inference intended by the user and much more other factors.