<a href="https://colab.research.google.com/github/Keonapang/April2023/blob/main/A03_Breast_Cancer_Detection_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Important libraries (keras and scikit-learn)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten 
from keras.layers import Activation

In [None]:
# Load the Breast Cancer Classification data from the .csv file on the mounted Drive
DATASET_PATH = '/content/drive/MyDrive/Colab-Notebooks/BCW_dataset.csv'
data_raw = pd.read_csv(DATASET_PATH, sep=',', header=0, index_col=None)

# Preview: head() returns only the first 5 rows of the table
print(data_raw.head())

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [None]:
# Data cleaning: build the feature matrix X and the label vector y

# 1. Columns that are not model inputs: the 'Unnamed: 32' column, the id and the label itself
drop_columns = ['Unnamed: 32', 'id', 'diagnosis']

# Label encoding: Malignant ('M') -> 0, Benign ('B') -> 1
d = dict(M=0, B=1)

# 2. Labels: replace every letter in the 'diagnosis' column with its integer code
y = data_raw['diagnosis'].map(d)

# Features: everything that is left after removing the three columns listed above
X = data_raw.drop(columns=drop_columns)
print(X.head()) # head() shows only the first 5 rows

print(' ') # spacer line
print('X.shape: ', X.shape) # (rows, feature columns)
print('y.shape: ', y.shape) # y is a 1-D Series: one label per row

   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   fractal_dimension_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0           

In [None]:
# 3. Sanity check: inspect the encoded labels and how many there are
label_values = y.values
print(y) # the 'diagnosis' column is now a Series of 0s and 1s
print(' ') # spacer line
print('All', len(label_values),'Diagnosis printed out (y.values):')
print(label_values) # every encoded label as a NumPy array

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Scale the 30 input features and one-hot encode the labels

# One-hot encode the 0/1 labels into two columns (one per class).
# A pandas Series does not support multi-dimensional indexing such as y[:, None]
# (deprecated, and rejected by newer pandas releases), so the labels are turned
# into a NumPy column vector explicitly.
encode_array = OneHotEncoder()
y_2OP = encode_array.fit_transform(y.to_numpy().reshape(-1, 1)).toarray()
#print(y_2OP.shape) #(569, 2)

# Split the dataset into training (75%) and test (25%) BEFORE scaling, so the
# scaler never sees the test rows (fitting it on all rows leaks test statistics
# into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y_2OP, test_size=0.25, random_state=0)

# Fit the scaler on the training rows only, then apply the same transform everywhere
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X) # full dataset on the training scale

In [None]:
# DO NOT RUN (another method, kept for reference)

# Mean normalization of the X dataset (X_train and X_test): every feature is
# centred on its training mean and divided by its training range (max - min),
# which makes features comparable; training values end up within [-1, 1].
# The test set reuses the TRAINING statistics so both sets share one scale.
train_mean = X_train.mean(axis=0)
train_range = X_train.max(axis=0) - X_train.min(axis=0)
train_range = np.where(train_range == 0, 1, train_range) # constant feature: avoid dividing by zero
X_train_n = (X_train - train_mean) / train_range
X_test_n = (X_test - train_mean) / train_range
#print(X_train_n.shape)    #(training rows, feature columns)
#print(y_train.shape)      #(training rows, 2) one-hot labels
#print('Diagnosis of the patients 0, 1, 2:')
#print(y[0:3])              #testing: prints diagnosis of the patients 0, 1, 2
#print('X_train:', X_train)
print('X_train.mean():', X_train.mean())
print('X_train.max()',X_train.max())
#print(X_train_n)
print('y.shape:', y.shape)
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)

In [None]:
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

ANN model #1 and #2

In [None]:
# 1. Define ANN model 1 structure
# simple 1 hidden layer (50 hidden nodes by default); 2 outputs with softmax
# using all 30 features as input
num_hidden_nodes = 50 #can change
num_output = 2
num_input = 30

def model_1(hidden_nodes=num_hidden_nodes, input_dim=num_input, output_dim=num_output):
    """Build and compile ANN model 1 (one hidden layer, softmax output).

    The defaults are taken from the constants above when this cell runs, so a
    later cell that reassigns num_input / num_hidden_nodes / num_output cannot
    silently change the network this function builds.

    Args:
        hidden_nodes: number of units in the single hidden (ReLU) layer.
        input_dim: number of input features.
        output_dim: number of output classes (softmax units).

    Returns:
        The compiled Sequential model.
    """
    # create model
    model = Sequential()
    model.add(Dense(hidden_nodes, input_dim=input_dim, activation='relu')) #Dense : a regular fully connected layer
    model.add(Dense(output_dim, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Choice of optimizer: adam (adaptive moment estimation), AdaGrad (adaptive learning rate),
    # sgd (Stochastic gradient descent), RMSprop (similar to AdaGrad), Adadelta (adaptive delta) ...
    return model

num_hidden_node1 = 40 #can change
num_hidden_node2 = 40 #can change
num_hidden_node3 = 40 #can change
num_output = 2
num_input = 30

def model_2(hidden_nodes1=num_hidden_node1, hidden_nodes2=num_hidden_node2,
            hidden_nodes3=num_hidden_node3, input_dim=num_input, output_dim=num_output):
    """Build and compile ANN model 2 (three hidden layers, softmax output).

    The defaults are taken from the constants above when this cell runs, so a
    later cell that reassigns those names cannot silently change the network
    this function builds.

    Args:
        hidden_nodes1: number of units in the first hidden (ReLU) layer.
        hidden_nodes2: number of units in the second hidden (ReLU) layer.
        hidden_nodes3: number of units in the third hidden (ReLU) layer.
        input_dim: number of input features.
        output_dim: number of output classes (softmax units).

    Returns:
        The compiled Sequential model.
    """
    # create model with 3 hidden layers
    model = Sequential()
    model.add(Dense(hidden_nodes1, input_dim=input_dim, activation='relu')) #Dense : a regular fully connected layer
    model.add(Dense(hidden_nodes2, activation='relu'))
    model.add(Dense(hidden_nodes3, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Choice of optimizer: adam (adaptive moment estimation), AdaGrad (adaptive learning rate),
    # sgd (Stochastic gradient descent), RMSprop (similar to AdaGrad), Adadelta (adaptive delta) ...
    return model

In [None]:
# Build the network to train: model_2() here, or swap in model_1()
model = model_2()

# Train the ANN on the training split; the test split is passed as validation
# data, so its loss and accuracy are reported after every epoch.
# A large gap between the validation and training curves indicates overfitting.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=60,
    batch_size=8,
    verbose=2,
)

# Test the trained ANN with X_test; score[0] is the loss and score[1] the accuracy
score = model.evaluate(X_test, y_test, verbose=0)
for label, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(label, value)

In [None]:
# Plotting ANN model_1 or model_2: show the performance of the ANN during training.
# The per-epoch values were saved in "history" by model.fit.

metrics = history.history
epochs_run = history.epoch

# Accuracy curves: each series is plotted against the epoch axis explicitly
fig, ax = plt.subplots()
ax.plot(epochs_run, metrics['accuracy'], label='accuracy')
ax.plot(epochs_run, metrics['val_accuracy'], label='val_accuracy')
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_title('Training vs. validation accuracy')
ax.legend()
plt.show()

# Loss curves
fig, ax = plt.subplots()
ax.plot(epochs_run, metrics['loss'], label='loss')
ax.plot(epochs_run, metrics['val_loss'], label='val_loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()

In [None]:
# Run the trained ANN on the first test sample only.
# The slice [0:1] keeps the leading batch axis, so predict receives one row.
first_sample = X_test[0:1]
y_ANN_output = model.predict(first_sample)
print('y_ANN_output = ',y_ANN_output)

In [None]:
# To check the softmax output from the ANN: the class probabilities should sum to ~1.
# The network has 2 outputs, so sum over whatever is there instead of indexing
# a fixed number of entries.
check_sum = y_ANN_output[0]
print('check_sum = ',check_sum.sum())

# To get the reference class label from y_test (position of the 1 in the one-hot row)
y_ref_result = y_test[0].argmax().item()
print('y_ref_result = ',y_ref_result)

# To get the class label from the ANN (position of the largest softmax output)
y_ANN_result = y_ANN_output.argmax().item()
print('y_ANN_result = ',y_ANN_result)

ANN Model #3 and #4

 9 features:
worst concave points ; concave points error;  concavity error

Done: worst smoothness; worst symmetry; worst texture; compactness error; mean symmetry; radius error

In [None]:
# Select the 9 features used by models 3 and 4

print('X.shape: ',X.shape) # full feature matrix, for comparison

# NOTE: despite its name, keep_9_features lists the columns to DROP from
# data_raw (the id, the label, 'Unnamed: 32' and 21 feature columns).
# The columns that remain after the drop are the 9 features that are kept.
keep_9_features = [
    'id',
    'diagnosis',
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'perimeter_worst',
    'compactness_worst',
    'concavity_worst',
    'fractal_dimension_worst',
    'Unnamed: 32',
    'area_worst',
    'fractal_dimension_mean',
    'symmetry_se',
    'radius_worst',
    'smoothness_se',
    'area_se',
    'texture_se',
    'fractal_dimension_se',
    'perimeter_se',
]

XX = data_raw.drop(columns=keep_9_features) # only the 9 selected feature columns remain
print(XX)
print('XX.shape:', XX.shape)


X.shape:  (569, 14)
     symmetry_mean  radius_se  compactness_se  concavity_se  \
0           0.2419     1.0950         0.04904       0.05373   
1           0.1812     0.5435         0.01308       0.01860   
2           0.2069     0.7456         0.04006       0.03832   
3           0.2597     0.4956         0.07458       0.05661   
4           0.1809     0.7572         0.02461       0.05688   
..             ...        ...             ...           ...   
564         0.1726     1.1760         0.02891       0.05198   
565         0.1752     0.7655         0.02423       0.03950   
566         0.1590     0.4564         0.03731       0.04730   
567         0.2397     0.7260         0.06158       0.07117   
568         0.1587     0.3857         0.00466       0.00000   

     concave points_se  texture_worst  smoothness_worst  concave points_worst  \
0              0.01587          17.33           0.16220                0.2654   
1              0.01340          23.41           0.12380      

In [None]:
# Scale the 9 selected input features and one-hot encode the labels

# One-hot encode the 0/1 labels into two columns (one per class).
# A pandas Series does not support multi-dimensional indexing such as y[:, None]
# (deprecated, and rejected by newer pandas releases), so the labels are turned
# into a NumPy column vector explicitly.
encode_array = OneHotEncoder()
y_2OP = encode_array.fit_transform(y.to_numpy().reshape(-1, 1)).toarray()
#print(y_2OP.shape) #(569, 2)

# Split the dataset into training (75%) and test (25%) BEFORE scaling, so the
# scaler never sees the test rows (fitting it on all rows leaks test statistics
# into training).
XX_train_raw, XX_test_raw, y_train, y_test = train_test_split(XX, y_2OP, test_size=0.25, random_state=0)

# Fit the scaler on the training rows only, then apply the same transform everywhere
scaler = StandardScaler()
XX_train = scaler.fit_transform(XX_train_raw)
XX_test = scaler.transform(XX_test_raw)
XX_scaled = scaler.transform(XX) # full dataset on the training scale

  y_2OP = encode_array.fit_transform(y[:, None]).toarray()


In [None]:
# 3. Define ANN model 3 and model 4 structure
# model 3: 1 hidden layer (50 hidden nodes by default); 2 outputs with softmax
# 2 outputs, one-hot encoded with the columns ordered by class value:
#   class 0 ('malignant') is denoted as [1 0]; class 1 ('benign') is denoted as [0 1]
# using the 9 selected features

num_hidden_nodes = 50 #can change
num_output = 2
num_input = 9 #features

def model_3(hidden_nodes=num_hidden_nodes, input_dim=num_input, output_dim=num_output):
    """Build and compile ANN model 3 (one hidden layer, softmax output, 9 inputs).

    The defaults are taken from the constants above when this cell runs, so
    reassigning those names elsewhere cannot silently change the network this
    function builds.

    Args:
        hidden_nodes: number of units in the single hidden (ReLU) layer.
        input_dim: number of input features.
        output_dim: number of output classes (softmax units).

    Returns:
        The compiled Sequential model.
    """
    # create model
    model = Sequential()
    model.add(Dense(hidden_nodes, input_dim=input_dim, activation='relu')) #Dense : a regular fully connected layer
    model.add(Dense(output_dim, activation='softmax'))
    # Compile model. The labels fed to fit() are one-hot rows, which is the
    # format 'categorical_crossentropy' expects; the sparse variant expects
    # integer class ids instead.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Choice of optimizer: adam (adaptive moment estimation), AdaGrad (adaptive learning rate),
    # sgd (Stochastic gradient descent), RMSprop (similar to AdaGrad), Adadelta (adaptive delta) ...
    return model

#-----------------------------------------------------------------
num_hidden_node1 = 40 #can change
num_hidden_node2 = 40 #can change
num_hidden_node3 = 40 #can change
num_output = 2
num_input = 9 #features

def model_4(hidden_nodes1=num_hidden_node1, hidden_nodes2=num_hidden_node2,
            hidden_nodes3=num_hidden_node3, input_dim=num_input, output_dim=num_output):
    """Build and compile ANN model 4 (three hidden layers, softmax output, 9 inputs).

    The defaults are taken from the constants above when this cell runs, so
    reassigning those names elsewhere cannot silently change the network this
    function builds.

    Args:
        hidden_nodes1: number of units in the first hidden (ReLU) layer.
        hidden_nodes2: number of units in the second hidden (ReLU) layer.
        hidden_nodes3: number of units in the third hidden (ReLU) layer.
        input_dim: number of input features.
        output_dim: number of output classes (softmax units).

    Returns:
        The compiled Sequential model.
    """
    # create model with 3 hidden layers
    model = Sequential()
    model.add(Dense(hidden_nodes1, input_dim=input_dim, activation='relu')) #Dense : a regular fully connected layer
    model.add(Dense(hidden_nodes2, activation='relu'))
    model.add(Dense(hidden_nodes3, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Choice of optimizer: adam (adaptive moment estimation), AdaGrad (adaptive learning rate),
    # sgd (Stochastic gradient descent), RMSprop (similar to AdaGrad), Adadelta (adaptive delta) ...
    return model

In [None]:
# Build the 9-feature network to train: model_3() here, or swap in model_4()
model = model_3()

# Train the ANN on the 9-feature training split; the test split is passed as
# validation data, so its loss and accuracy are reported after every epoch.
# A large gap between the validation and training curves indicates overfitting.
history = model.fit(
    XX_train,
    y_train,
    validation_data=(XX_test, y_test),
    epochs=60,
    batch_size=8,
    verbose=2,
)

# Test the trained ANN with XX_test; score[0] is the loss and score[1] the accuracy
score = model.evaluate(XX_test, y_test, verbose=0)
for label, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(label, value)

Epoch 1/60
54/54 - 1s - loss: 0.6408 - accuracy: 0.6362 - val_loss: 0.4576 - val_accuracy: 0.8182 - 864ms/epoch - 16ms/step
Epoch 2/60
54/54 - 0s - loss: 0.3729 - accuracy: 0.8897 - val_loss: 0.3169 - val_accuracy: 0.8741 - 118ms/epoch - 2ms/step
Epoch 3/60
54/54 - 0s - loss: 0.2643 - accuracy: 0.9202 - val_loss: 0.2554 - val_accuracy: 0.9021 - 164ms/epoch - 3ms/step
Epoch 4/60
54/54 - 0s - loss: 0.2082 - accuracy: 0.9390 - val_loss: 0.2171 - val_accuracy: 0.9231 - 134ms/epoch - 2ms/step
Epoch 5/60
54/54 - 0s - loss: 0.1741 - accuracy: 0.9531 - val_loss: 0.1939 - val_accuracy: 0.9441 - 138ms/epoch - 3ms/step
Epoch 6/60
54/54 - 0s - loss: 0.1517 - accuracy: 0.9531 - val_loss: 0.1771 - val_accuracy: 0.9580 - 122ms/epoch - 2ms/step
Epoch 7/60
54/54 - 0s - loss: 0.1357 - accuracy: 0.9554 - val_loss: 0.1661 - val_accuracy: 0.9580 - 118ms/epoch - 2ms/step
Epoch 8/60
54/54 - 0s - loss: 0.1231 - accuracy: 0.9577 - val_loss: 0.1579 - val_accuracy: 0.9580 - 124ms/epoch - 2ms/step
Epoch 9/60
54/5

Analyze test results from model #1 and #2 using confusion matrix, recall score and f1_score.

In [None]:
# Confusion matrix and summary scores
# NOTE(review): `actual` and `predicted` below are randomly generated
# placeholders, not model output. To score a trained network, replace them
# with the true and predicted class ids, e.g.
#   actual = y_test.argmax(axis=1)
#   predicted = model.predict(<test features matching the model>).argmax(axis=1)
#
# sklearn.metrics is imported under an alias because the plotting cells bind
# the name `metrics` to the Keras training-history dictionary.
from sklearn import metrics as sk_metrics

rng = np.random.default_rng(0) # fixed seed so the placeholder data is reproducible
actual = rng.binomial(1, 0.9, size=1000)
predicted = rng.binomial(1, 0.9, size=1000)

# Stored as cm_values so the imported confusion_matrix function is not overwritten
cm_values = sk_metrics.confusion_matrix(actual, predicted)
cm_display = sk_metrics.ConfusionMatrixDisplay(confusion_matrix=cm_values, display_labels=[False, True])
cm_display.plot()
plt.show()

#accuracy
Accuracy = sk_metrics.accuracy_score(actual, predicted)

#precision
ANN1_precision = sk_metrics.precision_score(actual, predicted)
Precision = ANN1_precision

#recall score
ANN1_recall = sk_metrics.recall_score(actual, predicted)

#F1-score is the harmonic mean of the precision and recall:
#F1-score = 2 * ((Precision * Sensitivity) / (Precision + Sensitivity))
#f1_score is used instead of the hand-written formula so a zero precision or
#recall does not raise ZeroDivisionError
ANN1_F1 = sk_metrics.f1_score(actual, predicted)
#F1-score = 2 * ((Precision * Sensitivity) / (Precision + Sensitivity))



NameError: ignored

In [None]:
# To show the performance of the ANN during training.
# The per-epoch values were saved in "history" by model.fit.

metrics = history.history
epochs_run = history.epoch

# Accuracy curves: each series is plotted against the epoch axis explicitly
fig, ax = plt.subplots()
ax.plot(epochs_run, metrics['accuracy'], label='accuracy')
ax.plot(epochs_run, metrics['val_accuracy'], label='val_accuracy')
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_title('Training vs. validation accuracy')
ax.legend()
plt.show()

# Loss curves
fig, ax = plt.subplots()
ax.plot(epochs_run, metrics['loss'], label='loss')
ax.plot(epochs_run, metrics['val_loss'], label='val_loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()

Analyze test results of Model #3 and Model #4 using confusion matrix, recall score and f1_score.