In [1]:
# import required libraries
import numpy as np
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Activation
import tensorflow as tf
tf.keras.utils.set_random_seed(1)

In [2]:
# loading files
from google.colab import files
uploaded = files.upload()

Saving BRCA.csv to BRCA.csv


In [3]:
# Reading csv into a dataframe and setting the index as the Patient ID
cancer_type_df = pd.read_csv('BRCA.csv').set_index('Patient_ID')
cancer_type_df.head()

Unnamed: 0_level_0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [4]:
# Dropping rows that have empty cells
cancer_type_df.dropna(inplace=True)

In [5]:
# Checking amount of values in the Histology column
cancer_type_df['Histology'].value_counts()

Infiltrating Ductal Carcinoma     224
Infiltrating Lobular Carcinoma     81
Mucinous Carcinoma                 12
Name: Histology, dtype: int64

In [6]:
# Checking data types of the dataframe
cancer_type_df.dtypes

Age                   float64
Gender                 object
Protein1              float64
Protein2              float64
Protein3              float64
Protein4              float64
Tumour_Stage           object
Histology              object
ER status              object
PR status              object
HER2 status            object
Surgery_type           object
Date_of_Surgery        object
Date_of_Last_Visit     object
Patient_Status         object
dtype: object

In [7]:
# setting and instance of One Hot Encoder
enc = OneHotEncoder(sparse = False)

In [8]:
# Dropping un-wanted columns in the dataframe
cancer_type_df.drop(columns= ['Gender', 'Tumour_Stage', 'Surgery_type', "Date_of_Surgery", 'Date_of_Last_Visit', 'Patient_Status'], inplace=True)
cancer_type_df.head()

Unnamed: 0_level_0,Age,Protein1,Protein2,Protein3,Protein4,Histology,ER status,PR status,HER2 status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-D8-A1XD,36.0,0.080353,0.42638,0.54715,0.27368,Infiltrating Ductal Carcinoma,Positive,Positive,Negative
TCGA-EW-A1OX,43.0,-0.42032,0.57807,0.61447,-0.031505,Mucinous Carcinoma,Positive,Positive,Negative
TCGA-A8-A079,69.0,0.21398,1.3114,-0.32747,-0.23426,Infiltrating Ductal Carcinoma,Positive,Positive,Negative
TCGA-D8-A1XR,56.0,0.34509,-0.21147,-0.19304,0.12427,Infiltrating Ductal Carcinoma,Positive,Positive,Negative
TCGA-BH-A0BF,56.0,0.22155,1.9068,0.52045,-0.31199,Infiltrating Ductal Carcinoma,Positive,Positive,Negative


In [9]:
# creating a new dataframe for the features of the analysis
cancer_feat = cancer_type_df.copy().drop(columns = ['Histology'])
cancer_feat.head()

Unnamed: 0_level_0,Age,Protein1,Protein2,Protein3,Protein4,ER status,PR status,HER2 status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TCGA-D8-A1XD,36.0,0.080353,0.42638,0.54715,0.27368,Positive,Positive,Negative
TCGA-EW-A1OX,43.0,-0.42032,0.57807,0.61447,-0.031505,Positive,Positive,Negative
TCGA-A8-A079,69.0,0.21398,1.3114,-0.32747,-0.23426,Positive,Positive,Negative
TCGA-D8-A1XR,56.0,0.34509,-0.21147,-0.19304,0.12427,Positive,Positive,Negative
TCGA-BH-A0BF,56.0,0.22155,1.9068,0.52045,-0.31199,Positive,Positive,Negative


In [10]:
# creating a list containing the column names of object data types
object_variables = list(cancer_feat.dtypes[cancer_feat.dtypes == "object"].index)
object_variables

['ER status', 'PR status', 'HER2 status']

In [11]:
# Encoding the columns of object types
encoded_data = enc.fit_transform(cancer_feat[object_variables])

In [12]:
# creating a dataframe of the encoded columns
encoded_df = pd.DataFrame(encoded_data,columns= enc.get_feature_names_out(object_variables))
encoded_df.head()

Unnamed: 0,ER status_Positive,PR status_Positive,HER2 status_Negative,HER2 status_Positive
0,1.0,1.0,1.0,0.0
1,1.0,1.0,1.0,0.0
2,1.0,1.0,1.0,0.0
3,1.0,1.0,1.0,0.0
4,1.0,1.0,1.0,0.0


In [13]:
# dropping columns in that were encoded
cancer_feat.drop(columns=object_variables, inplace=True)
cancer_feat.reset_index(inplace=True)
cancer_feat.head()

Unnamed: 0,Patient_ID,Age,Protein1,Protein2,Protein3,Protein4
0,TCGA-D8-A1XD,36.0,0.080353,0.42638,0.54715,0.27368
1,TCGA-EW-A1OX,43.0,-0.42032,0.57807,0.61447,-0.031505
2,TCGA-A8-A079,69.0,0.21398,1.3114,-0.32747,-0.23426
3,TCGA-D8-A1XR,56.0,0.34509,-0.21147,-0.19304,0.12427
4,TCGA-BH-A0BF,56.0,0.22155,1.9068,0.52045,-0.31199


In [14]:
# Creating a the features dataframe using the encoded dataframe and original feature dataframe
X = pd.concat([cancer_feat, encoded_df], axis=1).set_index('Patient_ID')
X.head()

Unnamed: 0_level_0,Age,Protein1,Protein2,Protein3,Protein4,ER status_Positive,PR status_Positive,HER2 status_Negative,HER2 status_Positive
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-D8-A1XD,36.0,0.080353,0.42638,0.54715,0.27368,1.0,1.0,1.0,0.0
TCGA-EW-A1OX,43.0,-0.42032,0.57807,0.61447,-0.031505,1.0,1.0,1.0,0.0
TCGA-A8-A079,69.0,0.21398,1.3114,-0.32747,-0.23426,1.0,1.0,1.0,0.0
TCGA-D8-A1XR,56.0,0.34509,-0.21147,-0.19304,0.12427,1.0,1.0,1.0,0.0
TCGA-BH-A0BF,56.0,0.22155,1.9068,0.52045,-0.31199,1.0,1.0,1.0,0.0


In [15]:
# Setting the target variable
y = cancer_type_df['Histology']
y.value_counts()

Infiltrating Ductal Carcinoma     224
Infiltrating Lobular Carcinoma     81
Mucinous Carcinoma                 12
Name: Histology, dtype: int64

In [16]:
# Calculating the number of output and input nodes
number_of_classes = len(list(y.unique()))
number_of_predictors = len(X.columns)
print(number_of_classes)
print(number_of_predictors)

3
9


In [17]:
# Creating a Label Encoder instance and and transforming the target into an integer
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
encoded_y

array([0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0,
       0, 2, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 2, 0, 0,
       0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,

In [18]:
# Transforming the encoded_y to binary
y_categorical = to_categorical(encoded_y, num_classes=number_of_classes)
y_categorical

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0

In [19]:
# Splitting the Features(X) and Target(y) into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, random_state = 1)

In [20]:
# Creating the Standard Scaler instance and scaling the Feature datasets
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
# Creating the Sequentail neural network instance
nn = Sequential()

In [22]:
# Creating the neural network layers
nn.add(Dense(25, input_dim = number_of_predictors, activation = 'relu'))
nn.add(Dense(25, activation = 'relu'))
nn.add(Dense(25, activation = 'tanh'))
nn.add(Dense(25, activation = 'tanh'))
nn.add(Dropout(.1))
nn.add(Dense(number_of_classes, activation='softmax'))

In [23]:
# Compiling the neural network using Categorical Crossentropy loss, the Adam optimizer, and Categorical Accuracy metric
nn.compile(loss="categorical_focal_crossentropy",
              optimizer= "adam",
              metrics=['categorical_accuracy'])

In [24]:
# Displaying the makeup of the neural network
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 25)                250       
                                                                 
 dense_1 (Dense)             (None, 25)                650       
                                                                 
 dense_2 (Dense)             (None, 25)                650       
                                                                 
 dense_3 (Dense)             (None, 25)                650       
                                                                 
 dropout (Dropout)           (None, 25)                0         
                                                                 
 dense_4 (Dense)             (None, 3)                 78        
                                                                 
Total params: 2278 (8.90 KB)
Trainable params: 2278 (8.9

In [25]:
# Training the neural network using 200 epochs
number_of_epochs = 200
nn.fit(X_train_scaled, y_train,
       epochs = number_of_epochs,
       shuffle = True)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x784b8a1295d0>

In [26]:
# Evaluating the training
nn.evaluate(X_test_scaled,y_test, verbose=2)

3/3 - 0s - loss: 0.1370 - categorical_accuracy: 0.6875 - 183ms/epoch - 61ms/step


[0.13699913024902344, 0.6875]

In [27]:
# Using the testing data to predict on the model
predictions = nn.predict(X_test)
predictions



array([[9.99649048e-01, 2.41705784e-04, 1.09330649e-04],
       [9.99476492e-01, 1.76916699e-04, 3.46428336e-04],
       [9.99517262e-01, 1.98651891e-04, 2.84022128e-04],
       [9.99715924e-01, 2.40918307e-04, 4.31969092e-05],
       [9.99609888e-01, 2.57604057e-04, 1.32498288e-04],
       [9.99596179e-01, 2.62785965e-04, 1.41076176e-04],
       [9.99692202e-01, 2.28720965e-04, 7.91013881e-05],
       [9.99616563e-01, 2.32521110e-04, 1.51008557e-04],
       [9.99692082e-01, 2.36891370e-04, 7.10082677e-05],
       [9.99680877e-01, 2.13382198e-04, 1.05761683e-04],
       [9.99425530e-01, 1.80860079e-04, 3.93544906e-04],
       [9.99481976e-01, 2.16763030e-04, 3.01207794e-04],
       [9.99577701e-01, 2.22487506e-04, 1.99816117e-04],
       [9.99621570e-01, 2.53750157e-04, 1.24794882e-04],
       [9.99563158e-01, 2.22080213e-04, 2.14688669e-04],
       [9.99687910e-01, 2.51499878e-04, 6.06539106e-05],
       [9.99794483e-01, 1.94198190e-04, 1.14558525e-05],
       [9.99678731e-01, 2.45979

In [28]:
# Get the most likely prediction for each observation
most_likely = np.argmax(predictions, axis=1)
most_likely

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
# transforming the results back to their original values
most_likely = encoder.inverse_transform((most_likely))
most_likely

array(['Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Ductal Carcinoma', 'Infiltrati

In [30]:
# Displaying count of predictions
pd.DataFrame(most_likely).value_counts()

Infiltrating Ductal Carcinoma    80
dtype: int64