# Imports

In [1]:
# imports
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder




# Data Collection

In [2]:
# read in csv
bc_data = pd.read_csv('BRCA.csv')
bc_data

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.273680,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.420320,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.213980,1.31140,-0.32747,-0.234260,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.345090,-0.21147,-0.19304,0.124270,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.221550,1.90680,0.52045,-0.311990,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,,,,,,,,,,,,,,,,
337,,,,,,,,,,,,,,,,
338,,,,,,,,,,,,,,,,
339,,,,,,,,,,,,,,,,


# Data Processing

In [3]:
#check NaN counts
bc_data.isna().sum()

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64

In [4]:
#drop NaNs
bc_data.dropna(inplace=True)
bc_data.tail()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
329,TCGA-AN-A04A,36.0,FEMALE,0.2318,0.61804,-0.55779,-0.51735,III,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Simple Mastectomy,11-Nov-19,09-Feb-20,Dead
330,TCGA-A8-A085,44.0,MALE,0.73272,1.1117,-0.26952,-0.35492,II,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Other,01-Nov-19,04-Mar-20,Dead
331,TCGA-A1-A0SG,61.0,FEMALE,-0.71947,2.5485,-0.15024,0.33968,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,11-Nov-19,18-Jan-21,Dead
332,TCGA-A2-A0EU,79.0,FEMALE,0.4794,2.0559,-0.53136,-0.18848,I,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,21-Nov-19,19-Feb-21,Dead
333,TCGA-B6-A40B,76.0,FEMALE,-0.24427,0.92556,-0.41823,-0.067848,I,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,11-Nov-19,05-Jan-21,Dead


In [5]:
#drop rows that will not help us determine cancer type and 'surgery type' because we are trying to figure out the type of cancer pre surgery.

bc_data_adjusted = bc_data.drop(columns = ['Patient_ID', 'Date_of_Surgery', 'Date_of_Last_Visit', 'Patient_Status', 'Surgery_type'])
bc_data_adjusted.head()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status
0,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative
1,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative
2,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative
3,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative
4,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative


In [6]:
#Used the following source: https://www.geeksforgeeks.org/replace-characters-in-strings-in-pandas-dataframe/
#Change the Histology column to be numbers so that we can use it for y as one column.
bc_data_adjusted['Histology'] = bc_data_adjusted['Histology'].str.replace('Infiltrating Ductal Carcinoma', '0.0')
bc_data_adjusted['Histology'] = bc_data_adjusted['Histology'].str.replace('Mucinous Carcinoma', '1.0')
bc_data_adjusted['Histology'] = bc_data_adjusted['Histology'].str.replace('Infiltrating Lobular Carcinoma', '2.0')

bc_data_adjusted.head()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status
0,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,0.0,Positive,Positive,Negative
1,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,1.0,Positive,Positive,Negative
2,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,0.0,Positive,Positive,Negative
3,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,0.0,Positive,Positive,Negative
4,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,0.0,Positive,Positive,Negative


In [7]:
#convert Histology from object to int
bc_data_adjusted['Histology'] = bc_data_adjusted['Histology'].astype(float)

In [8]:
#check the data types
bc_data_adjusted.dtypes

Age             float64
Gender           object
Protein1        float64
Protein2        float64
Protein3        float64
Protein4        float64
Tumour_Stage     object
Histology       float64
ER status        object
PR status        object
HER2 status      object
dtype: object

In [9]:
# Create a list of categorical variables 
categorical_variables = list(bc_data_adjusted.dtypes[bc_data_adjusted.dtypes == "object"].index)

# Display the categorical variables list
categorical_variables

['Gender', 'Tumour_Stage', 'ER status', 'PR status', 'HER2 status']

In [10]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Encode categorcal variables using OneHotEncoder
encoded_data = enc.fit_transform(bc_data_adjusted[categorical_variables])



In [11]:
# Create a DataFrame with the encoded variables
# The column names should match those of the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names_out(categorical_variables)
)

# Display the DataFrame
encoded_df.head()

Unnamed: 0,Gender_FEMALE,Gender_MALE,Tumour_Stage_I,Tumour_Stage_II,Tumour_Stage_III,ER status_Positive,PR status_Positive,HER2 status_Negative,HER2 status_Positive
0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0


In [12]:
# Create a DataFrame with the columnns containing numerical variables from the original dataset
numerical_variables_df = bc_data_adjusted.drop(columns = categorical_variables)

# Review the DataFrame
numerical_variables_df.head()

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4,Histology
0,36.0,0.080353,0.42638,0.54715,0.27368,0.0
1,43.0,-0.42032,0.57807,0.61447,-0.031505,1.0
2,69.0,0.21398,1.3114,-0.32747,-0.23426,0.0
3,56.0,0.34509,-0.21147,-0.19304,0.12427,0.0
4,56.0,0.22155,1.9068,0.52045,-0.31199,0.0


In [13]:
# Using the Pandas concat function, combine the DataFrames the contain the encoded categorical data and the numerical data
final_df = pd.concat(
    [
        numerical_variables_df,
        encoded_df
    ],
    axis=1
)

# Reveiw the DataFrame
final_df.head()

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4,Histology,Gender_FEMALE,Gender_MALE,Tumour_Stage_I,Tumour_Stage_II,Tumour_Stage_III,ER status_Positive,PR status_Positive,HER2 status_Negative,HER2 status_Positive
0,36.0,0.080353,0.42638,0.54715,0.27368,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
1,43.0,-0.42032,0.57807,0.61447,-0.031505,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
2,69.0,0.21398,1.3114,-0.32747,-0.23426,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
3,56.0,0.34509,-0.21147,-0.19304,0.12427,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
4,56.0,0.22155,1.9068,0.52045,-0.31199,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0


# Separate and Scale Data

In [14]:
#define the target set
y = final_df['Histology']
#define the features set
X = final_df.drop(columns = ['Histology'])

In [15]:
# Split the data into training and testing datasets
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create / Deploy the Model

In [17]:
# Define the the number of inputs (features) to the model
number_input_features = len(X_train.iloc[0])

# Define the number of hidden nodes for the first hidden layer
# Use the mean of the number of input features plus the number of output nurons
# Use the Python floor division (//) to return the quotent
hidden_nodes_layer1 =  (number_input_features + 1) // 2 

# Define the number of hidden nodes for the second hidden layer
# Use the mean of the number of hidden nodes in the first hidden layer plus the number of output nurons
# Use the Python floor division (//) to return the quotent
hidden_nodes_layer2 = (hidden_nodes_layer1 + 1) // 2

# Define the number of hidden nodes for the third hidden layer
hidden_nodes_layer3 = (hidden_nodes_layer2 + 1) // 2

# Define the number of hidden nodes for the fourth hidden layer
hidden_nodes_layer4 = (hidden_nodes_layer3 + 1) // 2

# Create the Sequential model instance
nn = Sequential()

# Add the first hidden layer specifying the number of inputs, the number of hidden nodes, and the activation function
nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Add the second hidden layer specifying the number of hidden nodes and the activation function
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

# Add the third hidden layer specifying the number of hidden nodes and the activation function
nn.add(Dense(units=hidden_nodes_layer3, activation="relu"))

# Add the fourth hidden layer specifying the number of hidden nodes and the activation function
nn.add(Dense(units=hidden_nodes_layer4, activation="relu"))

# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=1, activation="relu"))




In [18]:
# Display the Sequential model summary
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 7)                 105       
                                                                 
 dense_1 (Dense)             (None, 4)                 32        
                                                                 
 dense_2 (Dense)             (None, 2)                 10        
                                                                 
 dense_3 (Dense)             (None, 1)                 3         
                                                                 
 dense_4 (Dense)             (None, 1)                 2         
                                                                 
Total params: 152 (608.00 Byte)
Trainable params: 152 (608.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
# Compile the Sequential model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])




In [20]:
# Fit the model using 100 epochs and the training data
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Evaluate the Model

In [21]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: nan - accuracy: 0.6429 - 305ms/epoch - 102ms/step
Loss: nan, Accuracy: 0.6428571343421936


In [29]:
y_test.sample(78)

275    0.0
53     0.0
67     0.0
61     0.0
162    0.0
      ... 
87     1.0
115    2.0
194    0.0
6      0.0
270    0.0
Name: Histology, Length: 78, dtype: float64

In [25]:
y_test.value_counts() 

Histology
0.0    54
2.0    20
1.0     4
Name: count, dtype: int64

In [31]:
display(y_test)

61     0.0
281    2.0
186    0.0
200    2.0
300    0.0
      ... 
0      0.0
236    0.0
216    0.0
64     2.0
240    0.0
Name: Histology, Length: 84, dtype: float64

In [32]:
y_test.count()

78

In [33]:
y_test.isna().sum()

6