In [None]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix

In [None]:
# Mount Google Drive when running on Colab. On a local machine the
# google.colab package does not exist, so the mount is skipped instead of
# aborting the run with an ImportError.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
### Set the location of the CSV file (containing the selected markers) below.

import pandas as pd

# needs to be replaced by the correct file path in your system
markers_csv_path = "/content/drive/MyDrive/Colab Notebooks/AI-Project/RF_markers.csv"
df = pd.read_csv(markers_csv_path)


In [None]:
# Order the rows by class label (ascending) and preview the first few
df_sort = df.sort_values('is_tumor')
df_sort.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,554,555,556,557,558,559,560,561,562,is_tumor
617,617,2.087917,1.615181,-0.212938,0.729409,-0.425896,-0.141797,-0.283376,-0.480681,0.39783,...,1.470483,0.245802,-0.608301,2.080585,-0.39453,1.545422,0.766162,-0.88862,-0.090073,0
211,211,0.965989,-0.021188,-0.197933,0.290594,-2.25778,-0.863016,-0.200669,-0.216094,2.252804,...,2.524498,0.512483,-0.358858,-0.458391,-0.401995,0.894254,0.777557,-1.002762,-0.12184,0
424,424,1.643422,1.655723,-0.156179,0.428711,-1.730505,-0.227755,-0.109869,0.19851,0.98731,...,2.887171,-0.419675,-0.137824,0.50771,-0.58558,1.417285,0.83044,-1.009261,-0.076982,0
425,425,1.352681,1.121116,-0.186235,0.021524,-1.786268,-0.086578,-0.099817,-0.243121,0.25377,...,3.44531,-2.115348,-0.279883,0.141134,-0.607837,1.424497,0.797358,-0.94378,-0.042684,0
426,426,-0.015998,-0.075882,-0.245968,-1.294631,-1.31551,-0.053413,-0.389943,-0.009704,-0.13028,...,1.681226,-0.613335,0.468243,0.023489,-0.594191,1.204425,0.800921,-1.022656,-0.161653,0


In [None]:
# Partition the label-sorted frame into a negative and a positive DataFrame
tumor_counts = df_sort['is_tumor'].value_counts()
print("Tumor Counts:\n", tumor_counts)

# The frame is sorted by label, so the first `negative_total` rows are the
# label-0 rows; the remaining rows form the positive frame.
negative_total = tumor_counts[0]
df_neg = df_sort.iloc[:negative_total]
df_pos = df_sort.iloc[negative_total:]

for class_frame in (df_neg, df_pos):
    print(class_frame.shape)

Tumor Counts:
 0    309
1    309
Name: is_tumor, dtype: int64
(309, 565)
(309, 565)


In [None]:
# Randomly shuffle each class frame (frac=1 keeps every row).
# Fixed seeds make the shuffle - and therefore which rows end up in the
# testing slice taken from the head of each frame - repeatable between runs.
# The two frames get different seeds so they are not permuted identically.
SHUFFLE_SEED = 42
df_neg = df_neg.sample(frac=1, random_state=SHUFFLE_SEED)
df_pos = df_pos.sample(frac=1, random_state=SHUFFLE_SEED + 1)

In [None]:
# Trim the larger class so both frames hold the same number of rows;
# the surplus rows (if any) are set aside in df_extra, otherwise it stays None.
neg_total, pos_total = tumor_counts[0], tumor_counts[1]
df_extra = None
if neg_total > pos_total:
    df_extra, df_neg = df_neg.iloc[pos_total:], df_neg.iloc[:pos_total]
elif pos_total > neg_total:
    df_extra, df_pos = df_pos.iloc[neg_total:], df_pos.iloc[:neg_total]

In [None]:
# Recombine positive and negative dataframes and split into testing and training dataframes
testing_split = 0.3
# Size the per-class testing slice from the equalized frames rather than from
# the raw count of class 1: when class 1 is the majority its raw count would
# overstate the slice and starve the training set.
per_class_size = min(len(df_neg), len(df_pos))
total_testing_dataset = int(per_class_size * testing_split)

df_testing = pd.concat([df_neg.iloc[:total_testing_dataset], df_pos.iloc[:total_testing_dataset]])
# Add extra records to testing dataset.
# A bare `if df_extra:` raises "The truth value of a DataFrame is ambiguous"
# as soon as df_extra actually holds a DataFrame, so test against None.
if df_extra is not None:
    df_testing = pd.concat([df_testing, df_extra])
# reset_index() deliberately keeps the old index as an 'index' column;
# the feature/label cell drops that column by name.
df_testing = df_testing.sample(frac=1).reset_index()

df_training = pd.concat([df_neg.iloc[total_testing_dataset:], df_pos.iloc[total_testing_dataset:]])
df_training = df_training.sample(frac=1).reset_index()


In [None]:
# Split label from features and drop superfluous attributes

LABEL_COLUMN = 'is_tumor'
# Columns that must be removed before the frame is used as model input;
# a missing one raises KeyError, exactly as before.
NON_FEATURE_COLUMNS = ['index', 'Unnamed: 0', LABEL_COLUMN]
# Column that may or may not be present; dropped only when it exists.
OPTIONAL_COLUMNS = ['Donor_Sample']


def split_features_and_label(frame):
    """Separate the label column from the feature columns of ``frame``.

    Args:
        frame: DataFrame containing the label column, the bookkeeping columns
            listed in NON_FEATURE_COLUMNS and the feature columns.

    Returns:
        A ``(features, labels)`` tuple: the frame without the non-feature
        columns, and a copy of the label column.

    Raises:
        KeyError: if the label column or a required non-feature column is missing.
    """
    labels = frame[LABEL_COLUMN].copy()
    features = (
        frame
        .drop(columns=NON_FEATURE_COLUMNS)
        # errors='ignore' replaces the former try/except KeyError fallback
        .drop(columns=OPTIONAL_COLUMNS, errors='ignore')
    )
    return features, labels


df_trainingX, df_trainingY = split_features_and_label(df_training)
df_testingX, df_testingY = split_features_and_label(df_testing)

print("-------")
print(df_trainingY.head())
print("-----")
print(df_trainingX.head())
print("-----")
print(df_testingY.head())
print("-----")
print(df_testingX.head())
print("-------")


-------
0    1
1    1
2    1
3    1
4    0
Name: is_tumor, dtype: int64
-----
          0         1         2         3         4         5         6  \
0 -0.541180 -0.748827 -0.236925  0.073502  0.002885  0.602964  1.048108   
1 -1.534798 -0.501645 -0.215783 -0.323012  0.188556  0.319748  0.308429   
2 -1.445897 -0.588148 -0.144208 -0.645639 -0.103541  0.246509  1.170128   
3  0.251015  0.679391 -0.312610  0.308737  0.362620 -0.188829 -1.380069   
4  1.643422  1.655723 -0.156179  0.428711 -1.730505 -0.227755 -0.109869   

          7         8         9  ...       553       554       555       556  \
0 -0.040787  2.181503  0.145418  ...  2.012345 -0.127225 -1.445843 -0.296072   
1  0.016857 -0.332358  0.219177  ...  2.350636 -0.741703 -0.549907 -0.563960   
2  0.111138 -0.138447  0.257094  ...  0.835243 -0.070278 -0.498214 -0.273051   
3  1.682026  2.599195  0.467940  ... -1.017155  1.912749  1.021623  1.508217   
4  0.198510  0.987310 -2.482858  ... -0.667000  2.887171 -0.419675 -0.1

In [None]:
### Building Sequential Model

# Total no. of features
total_features = len(df_trainingX.columns)

# Dimensionality of output space of layers: one entry per Dense layer, so any
# of the alternatives below works regardless of how many entries it has.
DIMENSIONS = [10, 20, 30, 20]
# DIMENSIONS = [50, 20, 10, 5]
# DIMENSIONS = [30, 10, 15]

# Dropout rate applied after every Dense layer
DROPOUT_RATE = 0.25

model = Sequential()

# Input layer followed by the hidden layers. Only the first Dense layer is
# given the input width; each Dense layer is followed by a Dropout layer.
for layer_index, units in enumerate(DIMENSIONS):
    if layer_index == 0:
        model.add(Dense(units, input_dim=total_features, activation="relu"))
    else:
        model.add(Dense(units, activation="relu"))
    model.add(Dropout(DROPOUT_RATE))

### Output Layer
# Using sigmoid activation function for binary classification
model.add(Dense(1, activation="sigmoid"))

### Control weight assignments via optimizers
opt = Adam(learning_rate=0.05)
# opt = SGD(learning_rate=0.01, momentum=0.9)

model.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])
# model.compile(loss='hinge', optimizer=opt, metrics=['accuracy'])
# model.compile(loss='squared_hinge', optimizer=opt, metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                5640      
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 20)                220       
                                                                 
 dropout_1 (Dropout)         (None, 20)                0         
                                                                 
 dense_2 (Dense)             (None, 30)                630       
                                                                 
 dropout_2 (Dropout)         (None, 30)                0         
                                                                 
 dense_3 (Dense)             (None, 20)                6

In [None]:
# Training callbacks: early stopping and learning-rate reduction

# Early stopping with a patience of 20 epochs (monitored quantity left at the Keras default)
earlystop = EarlyStopping(patience=20)

# Learning-rate reduction driven by validation accuracy: patience of 5 epochs,
# multiplicative factor 0.5, lower bound 0.00001
lr_reduction_options = dict(
    monitor="val_accuracy",
    patience=5,
    verbose=1,
    factor=0.5,
    min_lr=0.00001,
)
learning_rate_reduction = ReduceLROnPlateau(**lr_reduction_options)

callback = [earlystop, learning_rate_reduction]

In [None]:
# Train the model, report the elapsed wall-clock time and save the weights
OUTPUT_NAME = "only_RF_markers"
epochs = 10

# 30% of the training rows are held out by Keras for validation
fit_options = dict(
    x=df_trainingX,
    y=df_trainingY,
    epochs=epochs,
    validation_split=0.3,
    callbacks=callback,
    verbose=1,
)

start = time.time()
history = model.fit(**fit_options)
stop = time.time()
print(f"Training time: {stop - start}s")

# needs to be replaced by the correct file path in your system
weights_path = "/content/drive/MyDrive/Colab Notebooks/AI-Project/dl/" + OUTPUT_NAME + ".h5"
model.save_weights(weights_path)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training time: 2.9272773265838623s


In [None]:
# Write the per-epoch training history to a CSV file
history_file = "history_" + OUTPUT_NAME + ".csv"

# Collect the tracked metrics as columns, then transpose so that each metric
# becomes a row and each epoch a column.
tracked_metrics = ['loss', 'val_loss', 'accuracy', 'val_accuracy']
df_history = pd.DataFrame(
    {metric: history.history[metric] for metric in tracked_metrics}
).transpose()

# needs to be replaced by the correct file path in your system
history_path = "/content/drive/MyDrive/Colab Notebooks/AI-Project/dl/" + history_file
df_history.to_csv(history_path)

In [None]:
# Turn the model's sigmoid outputs into hard 0/1 labels using a 0.5 threshold
predicted_scores = model.predict(df_testingX)
predict_classes = (predicted_scores > 0.5).astype("int32")



In [None]:
# Save statistical data as CSV file
statistics_file = "statistics_" + OUTPUT_NAME + ".csv"

# ROC AUC is a ranking metric and needs the continuous sigmoid scores.
# Given the thresholded 0/1 labels it degenerates to a single operating point
# and merely repeats the balanced accuracy, so the scores are recomputed here.
predicted_scores = model.predict(df_testingX).ravel()

# Threshold-based metrics use the hard class labels
accuracy = accuracy_score(df_testingY, predict_classes)
precision = precision_score(df_testingY, predict_classes)
recall = recall_score(df_testingY, predict_classes)
f1 = f1_score(df_testingY, predict_classes)
cohen_kappa = cohen_kappa_score(df_testingY, predict_classes)
roc_auc = roc_auc_score(df_testingY, predicted_scores)

# Single-row table: one column per metric
df_stats = pd.DataFrame({
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1],
    'Cohen Kappa': [cohen_kappa],
    'ROC AUC': [roc_auc],
})
df_stats.to_csv("/content/drive/MyDrive/Colab Notebooks/AI-Project/dl/" + statistics_file) # needs to be replaced by the correct file path in your system
df_stats

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Cohen Kappa,ROC AUC
0,0.951087,1.0,0.902174,0.948571,0.902174,0.951087
