# Artificial neural nework training
Following parameters described in Evo2 preprint (section 4.3.16 BRCA supervised classiication).

* Training method: input vector derived from concatenation of the Reference (8192 elements) and Variant vectors (8192 elements).

* Training samples: 4800 by 16384 embeddings, one binary output (0 or 1)
	* 2400 items are labelled 0, which have Variant vector  0.1%, 0.2% or 0.3% of elements different from Reference vector.
    * 2400 items are labelled 1, which have Variant vector 2%, 3% or 4% of elements different from Reference vector.

* Testing samples: 600 by 16384, one binary output (0 or 1)
    * 300 items are labelled 0, which have Variant vector 0.1%, 0.2% or 0.3% of elements different from Reference vector.
    * 300 items are labelled 1, which have Variant vector 2%, 3% or 4% of elements different from Reference vector.



In [None]:
import os
import numpy as np
import pandas as pd
import random
import time
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.layers import Dropout

In [None]:
# READ IN CSV
DIR = "/mnt/nfs/rigenenfs/shared_resources/biobanks/UKBIOBANK/pangk/evo2/NN"
out_train = os.path.join(DIR, "train_vectors2.csv")
out_test = os.path.join(DIR, "test_vectors2.csv")

new_train_values = np.loadtxt(out_train,delimiter=",")
print(new_train_values.shape)
new_train = new_train_values.reshape((4800,16385))
print('new_train.shape = ',new_train.shape) # new_train.shape =  (4800, 16385)

new_test_values = np.loadtxt(out_test,delimiter=",")
print(new_test_values.shape)
new_test = new_test_values.reshape((600,16385))
print('new_test.shape = ',new_test.shape) # new_test.shape =  (600, 16385)

print(new_train[:5,:5])  # Display first 5 rows and columns
print(new_test[:5,:5])  # Display first 5 rows and columns

(4800, 16385)
new_train.shape =  (4800, 16385)
(600, 16385)
new_test.shape =  (600, 16385)


In [None]:
# 16384 inputs; one binary output; 1500 rows/training cases
# 8192*0.04 = 327.8 elements ----> multiply by 1.4
# take absolute difference of the two vectors, sum it up, then divide by a constant to scale it down to between 0 and 1
# input dimension: 16,385 

train_x = new_train[:,0:16384] 
train_y = new_train[:,16384]
print(train_x.shape) # (4800, 16384)
print(train_y.shape) # (4800,)

df_y = pd.DataFrame({
    "output_vector": train_y
})
print(df_y.shape) # (4800, 1)
df_y.head()

#===================
test_x = new_test[:,0:16384] 
test_y = new_test[:,16384]
print(test_x.shape) # (600, 16384)
print(test_y.shape) # (600,)

# -------------- Scaling ------------------
train_x = train_x*100 # scaling the elements 
print(np.max(train_x))
test_x = test_x*100 # scaling the elements
print(np.max(test_x))

(4800, 16384)
(4800,)
(4800, 1)
(600, 16384)
(600,)


In [None]:
from keras.losses import binary_crossentropy

# Compute Binary Cross-Entropy using NumPy, closer to 0 is better, 0 means the predicted vector and test vector is exactly the same
def binary_cross_entropy_np(y_true, y_pred):
    """
    Calculates Binary Cross-Entropy loss for multiple samples using NumPy.
    y_true: NumPy array of actual labels (0s and 1s)
    y_pred: NumPy array of predicted probabilities (between 0 and 1)
    """
    epsilon = 1e-15  # Small value to prevent log(0), reference vector (0 or 1) needs to compare with positive. Ensure the probability is within 0 or 1 (feasible calc.)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip probabilities
    loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

# Compute Binary Cross-Entropy using own calculaton for checking
def binary_cross_entropy_check(y_true, y_pred):
    """
    Objective: Calculates Binary Cross-Entropy loss for multiple samples using NumPy.
    y_true: NumPy array of actual labels (0s and 1s)
    y_pred: NumPy array of predicted probabilities (between 0 and 1)
    """
    epsilon = 1e-15  # Small value to prevent log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip probabilities
    N=len(y_true)
    BCE_sum=0
    for i in range(N): # loop through each element 
        BCE_sum = BCE_sum + y_true[i]*np.log(y_pred[i]) + (1-y_true[i])*np.log((1-y_pred[i])) 
    BCE = (-1*BCE_sum)/N
    return BCE

## Start ANN training
The results from the previous ANN using MLPRegressor is very poor !

In [None]:
def baseline_model(): # 3 hiddenlayers, 512, 128, 32 hidden units (neurons); 8389120   parameters
    model = Sequential()
    model.add(Dense(512, activation='relu', input_dim = 16384)) # hidden layer 1 (512 neurons)
    model.add(BatchNormalization()) # 
    model.add(Dropout(0.3)) #
    model.add(Dense(128, activation='relu')) # hidden layer 2
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu')) # hidden layer 3
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid')) # output layer for binary classification | linear alternatre
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    return model

def baseline_model2(): # 2 hidden layers, 32 hidden units --> perhaps helps reduce overfitting and increase model performance?
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim = 16384)) # 32 num parameters
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(8, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid')) # output layer for binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
print('train_x.shape is ', train_x.shape) # (4800, 16384)

# Build model
ANN_model = baseline_model() # (16384 * 512) + 512 = 8,389,120 parameters
ANN_model.summary()

# Dropout step has NO "parameters". dropout(0.3) means 30% of the neurons are randomly set to 0 during training, not used in inference. Drop 30% of the interconnecting links/weights.
# A technique to help prevent overfitting or memorizing the training data. Can experiment with this parameter.

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 512)               8389120   
_________________________________________________________________
batch_normalization_12 (Batc (None, 512)               2048      
_________________________________________________________________
dropout_8 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 128)               65664     
_________________________________________________________________
batch_normalization_13 (Batc (None, 128)               512       
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 32)               

In [None]:
# Training
start_time = time.time()
history = ANN_model.fit(train_x, train_y, epochs=50, batch_size=64,validation_split=0.1,verbose = 2)
end_time = time.time()
exe_time = end_time - start_time
print("Execution time: ", exe_time)
scores = ANN_model.evaluate(test_x,test_y,verbose = 2)

Epoch 1/50
68/68 - 3s - loss: 0.8261 - accuracy: 0.5227 - val_loss: 0.7882 - val_accuracy: 0.0000e+00
Epoch 2/50
68/68 - 2s - loss: 0.4402 - accuracy: 0.8169 - val_loss: 0.9573 - val_accuracy: 0.0000e+00
Epoch 3/50
68/68 - 2s - loss: 0.2257 - accuracy: 0.9370 - val_loss: 1.6718 - val_accuracy: 0.0000e+00
Epoch 4/50
68/68 - 2s - loss: 0.1229 - accuracy: 0.9720 - val_loss: 2.5937 - val_accuracy: 0.0000e+00
Epoch 5/50
68/68 - 2s - loss: 0.0788 - accuracy: 0.9801 - val_loss: 3.6584 - val_accuracy: 0.0000e+00
Epoch 6/50
68/68 - 2s - loss: 0.0533 - accuracy: 0.9900 - val_loss: 3.9080 - val_accuracy: 0.0000e+00
Epoch 7/50
68/68 - 2s - loss: 0.0352 - accuracy: 0.9944 - val_loss: 4.4451 - val_accuracy: 0.0000e+00
Epoch 8/50
68/68 - 2s - loss: 0.0298 - accuracy: 0.9951 - val_loss: 4.3574 - val_accuracy: 0.0083
Epoch 9/50
68/68 - 2s - loss: 0.0295 - accuracy: 0.9931 - val_loss: 3.7646 - val_accuracy: 0.1000
Epoch 10/50
68/68 - 2s - loss: 0.0207 - accuracy: 0.9961 - val_loss: 4.1954 - val_accuracy

In [None]:
# Training set (4,800 samples) accuracy is 0.9996, loss is 0.0004
# Validation set (480 samples) accuracy is 0.4783, loss is 3.485

# To get the results of the ANN using test dataset
y_pred = ANN_model.predict(test_x) # 600 cases

epsilon = 1e-15  # Small value to prevent log(0)
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip probabilities
print(test_x.shape) # (600, 16384)
print(test_y.shape) # (600,)
print(y_pred.shape)

total_loss = binary_cross_entropy_check(test_y, y_pred)
print(f"** CHECK Average BCE Loss for multiple samples: {total_loss}") # NaN 

# To check on accuracy
# first, convert the elements in y_pred so that negative becones epsilon, largest is 1 - epsilon
epsilon = 1e-15  # Small value to prevent log(0)
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip probabilities
N=len(test_y) # 600 
threshold = 0.5 # for calculating accuracy
y_pred_binary = (y_pred >= threshold).astype(int)

accuracy = accuracy_score(test_y,y_pred_binary)
print("Accuracy Score:",accuracy)

########################################################
#  Count error cases (based on the 600 cases in the test set)
########################################################
test_y_binary = test_y
error0to1 = 0  # test_y ground truth is 0; but predicted as 1
error1to0 = 0  # test_y is 1; but predicted as 0
for i in range(N):
    if (test_y_binary[i] == 0 and y_pred_binary[i] == 1):
        error0to1 += 1
    if (test_y_binary[i] == 1 and y_pred_binary[i] == 0):
        error1to0 += 1
print('error0to1 = ',error0to1, '; ground truth label is 0')
print('error1to0 = ',error1to0, '; ground truth label is 1')
print('total error = ',error0to1+error1to0,' (', 100*(error0to1+error1to0)/N, '%)')
print('600 test cases: 300 label 0; 300 label 1')

(600, 16384)
(600,)
(600, 1)
** CHECK Average BCE Loss for multiple samples: [nan]
Accuracy Score: 0.47833333333333333
error0to1 =  143 ; label is 0
error1to0 =  170 ; label is 1
total error =  313 percentError =  52.166666666666664
600 test cases: 300 label 0; 300 label 1




In [None]:
# To get the results of the ANN using train dataset
y_pred = ANN_model.predict(train_x)
print(train_x.shape)
print(train_y.shape)
print(y_pred.shape)
total_loss = binary_cross_entropy_check(train_y, y_pred)
print(f"** CHECK Average BCE Loss for multiple samples: {total_loss}")

##########################
# Check accuracy
##########################
# convert the elements in y_pred so that negative becones epsilon, largest is 1 - epsilon
epsilon = 1e-15  # Small value to prevent log(0)
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip probabilities
N = len(train_y)
threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)
accuracy = accuracy_score(train_y,y_pred_binary)
print("Accuracy Score:",accuracy)

##########################
# Count error cases:
##########################
train_y_binary = train_y
error0to1 = 0  # test_y is 0
error1to0 = 0  # test_y is 1
for i in range(N):
    if (train_y_binary[i] == 0 and y_pred_binary[i] == 1):
        error0to1 += 1
    if (train_y_binary[i] == 1 and y_pred_binary[i] == 0):
        error1to0 += 1
print('error0to1 = ',error0to1, '; label is 0')
print('error1to0 = ',error1to0, '; label is 1')
print('total error = ',error0to1+error1to0,' (', 100*(error0to1+error1to0)/N, '%)')
print('train cases: ', len(train_y))

(4800, 16384)
(4800,)
(4800, 1)
** CHECK Average BCE Loss for multiple samples: [nan]
Accuracy Score: 0.9404166666666667
error0to1 =  0 ; label is 0
error1to0 =  286 ; label is 1
total error =  286 percentError =  5.958333333333333
train cases:  4800




In [None]:
print(np.max(train_x)) #0.01398 (0.01 * 1.4) largest value 
print(np.min(train_x)) 

0.01398


In [None]:
# NO USE BELOW

#Define ANOTHER ANN model
input_dim = 16384
ANNmodel2 = keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=(1500,input_dim)),
    layers.Dense(128, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid') # Output layer for binary classification
])
# compile with cross entropy
ANNmodel2.compile(optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])

In [None]:
# training
start_time = time.time()
history = ANNmodel2.fit(x_train, y_train, epochs=30, batch_size=32,verbose = 2)
end_time = time.time()
exe_time = end_time - start_time
print("Execution time: ", exe_time)

In [None]:
scores = ANNmodel2.evaluate(test_x,test_y,verbose = 2)

In [55]:
# test code
import numpy as np
from keras.losses import binary_crossentropy
import keras.backend as K

# Example true labels and predicted probabilities
y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0.1, 0.9, 0.8, 0.2, 0.7])

# Compute Binary Cross-Entropy using NumPy
def binary_cross_entropy(y_true, y_pred):
    bce = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return bce

bce_loss = binary_cross_entropy(y_true, y_pred)
print(f"***Binary Cross-Entropy Loss (function): {bce_loss}")

#===========================================================================
def binary_cross_entropy_np(y_true, y_pred):
    """
    Calculates Binary Cross-Entropy loss for multiple samples using NumPy.
    y_true: NumPy array of actual labels (0s and 1s)
    y_pred: NumPy array of predicted probabilities (between 0 and 1)
    """
    epsilon = 1e-15  # Small value to prevent log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # Clip probabilities
    loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

total_loss = binary_cross_entropy_np(y_true, y_pred)
print(f"--Average BCE Loss for multiple samples: {total_loss}")
#--------------------------------------------------------------------------

# Compute Binary Cross-Entropy using Keras
# DOES NOT WORK
# bce_loss_keras = binary_crossentropy(K.constant(y_true), K.constant(y_pred)).numpy()
# print(f"Binary Cross-Entropy Loss (Keras): {bce_loss_keras}")

***Binary Cross-Entropy Loss (function): 0.20273661557656092
--Average BCE Loss for multiple samples: 0.20273661557656092


In [84]:
aaa = np.array([0, 1, 1, -2.5, 1 , 3.4])
bbb = np.array([0.1, 0.9, 0.8, 0.2, 0.7])

print(max(aaa))
print(min(aaa))

3.4
-2.5
