In [369]:
# ANN Training
import numpy as np
import pandas as pd
import random
import copy
import time
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.layers import Dropout


In [None]:
from keras.losses import binary_crossentropy

def binary_cross_entropy_np(y_true, y_pred):
    """
    Mean binary cross-entropy over all samples, computed with vectorized NumPy.

    y_true: NumPy array of actual labels (0s and 1s)
    y_pred: NumPy array of predicted probabilities; values are clipped to
            [1e-15, 1 - 1e-15] before the logarithms so log(0) cannot occur
    Returns the average loss over every element.
    """
    tiny = 1e-15  # guard value that keeps the logs finite
    probs = np.clip(y_pred, tiny, 1 - tiny)
    positive_part = y_true * np.log(probs)            # contributes when label is 1
    negative_part = (1 - y_true) * np.log(1 - probs)  # contributes when label is 0
    return -np.mean(positive_part + negative_part)

# Compute Binary Cross-Entropy with an explicit per-sample loop, as a cross-check of the vectorized version
def binary_cross_entropy_check(y_true, y_pred):
    """
    Loop-based binary cross-entropy, kept as an independent check of the
    vectorized implementation.

    y_true: NumPy array of actual labels (0s and 1s)
    y_pred: NumPy array of predicted probabilities; clipped to
            [1e-15, 1 - 1e-15] before the logarithms
    Returns the summed per-sample loss divided by len(y_true). Samples are
    read with y_true[i] / y_pred[i], so for column-vector inputs of shape
    (N, 1) the result is a length-1 array rather than a plain scalar.
    """
    tiny = 1e-15  # guard value that keeps the logs finite
    y_pred = np.clip(y_pred, tiny, 1 - tiny)
    n_samples = len(y_true)
    running = 0
    for idx in range(n_samples):
        label = y_true[idx]
        prob = y_pred[idx]
        term_pos = label * np.log(prob)
        term_neg = (1 - label) * np.log((1 - prob))
        running = running + term_pos + term_neg
    return (-1 * running) / n_samples

## Create input dataset

`generate_dataset_new2()` is the main function. Each input sample is a 1x2000 row made of two 1x1000 halves: [ ref vector  variant vector ]

The training dataset (balanced) contains, for each reference vector:
*  [one_ref_vector  variant_vector_1 (100 locations modified, i.e. 10%)]  label 1: 10 cases
*  [one_ref_vector  variant_vector_2 (20 locations modified, i.e. 2%)]    label 0: 5 cases
*  [one_ref_vector  variant_vector_3 similar to one_ref_vector]  label 0: 5 cases (similar means the whole vector is scaled by a factor within 1% of 1)

Benign variants are designed to have changes in 2% of the elements (label 0). Pathogenic variants have a higher frequency of alterations: 10% of the elements (label 1).

In [None]:
# Ref vector is 1 x size_ref_vector; variant vector is 1 x size_ref_vector
def _make_variant(ref_vector, positions, signs, scale_bounds, element_bounds):
    """Return a variant of ref_vector: rescaled as a whole, then perturbed at `positions`."""
    # The multiplication already produces a new array, so ref_vector itself
    # is never modified and no explicit copy is needed.
    variant = ref_vector * random.uniform(*scale_bounds)
    for counter, j in enumerate(positions):
        kk = random.uniform(*element_bounds)
        # Perturbed elements are rebuilt from the *reference* value (not the
        # rescaled one); if a position repeats, the last perturbation wins.
        variant[j] = ref_vector[j] * kk * signs[counter]
    return variant


def generate_dataset_new2(num_items, size_ref_vector, label_1_matrix, label_0_matrix,
                          label_1_sign, label_0_sign,
                          n_label1_cases=10, n_label0_cases=5, n_similar_cases=5):
    """
    Build a synthetic [reference | variant] dataset with binary labels.

    For each of `num_items` random reference vectors (uniform in [0.01, 0.6])
    the following rows are produced, in this order:
      * n_label1_cases rows, label 1: variant perturbed at the positions in
        label_1_matrix[icase] with the signs in label_1_sign[icase]
      * n_label0_cases rows, label 0: variant perturbed at the positions in
        label_0_matrix[icase] with the signs in label_0_sign[icase]
      * n_similar_cases rows, label 0: variant is only the rescaled reference

    Every variant starts as the reference multiplied by one factor drawn from
    [0.99, 1.01]; each perturbed element is then set to
    ref * factor * sign with factor drawn from [1.2, 1.5].

    Parameters
    ----------
    num_items : int
        Number of reference vectors (sets) to generate.
    size_ref_vector : int
        Length of the reference vector (and of the variant vector).
    label_1_matrix, label_0_matrix : 2-D array-like of int
        Row `icase` holds the element positions to perturb for that case.
        At least n_label1_cases / n_label0_cases rows are required.
    label_1_sign, label_0_sign : 2-D array-like
        Same shape as the matching position matrix; multiplier applied to
        each perturbed element.
    n_label1_cases, n_label0_cases, n_similar_cases : int, optional
        Rows generated per reference vector for each group. The defaults
        (10, 5, 5) give 20 rows per reference vector.

    Returns
    -------
    train_x : ndarray, shape (rows, 2*size_ref_vector), float
        Each row is the reference vector followed by its variant.
    train_y : ndarray, shape (rows, 1), float
        1 for the first group, 0 for the other two.
    """
    # "similar" variant: whole vector multiplied by a factor in [0.99, 1.01]
    scale_bounds = (0.99, 1.01)
    # perturbed elements: multiplied by a factor in [1.2, 1.5]
    element_bounds = (1.2, 1.5)

    label_1_matrix = np.asarray(label_1_matrix)
    label_0_matrix = np.asarray(label_0_matrix)
    label_1_sign = np.asarray(label_1_sign)
    label_0_sign = np.asarray(label_0_sign)

    n_items = max(num_items, 0)  # a non-positive count yields empty outputs
    rows_per_item = n_label1_cases + n_label0_cases + n_similar_cases

    # Preallocate once instead of np.append per row, which re-copies the whole
    # matrix on every call.
    train_x = np.empty((n_items * rows_per_item, 2 * size_ref_vector))
    train_y = np.empty((n_items * rows_per_item, 1))

    row = 0
    for _ in range(n_items):
        ref_vector1 = np.random.uniform(low=0.01, high=0.6, size=size_ref_vector)

        labelled_variants = []
        # Label 1
        for icase in range(n_label1_cases):
            var_vector1 = _make_variant(ref_vector1, label_1_matrix[icase, :],
                                        label_1_sign[icase, :], scale_bounds, element_bounds)
            labelled_variants.append((var_vector1, 1))
        # Benign, label 0
        for icase in range(n_label0_cases):
            var_vector1 = _make_variant(ref_vector1, label_0_matrix[icase, :],
                                        label_0_sign[icase, :], scale_bounds, element_bounds)
            labelled_variants.append((var_vector1, 0))
        # Benign, label 0, "similar" variants (rescaling only, no perturbed elements)
        for _ in range(n_similar_cases):
            var_vector1 = _make_variant(ref_vector1, (), (), scale_bounds, element_bounds)
            labelled_variants.append((var_vector1, 0))

        for var_vector1, label in labelled_variants:
            train_x[row, :size_ref_vector] = ref_vector1
            train_x[row, size_ref_vector:] = var_vector1
            train_y[row, 0] = label
            row += 1

    return train_x, train_y

In [None]:
size_ref_vector = 1000

# Seed both random generators used by generate_dataset_new2 (NumPy for the
# reference vectors and the position/sign patterns below, `random` for the
# scale factors) so that Restart & Run All reproduces the same dataset.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

ref_elements_to_change = int(size_ref_vector*0.1)  # 10% of the elements (label 1 patterns)
var_elements_to_change = int(size_ref_vector*0.02) # 2% of the elements (label 0 patterns)

# One row of positions / signs per case pattern.
# np.random.randint excludes its upper bound, so the bound must be
# size_ref_vector (not size_ref_vector-1) for the last index to be selectable.
# generate_dataset_new2 reads 10 rows of the label-1 arrays and only the first
# 5 rows of the label-0 arrays.
label_1_matrix = np.random.randint(0, size_ref_vector, size=(10, ref_elements_to_change))
label_0_matrix = np.random.randint(0, size_ref_vector, size=(10, var_elements_to_change))
label_1_sign = np.random.choice([-1, 1], size=(10, ref_elements_to_change))
label_0_sign = np.random.choice([-1, 1], size=(10, var_elements_to_change))

print('label_1_matrix.shape: ' , label_1_matrix.shape) # (10, 100)
print('label_0_matrix.shape: ' , label_0_matrix.shape) # (10, 20)
print('label_1_sign.shape: ' , label_1_sign.shape) # (10, 100)
print('label_0_sign.shape: ' , label_0_sign.shape) # (10, 20)

Nset = 100 # 1 set of data = 20 samples, 100 sets = 2000 samples
print('Generating the training dataset ...')
train_x, train_y = generate_dataset_new2(Nset, size_ref_vector, label_1_matrix, label_0_matrix,
                                         label_1_sign, label_0_sign)
print(train_x.shape) # (2000, 2000); each row is [ ref vector  variant vector ]
print(train_y.shape) # (2000, 1)

label_1_matrix.shape:  (10, 100)
label_0_matrix.shape:  (10, 20)
label_1_sign.shape:  (10, 100)
label_0_sign.shape:  (10, 20)
Generating the training dataset ...
(2000, 2000)
(2000, 1)


In [None]:
# Test dataset: produced by the same generator with the same position/sign
# patterns as the training data; each set contributes 20 samples and the
# reference vectors are drawn anew inside generate_dataset_new2.
Nset_test = 100  # 100 sets = 2000 samples
print('Generating the testing dataset ...')
test_x, test_y = generate_dataset_new2(Nset_test, size_ref_vector, label_1_matrix, label_0_matrix,
                                       label_1_sign, label_0_sign)
print(test_x.shape) # (2000, 2000)
print(test_y.shape) # (2000, 1)

Generating the testing dataset ...
(2000, 2000)
(2000, 1)


## ANN Training

In [None]:
# NOTE(review): `ANN_model` is not created in any cell visible in this notebook;
# it must already exist in the kernel (presumably a 2000-input model) — confirm
# and add its definition so Restart & Run All works.
start_time = time.time()
history = ANN_model.fit(train_x, train_y, epochs=50, batch_size=64, validation_split=0.15, verbose=2)
end_time = time.time()
exe_time = end_time - start_time
print("Execution time: ", exe_time)

# The training log reports loss and accuracy, so evaluate() returns
# [loss, accuracy]; label both instead of printing the pair as "Accuracy".
scores = ANN_model.evaluate(test_x, test_y, verbose=2)
print("Testing loss = ", scores[0], "; Testing Accuracy = ", scores[1])

Epoch 1/50
27/27 - 0s - loss: 0.1589 - accuracy: 0.9594 - val_loss: 0.2049 - val_accuracy: 0.9400
Epoch 2/50
27/27 - 0s - loss: 0.0070 - accuracy: 0.9994 - val_loss: 0.1491 - val_accuracy: 0.9500
Epoch 3/50
27/27 - 0s - loss: 0.0030 - accuracy: 0.9994 - val_loss: 0.0550 - val_accuracy: 0.9833
Epoch 4/50
27/27 - 0s - loss: 0.0044 - accuracy: 0.9988 - val_loss: 0.0409 - val_accuracy: 0.9867
Epoch 5/50
27/27 - 0s - loss: 8.4153e-04 - accuracy: 1.0000 - val_loss: 0.0161 - val_accuracy: 0.9933
Epoch 6/50
27/27 - 0s - loss: 0.0016 - accuracy: 1.0000 - val_loss: 0.0049 - val_accuracy: 1.0000
Epoch 7/50
27/27 - 0s - loss: 0.0011 - accuracy: 1.0000 - val_loss: 0.0016 - val_accuracy: 1.0000
Epoch 8/50
27/27 - 0s - loss: 9.3951e-04 - accuracy: 1.0000 - val_loss: 7.7027e-04 - val_accuracy: 1.0000
Epoch 9/50
27/27 - 0s - loss: 6.2903e-04 - accuracy: 1.0000 - val_loss: 5.2322e-04 - val_accuracy: 1.0000
Epoch 10/50
27/27 - 0s - loss: 7.1072e-04 - accuracy: 1.0000 - val_loss: 3.9793e-04 - val_accuracy

## Evaluating training results

In [None]:
# Predictions of the ANN on the test dataset
y_pred = ANN_model.predict(test_x)

# Clip once so the BCE check below never evaluates log(0); clipping does not
# move any value across the 0.5 threshold used afterwards.
epsilon = 1e-15
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

print(test_x.shape)
print(test_y.shape)
print(y_pred.shape)

total_loss = binary_cross_entropy_check(test_y, y_pred)
print(f"** CHECK Average BCE Loss for multiple samples: {total_loss}")

# Accuracy: threshold the probabilities into hard 0/1 predictions
N = len(test_y)
threshold = 0.5
y_pred_binary = (y_pred >= threshold).astype(int)

accuracy = accuracy_score(test_y, y_pred_binary)
print("Accuracy Score:", accuracy)

# Count error cases. Both arrays are flattened first so the element-wise
# comparison is correct whether test_y is shaped (N, 1) or (N,).
test_y_binary = test_y
labels_flat = np.ravel(test_y_binary)
preds_flat = np.ravel(y_pred_binary)
error0to1 = int(np.sum((labels_flat == 0) & (preds_flat == 1)))  # label 0 predicted as 1
error1to0 = int(np.sum((labels_flat == 1) & (preds_flat == 0)))  # label 1 predicted as 0

print('error0to1 = ',error0to1, '; label is 0') # test_y is 0, but predicted as 1
print('error1to0 = ',error1to0, '; label is 1') # test_y is 1, but predicted as 0
print('Testing total error = ',error0to1+error1to0,'percentError = ', 100*(error0to1+error1to0)/N)

# Report the actual class balance instead of a hard-coded sentence
n_label0 = int(np.sum(labels_flat == 0))
n_label1 = int(np.sum(labels_flat == 1))
print(f'{N} test cases: {n_label0} label 0; {n_label1} label 1')

(2000, 2000)
(2000, 1)
(2000, 1)
** CHECK Average BCE Loss for multiple samples: [0.0191]
Accuracy Score: 0.9945
error0to1 =  10 ; label is 0
error1to0 =  1 ; label is 1
Testing total error =  11 percentError =  0.55
2000 test cases: 1000 label 0; 1000 label 1


In [None]:
print('test_x.shape ', test_x.shape)
print('test_y.shape ', test_y.shape)
print(y_pred_binary.shape)

print('Compare between test_y and y_pred_binary')
N = len(test_y)

# Tally misclassifications by position within each 20-sample set
# (row index modulo 20).
errorcase = [0] * 20
for row in range(N):
    if test_y[row] != y_pred_binary[row]:
        errorcase[row % 20] += 1
print(errorcase)
total_errors = np.sum(errorcase)
print('Sum of errors = ', total_errors, '  Percent Error', 100 * total_errors / N)

test_x.shape  (2000, 2000)
test_y.shape  (2000, 1)
(2000, 1)
Compare between test_y and y_pred_binary
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1]
Sum of errors =  11   Percent Error 0.55


In [None]:
print('train_x.shape ', train_x.shape)
print('train_y.shape ', train_y.shape)

# Predictions of the ANN on the *training* dataset.
# This rebinds y_pred / y_pred_binary, which earlier cells filled with
# test-set predictions.
y_pred = ANN_model.predict(train_x)

epsilon = 1e-15  # small value keeping probabilities strictly inside (0, 1)
y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

N = len(train_y)
threshold = 0.5
y_pred_binary = np.greater_equal(y_pred, threshold).astype(int)

print(y_pred_binary.shape)
print('Compare between train_y and y_pred_binary')

# Tally misclassifications by position within each 20-sample set
errorcase = [0] * 20
for row in range(N):
    if train_y[row] != y_pred_binary[row]:
        errorcase[row % 20] += 1
print(errorcase)
print('len(error) ', len(errorcase))
total_errors = np.sum(errorcase)
print('Sum of errors = ', total_errors, '  Percent Error', 100 * total_errors / N)

train_x.shape  (2000, 2000)
train_y.shape  (2000, 1)
(2000, 1)
Compare between train_y and y_pred_binary
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(error)  20
Sum of errors =  0   Percent Error 0.0


# Repeat using the delta 'difference' vector

In [None]:
# Build the absolute-difference vector of the training cases.
# Each row of train_x is [ ref vector | variant vector ], so the split point is
# half the column count (1000 for the 2000-column dataset) rather than a
# hard-coded index.
n_features = train_x.shape[1] // 2
ref_vector = train_x[:, :n_features]
var_vector = train_x[:, n_features:2 * n_features]
diff_vector = np.abs(ref_vector - var_vector)
print(ref_vector.shape, var_vector.shape, diff_vector.shape) # (2000, 1000) (2000, 1000) (2000, 1000)

(2000, 1000) (2000, 1000) (2000, 1000)


In [None]:
# ANN model for the difference-vector input (1000 features by default);
# smaller model with halved number of neurons
def baseline_model4(input_dim=1000):
    """
    Build and compile the dense binary classifier.

    Architecture: Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(64) ->
    BatchNorm -> Dropout(0.3) -> Dense(16) -> BatchNorm -> Dense(1, sigmoid),
    with ReLU activations on the hidden layers.

    input_dim: number of input features per sample (default 1000, the
               previously hard-coded value).
    Returns the model compiled with the Adam optimizer, binary
    cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        Dense(256, activation='relu', input_dim=input_dim),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid'),  # Output layer for binary classification
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Train a fresh difference-vector model; 15% of the rows are held out for
# validation. Note that `history` is rebound here.
ANN_model_diff = baseline_model4()
history = ANN_model_diff.fit(
    x=diff_vector,
    y=train_y,
    epochs=30,
    batch_size=64,
    validation_split=0.15,
    verbose=2,
)

Epoch 1/30
27/27 - 1s - loss: 0.1970 - accuracy: 0.9329 - val_loss: 0.4121 - val_accuracy: 0.9967
Epoch 2/30
27/27 - 0s - loss: 0.0277 - accuracy: 0.9994 - val_loss: 0.2967 - val_accuracy: 1.0000
Epoch 3/30
27/27 - 0s - loss: 0.0171 - accuracy: 1.0000 - val_loss: 0.2278 - val_accuracy: 1.0000
Epoch 4/30
27/27 - 0s - loss: 0.0142 - accuracy: 1.0000 - val_loss: 0.1705 - val_accuracy: 1.0000
Epoch 5/30
27/27 - 0s - loss: 0.0097 - accuracy: 1.0000 - val_loss: 0.1277 - val_accuracy: 1.0000
Epoch 6/30
27/27 - 0s - loss: 0.0079 - accuracy: 1.0000 - val_loss: 0.0919 - val_accuracy: 1.0000
Epoch 7/30
27/27 - 0s - loss: 0.0075 - accuracy: 1.0000 - val_loss: 0.0630 - val_accuracy: 1.0000
Epoch 8/30
27/27 - 0s - loss: 0.0061 - accuracy: 1.0000 - val_loss: 0.0416 - val_accuracy: 1.0000
Epoch 9/30
27/27 - 0s - loss: 0.0052 - accuracy: 1.0000 - val_loss: 0.0273 - val_accuracy: 1.0000
Epoch 10/30
27/27 - 0s - loss: 0.0057 - accuracy: 1.0000 - val_loss: 0.0173 - val_accuracy: 1.0000
Epoch 11/30
27/27 -

In [None]:
# Results based on the vector difference of the test cases.
# Each row of test_x is [ ref vector | variant vector ]; split at half the
# column count instead of hard-coded indices.
n_features_test = test_x.shape[1] // 2
ref_vector_test = test_x[:, :n_features_test]
var_vector_test = test_x[:, n_features_test:2 * n_features_test]
diff_vector_test = np.abs(ref_vector_test - var_vector_test)
print(ref_vector_test.shape, var_vector_test.shape, diff_vector_test.shape)

# Evaluate on the test set.
# y_pred is not used further in this cell; it is kept so the raw
# probabilities remain available for inspection.
y_pred = ANN_model_diff.predict(diff_vector_test)

loss_diff, acc_diff = ANN_model_diff.evaluate(diff_vector_test, test_y, verbose=2)
print(loss_diff, acc_diff)

(2000, 1000) (2000, 1000) (2000, 1000)
63/63 - 0s - loss: 2.6855e-04 - accuracy: 1.0000
0.0002685519284568727 1.0


In [None]:
import os
# Output folder for the exported CSV files ("path" is a placeholder value)
DIR = "path"
out_path = os.path.join(DIR, "trainX_vectors3.csv")
print(out_path)
# NOTE(review): `DBset` is not produced by the dataset cells above (they create
# train_x / test_x) — confirm which array is meant to be exported here.
np.savetxt(fname=out_path, X=DBset, fmt='%.5f', delimiter=',')

In [None]:
# Export the training labels as CSV with 5 decimal places.
# NOTE(review): `DBy` is not produced by the dataset cells above (they create
# train_y / test_y) — confirm which array is meant to be exported here.
out_path = os.path.join(DIR, "trainY_vectors3.csv")
print(out_path)
np.savetxt(fname=out_path, X=DBy, fmt='%.5f', delimiter=',')

In [None]:
# Export the test inputs as CSV with 5 decimal places.
# NOTE(review): `DBset_test` is not produced by the dataset cells above (they
# create test_x) — confirm which array is meant to be exported here.
out_path = os.path.join(DIR, "testX_vectors3.csv")
print(out_path)
np.savetxt(fname=out_path, X=DBset_test, fmt='%.5f', delimiter=',')

In [40]:
# NO USE BELOW

# Define ANOTHER ANN model: plain dense stack for binary classification
input_dim = 16384
ANNmodel2 = keras.Sequential([
    # One flat feature vector per sample. The previous
    # input_shape=(1500, input_dim) declared 3-D inputs, so the Dense layers
    # would act on the last axis only and emit one output per step instead of
    # one probability per sample.
    layers.Dense(512, activation='relu', input_shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid') # Output layer for binary classification
])

# compile with cross entropy
ANNmodel2.compile(optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])

In [None]:
# Training run for ANNmodel2, with wall-clock timing.
# NOTE(review): `x_train` / `y_train` are not defined in any visible cell (the
# rest of the notebook uses train_x / train_y) — confirm the intended arrays.
start_time = time.time()
history = ANNmodel2.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=32,
    verbose=2,
)
end_time = time.time()
exe_time = end_time - start_time
print("Execution time: ", exe_time)

In [None]:
# Evaluate ANNmodel2 on the test arrays; `scores` is rebound here.
scores = ANNmodel2.evaluate(x=test_x, y=test_y, verbose=2)

In [55]:
# test code
import numpy as np
from keras.losses import binary_crossentropy
import keras.backend as K

# Toy labels and predicted probabilities for a quick sanity check
y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0.1, 0.9, 0.8, 0.2, 0.7])

# Binary Cross-Entropy with plain NumPy
def binary_cross_entropy(y_true, y_pred):
    """
    Mean binary cross-entropy of y_pred against y_true.

    No clipping is applied, so a predicted probability of exactly 0 or 1
    reaches np.log(0).
    """
    per_sample = y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
    return -np.mean(per_sample)

bce_loss = binary_cross_entropy(y_true, y_pred)
print(f"***Binary Cross-Entropy Loss (function): {bce_loss}")

#===========================================================================
def binary_cross_entropy_np(y_true, y_pred):
    """
    Average binary cross-entropy across samples (vectorized NumPy).

    NOTE(review): a function with this same name and logic is also defined
    near the top of the notebook; running this cell rebinds the name.

    y_true: NumPy array of actual labels (0s and 1s)
    y_pred: NumPy array of predicted probabilities (between 0 and 1);
            clipped to [1e-15, 1 - 1e-15] so the logs stay finite
    """
    epsilon = 1e-15  # Small value to prevent log(0)
    clipped = np.clip(y_pred, epsilon, 1 - epsilon)
    per_sample = y_true * np.log(clipped) + (1 - y_true) * np.log(1 - clipped)
    loss = -np.mean(per_sample)
    return loss

# Clipped version evaluated on the same toy data; `total_loss` is rebound here.
total_loss = binary_cross_entropy_np(y_true=y_true, y_pred=y_pred)
print(f"--Average BCE Loss for multiple samples: {total_loss}")
#--------------------------------------------------------------------------

# Compute Binary Cross-Entropy using Keras
# DOES NOT WORK
# bce_loss_keras = binary_crossentropy(K.constant(y_true), K.constant(y_pred)).numpy()
# print(f"Binary Cross-Entropy Loss (Keras): {bce_loss_keras}")

***Binary Cross-Entropy Loss (function): 0.20273661557656092
--Average BCE Loss for multiple samples: 0.20273661557656092


In [84]:
# Scratch check: largest and smallest element of a small array
aaa = np.array([0, 1, 1, -2.5, 1, 3.4])
bbb = np.array([0.1, 0.9, 0.8, 0.2, 0.7])  # not used in this cell

print(aaa.max())
print(aaa.min())

3.4
-2.5


In [None]:
    # Scratch demo: grow a (0, 4) array one row at a time and print it
    # after each step.
    DBset = np.empty((0,4))
    DBset_tmp = np.array([1,2,3,4])
    DBset = np.concatenate((DBset, DBset_tmp[np.newaxis, :]), axis=0)
    print(DBset)

    DBset_tmp = np.array([5,6,7,8])
    DBset = np.concatenate((DBset, DBset_tmp[np.newaxis, :]), axis=0)

    print(DBset)
    print('========================')
    DBset_tmp = np.array([9,10,11,12])
    DBset = np.concatenate((DBset, DBset_tmp[np.newaxis, :]), axis=0)

    print(DBset)

In [None]:
# NO NEED TO LOAD / READIN

# READ IN CSV (takes 30 seconds)
# Each file is reshaped to 4000 rows x 16385 columns: columns 0..16383 are
# used as inputs and column 16384 as the label.
# Running this cell rebinds train_x / train_y / test_x / test_y, and the label
# arrays come out 1-D.
new_train_values = np.loadtxt("train_vectors2.csv", delimiter=",")
print(new_train_values.shape)
new_train = np.reshape(new_train_values, (4000, 16385))
print('new_train.shape = ', new_train.shape)


new_test_values = np.loadtxt("test_vectors2.csv", delimiter=",")
print(new_test_values.shape)
new_test = np.reshape(new_test_values, (4000, 16385))
print('new_test.shape = ', new_test.shape)

train_x, train_y = new_train[:, :16384], new_train[:, 16384]
print(train_x.shape)
print(train_y.shape)
#===================
test_x, test_y = new_test[:, :16384], new_test[:, 16384]
print(test_x.shape)
print(test_y.shape)