# Synthetic dataset generation for neural network training

This notebook creates a synthetic dataset for training a neural network to predict BRCA1 variant labels (0 = benign, 1 = pathogenic). The dataset consists of feature vectors (`ref_vector1`, `var_vector1`) and their associated labels. It uses randomly generated data to simulate variations in the feature values.

In [None]:
import os
import numpy as np
import pandas as pd
import random
import copy
# Directory that receives the generated CSV files.
DIR = "/mnt/nfs/rigenenfs/shared_resources/biobanks/UKBIOBANK/pangk/evo2/NN"

# NOTE(review): input_path and out_train are built from the same file name
# (train_vectors2.csv), so anything written to input_path lands on the
# training export -- confirm this is intended.
out_train = os.path.join(DIR, "train_vectors2.csv")
input_path = out_train
out_test = os.path.join(DIR, "test_vectors2.csv")

# Echo the resolved destinations: training file first, then testing file.
print(out_train)
print(out_test)

### Define functions

In [None]:
# label_value = 0
# percent_n = 10 
# default size is 8192
# output dimension is 1x16385, which is 8192+8192+1 (ref vector + variant vector + label)
# upperR = 1.4
# lowerR = 0.6

def create_one_case(percent_n,label_value,lowerR,upperR,size=8192):
    """
    Generate one synthetic row: reference vector, variant vector and label.

    ref_vector1 holds `size` values drawn uniformly between 0.00001 and 0.01.
    var_vector1 starts as a copy of ref_vector1; percent_n % of its elements
    (chosen at random, without repetition) are multiplied by an individual
    random factor between lowerR and upperR, and the result is clipped to
    [1e-15, 1 - 1e-15].

    Parameters:
        percent_n (float): percentage (0-100) of elements in ref_vector1 to modify in var_vector1.
        label_value (int): label for the generated data point (0 for benign, 1 for pathogenic).
        lowerR (float): lower bound of the random multiplier applied to the selected elements.
        upperR (float): upper bound of the random multiplier applied to the selected elements.
        size (int): number of elements in each of the two vectors (default 8192).
    Returns:
        np.ndarray: 1D array of length 2*size + 1 (16385 with the default size):
        ref_vector1, followed by var_vector1, followed by label_value as the final element.
    Raises:
        ValueError: if percent_n is outside the range [0, 100].
    """
    if not 0 <= percent_n <= 100:
        raise ValueError("percent_n must be between 0 and 100, got {}".format(percent_n))

    # Reference vector
    # TODO (from the original author): values may need rescaling to (low=0.00001, high=1)
    ref_vector1 = np.random.uniform(low=0.00001, high=0.01, size=size)
    # The variant starts as an independent copy of the reference
    var_vector1 = ref_vector1.copy()

    num_elements_to_change = int(size*(percent_n/100))
    indices_to_change = random.sample(range(size),num_elements_to_change)
    # One multiplier per selected element, drawn in the same order as the
    # selected indices so a seeded `random` produces the same rows as before.
    multipliers = np.array([random.uniform(lowerR,upperR) for _ in indices_to_change], dtype=float)
    # Explicit integer dtype keeps the fancy index valid when nothing is selected
    selected = np.array(indices_to_change, dtype=np.intp)
    var_vector1[selected] = ref_vector1[selected]*multipliers

    # Clip so every value stays strictly inside (0, 1)
    epsilon = 1e-15  # Small value to prevent log(0)
    var_vector1 = np.clip(var_vector1, epsilon, 1 - epsilon)

    # Sanity check: count the positions that really differ. A multiplier very
    # close to 1 can leave an element within the tolerance, hence only a warning.
    count_of_change = int(np.count_nonzero(np.abs(ref_vector1 - var_vector1) > 0.0000000001))
    if (num_elements_to_change != count_of_change):
        print('--- Minor Warning:num_elements_to_change NOT.EQUAL count_of_change; details are ', num_elements_to_change, count_of_change)

    combined_array = np.concatenate((ref_vector1,var_vector1,np.array([label_value])))
    return combined_array

# Generate N rows that all share one percent_n / label_value setting
def create_N_case(N,percent_n,label_value,lowerR=0.6,upperR=1.4):
    """
    Generate a 2D array of N synthetic rows built by create_one_case.

    Parameters:
        N (int): The number of rows (data points) to generate; must be at least 1.
        percent_n (float): The percentage of elements in each reference vector to modify.
        label_value (int): The label for all rows in the dataset.
        lowerR (float): Lower bound of the random multiplier passed to create_one_case.
            Previously read from a global variable; defaults to 0.6.
        upperR (float): Upper bound of the random multiplier passed to create_one_case.
            Previously read from a global variable; defaults to 1.4.
    Returns:
        two_D_array (numpy array): A 2D array with N rows; each row contains the
        reference vector, variant vector, and label (16385 columns with the
        default vector size).
    Raises:
        ValueError: if N is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be at least 1, got {}".format(N))
    # Build every row first and stack once; inserting into a growing array
    # copies the whole array on each iteration. Rows are stacked in creation
    # order (they are independent random draws).
    rows = [create_one_case(percent_n,label_value,lowerR,upperR) for _ in range(N)]
    two_D_array = np.vstack(rows)
    print('*** Finished creating the 2D array: ',two_D_array.shape, 'percent_n =',percent_n, ' label_value =',label_value, ' N=',N)
    return  two_D_array


### 1. Generate dataset with 800 samples
Each call outputs a (800, 16385) array in which the first 8192 columns are from `ref_vector1`, the next 8192 columns are from `var_vector1`, and the final column is the label.

Note: `percent_n` controls how much variation exists between `ref_vector1` and `var_vector1`. Higher values of `percent_n` simulate greater changes, which might correspond to more severe or clearer biological differences.

In [None]:
# Training batches: 800 rows for each of the six settings below.
# Note: the _10percent ... _80percent suffixes of the variable names do not
# match the percent_n values assigned here.
N = 800

# Case 1: benign data (label = 0) with 0.1 %, 0.2 % and 0.3 % of elements modified
label_value = 0
percent_n = 0.1
two_D_array_10percent = create_N_case(N, percent_n, label_value)
percent_n = 0.2
two_D_array_20percent = create_N_case(N, percent_n, label_value)
percent_n = 0.3
two_D_array_30percent = create_N_case(N, percent_n, label_value)

# Case 2: pathogenic data (label = 1) with 2 %, 3 % and 4 % of elements modified
label_value = 1
percent_n = 2
two_D_array_60percent = create_N_case(N, percent_n, label_value)
percent_n = 3
two_D_array_70percent = create_N_case(N, percent_n, label_value)
percent_n = 4
two_D_array_80percent = create_N_case(N, percent_n, label_value)

*** Finished creating the 2D array:  (800, 16385) percent_n = 0.1  label_value = 0  N= 800
*** Finished creating the 2D array:  (800, 16385) percent_n = 0.2  label_value = 0  N= 800
*** Finished creating the 2D array:  (800, 16385) percent_n = 0.3  label_value = 0  N= 800
*** Finished creating the 2D array:  (800, 16385) percent_n = 2  label_value = 1  N= 800
*** Finished creating the 2D array:  (800, 16385) percent_n = 3  label_value = 1  N= 800
*** Finished creating the 2D array:  (800, 16385) percent_n = 4  label_value = 1  N= 800


In [None]:
# Join the six training batches row-wise: the three label-0 batches first,
# then the three label-1 batches.
DB_train = np.concatenate(
    (
        two_D_array_10percent,
        two_D_array_20percent,
        two_D_array_30percent,
        two_D_array_60percent,
        two_D_array_70percent,
        two_D_array_80percent,
    ),
    axis=0,
)
print(DB_train.shape) # (4800, 16385)

# Export as comma-separated text.
# NOTE(review): '%.5f' keeps five decimals while the generated values start
# at 0.00001 -- confirm this precision is sufficient.
# DB_train.tofile(out_path,sep=',',fmt='%.5f')
np.savetxt(out_train, DB_train, fmt='%.5f', delimiter=',')
print("Finished Export of Training cases ")

(4800, 16385)


In [20]:
# Testing batches: 100 rows for each of the same six settings used for training.
# Note: the _10percent ... _80percent suffixes of the variable names do not
# match the percent_n values assigned here.
N = 100

# Label 0 with 0.1 %, 0.2 % and 0.3 % of elements modified
label_value = 0
percent_n = 0.1
two_D_array_10percent = create_N_case(N, percent_n, label_value)
percent_n = 0.2
two_D_array_20percent = create_N_case(N, percent_n, label_value)
percent_n = 0.3
two_D_array_30percent = create_N_case(N, percent_n, label_value)

# Label 1 with 2 %, 3 % and 4 % of elements modified
label_value = 1
percent_n = 2
two_D_array_60percent = create_N_case(N, percent_n, label_value)
percent_n = 3
two_D_array_70percent = create_N_case(N, percent_n, label_value)
percent_n = 4
two_D_array_80percent = create_N_case(N, percent_n, label_value)

*** Finished creating the 2D array:  (100, 16385) percent_n = 0.1  label_value = 0  N= 100
*** Finished creating the 2D array:  (100, 16385) percent_n = 0.2  label_value = 0  N= 100
*** Finished creating the 2D array:  (100, 16385) percent_n = 0.3  label_value = 0  N= 100
*** Finished creating the 2D array:  (100, 16385) percent_n = 2  label_value = 1  N= 100
*** Finished creating the 2D array:  (100, 16385) percent_n = 3  label_value = 1  N= 100
*** Finished creating the 2D array:  (100, 16385) percent_n = 4  label_value = 1  N= 100


In [None]:
# Join the six testing batches row-wise (6 batches x 100 rows = 600 cases):
# the three label-0 batches first, then the three label-1 batches.
DB_test = np.concatenate(
    (
        two_D_array_10percent,
        two_D_array_20percent,
        two_D_array_30percent,
        two_D_array_60percent,
        two_D_array_70percent,
        two_D_array_80percent,
    ),
    axis=0,
)
print(DB_test.shape) # (600, 16385)

# Export as comma-separated text with five decimals per value.
np.savetxt(out_test, DB_test, fmt='%.5f', delimiter=',')
print("Finished Export of Testing cases ")

In [22]:
# READ CSV: load the file from the location it was exported to (out_test).
# A bare "test_vectors2.csv" only resolves when the working directory is DIR.
new_test_values = np.loadtxt(out_test,delimiter=",")

In [23]:
# Reshape check: print the loaded shape, reshape to 600 rows x 16385 columns,
# then print the shape again.
print(new_test_values.shape)
new_test = np.reshape(new_test_values, (600, 16385))
print(new_test.shape)


(600, 16385)
(600, 16385)


In [None]:
################################## NO USE BELOW ##################################

In [67]:
# Scratch experiment: grow a 2D array one row at a time with np.insert.
label_value = 0
percent_n = 10

# create_one_case requires the multiplier bounds as its 3rd and 4th arguments;
# 0.8 and 1.2 are the bounds used by the inline prototype further down.
two_D_array = np.array( [ create_one_case(percent_n,label_value,0.8,1.2)  ] )
print('A', two_D_array.shape)
new_row = create_one_case(percent_n,label_value,0.8,1.2)
two_D_array = np.insert(two_D_array,0,new_row,axis = 0) #insert as first row (index 0)
print('B', two_D_array.shape)
two_D_array = np.insert(two_D_array,1,new_row,axis = 0) #insert as second row (index 1)
print('B', two_D_array.shape)


A (1, 16385)
B (2, 16385)
B (3, 16385)


In [41]:
#######################################################
# Step 1: create samples of input ref/var vector pairs

# Reference vector: 8192 uniform random draws between 0.00001 and 0.01
ref_vector1 = np.random.uniform(low=0.00001, high=0.01, size=8192)
# The variant begins as an independent copy of the reference
var_vector1 = ref_vector1.copy()
 


In [42]:
# Perturb percent_n % of the variant vector
label_value = 0
percent_n = 10
num_elements_to_change = int(len(ref_vector1)*(percent_n/100))
# Pick that many distinct positions at random
indices_to_change = random.sample(range(len(ref_vector1)), num_elements_to_change)
# Scale each selected element by its own random factor between 0.8 and 1.2
for index in indices_to_change:
    var_vector1[index] = ref_vector1[index] * random.uniform(0.8, 1.2)

#print(ref_vector1[:30])
print(num_elements_to_change)

819


In [45]:
# Checking: count the positions where the variant differs from the reference
# by more than the 1e-10 tolerance, to confirm the elements were modified
count_of_change = int(np.count_nonzero(np.abs(ref_vector1 - var_vector1) > 0.0000000001))

print(count_of_change)
print(len(ref_vector1))

819
8192


In [47]:
# Row layout: reference vector, then variant vector, then the label (0) last
combined_array = np.hstack((ref_vector1, var_vector1, np.array([0])))
print(combined_array.shape) #8192 x 2 + 1 = 16385


(16385,)


In [17]:
# np.insert demo: add a row to a one-row 2D array
new_row = np.array([7, 8, 9])
two_D_array = np.array([[1, 2, 3]])
# Index 1 along axis 0 places new_row as the second row
updated_array = np.insert(two_D_array, 1, new_row, axis=0)

# Shapes of the new row, the input array and the result, in that order
for demo_array in (new_row, two_D_array, updated_array):
    print(demo_array.shape)

(3,)
(1, 3)
(2, 3)


In [5]:
# np.insert demo: add a row in the middle of a two-row 2D array
two_D_array = np.arange(1, 7).reshape(2, 3)  # [[1, 2, 3], [4, 5, 6]]
new_row = np.arange(7, 10)  # [7, 8, 9]
# Index 1 along axis 0 places new_row between the two existing rows
updated_array = np.insert(two_D_array, 1, new_row, axis=0)

print(updated_array.shape)

(3, 3)


In [None]:
# No Use: place the reference and variant vectors side by side as two columns
df_combined = pd.DataFrame(
    dict(
        Ref_vector=ref_vector1,
        Variant_vector=var_vector1,
        # Label=label_vector  (column left disabled)
    )
)

# Step 4: show a preview and the shape, then save to CSV without the index.
# NOTE(review): input_path is built from the same file name as out_train
# (train_vectors2.csv), so this write would replace the training export --
# confirm before running.
print(df_combined.head())
print(df_combined.shape)
df_combined.to_csv(input_path, index=False)

In [None]:
# No use: count how many entries of ref_vector1 are exactly 999
count_of_999 = int(np.count_nonzero(ref_vector1 == 999))

print(count_of_999)
print(len(ref_vector1))