### coding strategy benchmarking

In [1]:
import numpy as np
import pandas as pd

# Results table shared by the benchmark cells below: it starts with no rows
# and only the seven column headers.
benchmark_df = pd.DataFrame(
    columns=[
        'coding strategy',
        'abundance',
        'differentiation',
        'time cost',
        'can be pooled?',
        'positional interpretability',
        'functional interpretability',
    ]
)

# Directory prefix that the benchmark cells prepend to each dataset file name.
data_path = '../datasets/benchmark_datasets/'

#### coding 1: One-Hot

In [2]:
# Benchmark the One-Hot coding strategy: average its per-sample abundance,
# differentiation and time cost over the dataset, then store one row in benchmark_df.
coding_strategy = 'One-Hot'
can_be_pooled = 'No'
positional_interpretability = 'Yes'
functional_interpretability = 'No'

# calculating metrics
L = 501  # divisor shared by all three metrics; presumably the input sequence length -- TODO confirm
# NOTE(review): unpickling can execute arbitrary code, so only load trusted dataset files.
data = pd.read_pickle(data_path + 'onehot.dataset')

T_list = []
abundance_list = []
differentiation_list = []
# Iterate the columns positionally so the loop does not depend on the dataset
# having a 0..n-1 index (label lookups such as data[col][i] would).
for raw_before, raw_after, time_before, time_after in zip(
        data['embedding_before'], data['embedding_after'],
        data['time_before'], data['time_after']):
    embedding_before = np.array(raw_before)
    embedding_after = np.array(raw_after)

    # abundance: number of values in the "before" embedding, per unit of L
    dim_sum = embedding_before.size
    T_list.append((time_before + time_after)/float(L))
    abundance_list.append(dim_sum/float(L))

    # differentiation: min-max scale each embedding on its own range, then
    # sum the absolute element-wise change, per unit of L
    x_before_normal = (embedding_before - np.amin(embedding_before))/float(np.ptp(embedding_before))
    x_after_normal = (embedding_after - np.amin(embedding_after))/float(np.ptp(embedding_after))
    x_normal = x_after_normal - x_before_normal
    differentiation_list.append(np.sum(np.absolute(x_normal))/float(L))

# dataset-level averages of the per-sample metrics
T = sum(T_list)/float(len(data))
abundance = sum(abundance_list)/float(len(data))
differentiation = sum(differentiation_list)/float(len(data))

# generate dataframe
new_row = pd.DataFrame([{'coding strategy': coding_strategy, 'abundance': abundance,
                         'differentiation': differentiation, 'time cost': T,
                         'can be pooled?': can_be_pooled,
                         'positional interpretability': positional_interpretability,
                         'functional interpretability': functional_interpretability}])
# Drop any row a previous run of this cell left behind so re-running does not duplicate the strategy.
previous_rows = benchmark_df[benchmark_df['coding strategy'] != coding_strategy]
# pandas warns when an empty frame takes part in a concatenation, so leave the
# table out while it has no rows; pd.concat replaces the private DataFrame._append.
frames = [new_row] if previous_rows.empty else [previous_rows, new_row]
benchmark_df = pd.concat(frames, ignore_index=True)
print(benchmark_df)

  coding strategy  abundance  differentiation     time cost can be pooled?  \
0         One-Hot        4.0         0.003992  7.958231e-07             No   

  positional interpretability functional interpretability  
0                         Yes                          No  


  benchmark_df = benchmark_df._append({'coding strategy': coding_strategy, 'abundance': abundance, 'differentiation': differentiation,


#### coding 2: DNABert2

In [3]:
# Benchmark the DNABert2 coding strategy: average its per-sample abundance,
# differentiation and time cost over the dataset, then store one row in benchmark_df.
coding_strategy = 'DNABert2'
can_be_pooled = 'Yes'
positional_interpretability = 'No'
functional_interpretability = 'Yes'

# calculating metrics
L = 501  # divisor shared by all three metrics; presumably the input sequence length -- TODO confirm
# NOTE(review): unpickling can execute arbitrary code, so only load trusted dataset files.
data = pd.read_pickle(data_path + 'dnabert.dataset')

T_list = []
abundance_list = []
differentiation_list = []
# Iterate the columns positionally so the loop does not depend on the dataset
# having a 0..n-1 index (label lookups such as data[col][i] would).
for raw_before, raw_after, time_before, time_after in zip(
        data['embedding_before'], data['embedding_after'],
        data['time_before'], data['time_after']):
    embedding_before = np.squeeze(raw_before)
    embedding_after = np.squeeze(raw_after)

    embedding_shape_before = embedding_before.shape
    embedding_shape_after = embedding_after.shape

    # abundance uses the size of the "before" embedding as stored, i.e. before any padding
    embedding_shape = embedding_shape_before
    dim_sum = embedding_before.size

    # if shape doesn't match: zero-pad the shorter embedding at the end of axis 0
    # so both arrays have the same number of rows and can be subtracted.
    # (The padding widths assume 2-D arrays after squeeze.)
    rows_before = embedding_shape_before[0]
    rows_after = embedding_shape_after[0]
    if rows_before < rows_after:
        # Pad axis 0, mirroring the branch below; padding axis 1 here left the
        # row counts unequal and widened the columns instead.
        embedding_before = np.pad(embedding_before, ((0, rows_after - rows_before), (0, 0)), constant_values=0)
    elif rows_before > rows_after:
        embedding_after = np.pad(embedding_after, ((0, rows_before - rows_after), (0, 0)), constant_values=0)

    T_list.append((time_before + time_after)/float(L))
    abundance_list.append(dim_sum/float(L))

    # differentiation: min-max scale each (possibly padded) embedding on its own
    # range, then sum the absolute element-wise change, per unit of L
    x_before_normal = (embedding_before - np.amin(embedding_before))/float(np.ptp(embedding_before))
    x_after_normal = (embedding_after - np.amin(embedding_after))/float(np.ptp(embedding_after))
    x_normal = x_after_normal - x_before_normal
    differentiation_list.append(np.sum(np.absolute(x_normal))/float(L))

# dataset-level averages of the per-sample metrics
T = sum(T_list)/float(len(data))
abundance = sum(abundance_list)/float(len(data))
differentiation = sum(differentiation_list)/float(len(data))

# generate dataframe
new_row = pd.DataFrame([{'coding strategy': coding_strategy, 'abundance': abundance,
                         'differentiation': differentiation, 'time cost': T,
                         'can be pooled?': can_be_pooled,
                         'positional interpretability': positional_interpretability,
                         'functional interpretability': functional_interpretability}])
# Drop any row a previous run of this cell left behind so re-running does not duplicate the strategy.
previous_rows = benchmark_df[benchmark_df['coding strategy'] != coding_strategy]
# pandas warns when an empty frame takes part in a concatenation, so leave the
# table out while it has no rows; pd.concat replaces the private DataFrame._append.
frames = [new_row] if previous_rows.empty else [previous_rows, new_row]
benchmark_df = pd.concat(frames, ignore_index=True)
print(benchmark_df)

  coding strategy   abundance  differentiation     time cost can be pooled?  \
0         One-Hot    4.000000         0.003992  7.958231e-07             No   
1        DNABert2  469.537725         9.314390  1.075537e-03            Yes   

  positional interpretability functional interpretability  
0                         Yes                          No  
1                          No                         Yes  


#### coding 3: GPN

In [4]:
# Benchmark the GPN coding strategy: average its per-sample abundance,
# differentiation and time cost over the dataset, then store one row in benchmark_df.
coding_strategy = 'GPN'
can_be_pooled = 'Yes'
positional_interpretability = 'Yes'
functional_interpretability = 'Yes'

# calculating metrics
L = 501  # divisor shared by all three metrics; presumably the input sequence length -- TODO confirm
# NOTE(review): unpickling can execute arbitrary code, so only load trusted dataset files.
data = pd.read_pickle(data_path + 'gpn.dataset')

T_list = []
abundance_list = []
differentiation_list = []
# Iterate the columns positionally so the loop does not depend on the dataset
# having a 0..n-1 index (label lookups such as data[col][i] would).
for raw_before, raw_after, time_before, time_after in zip(
        data['embedding_before'], data['embedding_after'],
        data['time_before'], data['time_after']):
    embedding_before = np.array(raw_before)
    embedding_after = np.array(raw_after)

    # abundance: number of values in the "before" embedding, per unit of L
    # (an earlier debug print recorded the embedding shape as (1, 501, 512))
    dim_sum = embedding_before.size
    T_list.append((time_before + time_after)/float(L))
    abundance_list.append(dim_sum/float(L))

    # differentiation: min-max scale each embedding on its own range, then
    # sum the absolute element-wise change, per unit of L
    x_before_normal = (embedding_before - np.amin(embedding_before))/float(np.ptp(embedding_before))
    x_after_normal = (embedding_after - np.amin(embedding_after))/float(np.ptp(embedding_after))
    x_normal = x_after_normal - x_before_normal
    differentiation_list.append(np.sum(np.absolute(x_normal))/float(L))

# dataset-level averages of the per-sample metrics
T = sum(T_list)/float(len(data))
abundance = sum(abundance_list)/float(len(data))
differentiation = sum(differentiation_list)/float(len(data))

# generate dataframe
new_row = pd.DataFrame([{'coding strategy': coding_strategy, 'abundance': abundance,
                         'differentiation': differentiation, 'time cost': T,
                         'can be pooled?': can_be_pooled,
                         'positional interpretability': positional_interpretability,
                         'functional interpretability': functional_interpretability}])
# Drop any row a previous run of this cell left behind so re-running does not duplicate the strategy.
previous_rows = benchmark_df[benchmark_df['coding strategy'] != coding_strategy]
# pandas warns when an empty frame takes part in a concatenation, so leave the
# table out while it has no rows; pd.concat replaces the private DataFrame._append.
frames = [new_row] if previous_rows.empty else [previous_rows, new_row]
benchmark_df = pd.concat(frames, ignore_index=True)
print(benchmark_df)

  coding strategy   abundance  differentiation     time cost can be pooled?  \
0         One-Hot    4.000000         0.003992  7.958231e-07             No   
1        DNABert2  469.537725         9.314390  1.075537e-03            Yes   
2             GPN  512.000000         1.308631  5.871583e-03            Yes   

  positional interpretability functional interpretability  
0                         Yes                          No  
1                          No                         Yes  
2                         Yes                         Yes  


#### coding 4: Hyena

In [5]:
# Benchmark the hyena coding strategy: average its per-sample abundance,
# differentiation and time cost over the dataset, then store one row in benchmark_df.
coding_strategy = 'hyena'
can_be_pooled = 'Yes'
positional_interpretability = 'Yes'
functional_interpretability = 'Yes'

# calculating metrics
L = 501  # divisor shared by all three metrics; presumably the input sequence length -- TODO confirm
# NOTE(review): unpickling can execute arbitrary code, so only load trusted dataset files.
data = pd.read_pickle(data_path + 'hyena.dataset')

T_list = []
abundance_list = []
differentiation_list = []
# Iterate the columns positionally so the loop does not depend on the dataset
# having a 0..n-1 index (label lookups such as data[col][i] would).
# Note this dataset names its embedding columns 'hyena_before' / 'hyena_after'.
for raw_before, raw_after, time_before, time_after in zip(
        data['hyena_before'], data['hyena_after'],
        data['time_before'], data['time_after']):
    embedding_before = np.array(raw_before)
    embedding_after = np.array(raw_after)

    # abundance: number of values in the "before" embedding, per unit of L
    dim_sum = embedding_before.size
    T_list.append((time_before + time_after)/float(L))
    abundance_list.append(dim_sum/float(L))

    # differentiation: min-max scale each embedding on its own range, then
    # sum the absolute element-wise change, per unit of L
    x_before_normal = (embedding_before - np.amin(embedding_before))/float(np.ptp(embedding_before))
    x_after_normal = (embedding_after - np.amin(embedding_after))/float(np.ptp(embedding_after))
    x_normal = x_after_normal - x_before_normal
    differentiation_list.append(np.sum(np.absolute(x_normal))/float(L))

# dataset-level averages of the per-sample metrics
T = sum(T_list)/float(len(data))
abundance = sum(abundance_list)/float(len(data))
differentiation = sum(differentiation_list)/float(len(data))

# generate dataframe
new_row = pd.DataFrame([{'coding strategy': coding_strategy, 'abundance': abundance,
                         'differentiation': differentiation, 'time cost': T,
                         'can be pooled?': can_be_pooled,
                         'positional interpretability': positional_interpretability,
                         'functional interpretability': functional_interpretability}])
# Drop any row a previous run of this cell left behind so re-running does not duplicate the strategy.
previous_rows = benchmark_df[benchmark_df['coding strategy'] != coding_strategy]
# pandas warns when an empty frame takes part in a concatenation, so leave the
# table out while it has no rows; pd.concat replaces the private DataFrame._append.
frames = [new_row] if previous_rows.empty else [previous_rows, new_row]
benchmark_df = pd.concat(frames, ignore_index=True)
print(benchmark_df)

  coding strategy   abundance  differentiation     time cost can be pooled?  \
0         One-Hot    4.000000         0.003992  7.958231e-07             No   
1        DNABert2  469.537725         9.314390  1.075537e-03            Yes   
2             GPN  512.000000         1.308631  5.871583e-03            Yes   
3           hyena  128.510978         0.523388  6.818338e-05            Yes   

  positional interpretability functional interpretability  
0                         Yes                          No  
1                          No                         Yes  
2                         Yes                         Yes  
3                         Yes                         Yes  


#### coding 5: NT

In [6]:
# Benchmark the NT coding strategy: average its per-sample abundance,
# differentiation and time cost over the dataset, then store one row in benchmark_df.
coding_strategy = 'NT'
can_be_pooled = 'Yes'
positional_interpretability = 'Yes'
functional_interpretability = 'Yes'

# calculating metrics
L = 501  # divisor shared by all three metrics; presumably the input sequence length -- TODO confirm
# NOTE(review): unpickling can execute arbitrary code, so only load trusted dataset files.
data = pd.read_pickle(data_path + 'NT.dataset')

T_list = []
abundance_list = []
differentiation_list = []
# Iterate the columns positionally so the loop does not depend on the dataset
# having a 0..n-1 index (label lookups such as data[col][i] would).
# Note this dataset names its embedding columns 'NT_before' / 'NT_after'.
for raw_before, raw_after, time_before, time_after in zip(
        data['NT_before'], data['NT_after'],
        data['time_before'], data['time_after']):
    embedding_before = np.array(raw_before)
    embedding_after = np.array(raw_after)

    # abundance: number of values in the "before" embedding, per unit of L
    dim_sum = embedding_before.size
    T_list.append((time_before + time_after)/float(L))
    abundance_list.append(dim_sum/float(L))

    # differentiation: min-max scale each embedding on its own range, then
    # sum the absolute element-wise change, per unit of L
    x_before_normal = (embedding_before - np.amin(embedding_before))/float(np.ptp(embedding_before))
    x_after_normal = (embedding_after - np.amin(embedding_after))/float(np.ptp(embedding_after))
    x_normal = x_after_normal - x_before_normal
    differentiation_list.append(np.sum(np.absolute(x_normal))/float(L))

# dataset-level averages of the per-sample metrics
T = sum(T_list)/float(len(data))
abundance = sum(abundance_list)/float(len(data))
differentiation = sum(differentiation_list)/float(len(data))

# generate dataframe
new_row = pd.DataFrame([{'coding strategy': coding_strategy, 'abundance': abundance,
                         'differentiation': differentiation, 'time cost': T,
                         'can be pooled?': can_be_pooled,
                         'positional interpretability': positional_interpretability,
                         'functional interpretability': functional_interpretability}])
# Drop any row a previous run of this cell left behind so re-running does not duplicate the strategy.
previous_rows = benchmark_df[benchmark_df['coding strategy'] != coding_strategy]
# pandas warns when an empty frame takes part in a concatenation, so leave the
# table out while it has no rows; pd.concat replaces the private DataFrame._append.
frames = [new_row] if previous_rows.empty else [previous_rows, new_row]
benchmark_df = pd.concat(frames, ignore_index=True)
print(benchmark_df)

  coding strategy    abundance  differentiation     time cost can be pooled?  \
0         One-Hot     4.000000         0.003992  7.958231e-07             No   
1        DNABert2   469.537725         9.314390  1.075537e-03            Yes   
2             GPN   512.000000         1.308631  5.871583e-03            Yes   
3           hyena   128.510978         0.523388  6.818338e-05            Yes   
4              NT  1277.445110         4.602806  1.372732e-02            Yes   

  positional interpretability functional interpretability  
0                         Yes                          No  
1                          No                         Yes  
2                         Yes                         Yes  
3                         Yes                         Yes  
4                         Yes                         Yes  


#### coding 6: Enformer

In [7]:
# Benchmark the enformer coding strategy: average its per-sample abundance,
# differentiation and time cost over the dataset, then store one row in benchmark_df.
coding_strategy = 'enformer'
can_be_pooled = 'Yes'
positional_interpretability = 'No'
functional_interpretability = 'Yes'

# calculating metrics
L = 501  # divisor shared by all three metrics; presumably the input sequence length -- TODO confirm
# NOTE(review): unpickling can execute arbitrary code, so only load trusted dataset files.
data = pd.read_pickle(data_path + 'enformer.dataset')

T_list = []
abundance_list = []
differentiation_list = []
# Iterate the columns positionally so the loop does not depend on the dataset
# having a 0..n-1 index (label lookups such as data[col][i] would).
for raw_before, raw_after, time_before, time_after in zip(
        data['embedding_before'], data['embedding_after'],
        data['time_before'], data['time_after']):
    embedding_before = np.array(raw_before)
    embedding_after = np.array(raw_after)

    # abundance: number of values in the "before" embedding, per unit of L
    dim_sum = embedding_before.size
    T_list.append((time_before + time_after)/float(L))
    abundance_list.append(dim_sum/float(L))

    # differentiation: min-max scale each embedding on its own range, then
    # sum the absolute element-wise change, per unit of L
    x_before_normal = (embedding_before - np.amin(embedding_before))/float(np.ptp(embedding_before))
    x_after_normal = (embedding_after - np.amin(embedding_after))/float(np.ptp(embedding_after))
    x_normal = x_after_normal - x_before_normal
    differentiation_list.append(np.sum(np.absolute(x_normal))/float(L))

# dataset-level averages of the per-sample metrics
T = sum(T_list)/float(len(data))
abundance = sum(abundance_list)/float(len(data))
differentiation = sum(differentiation_list)/float(len(data))

# generate dataframe
new_row = pd.DataFrame([{'coding strategy': coding_strategy, 'abundance': abundance,
                         'differentiation': differentiation, 'time cost': T,
                         'can be pooled?': can_be_pooled,
                         'positional interpretability': positional_interpretability,
                         'functional interpretability': functional_interpretability}])
# Drop any row a previous run of this cell left behind so re-running does not duplicate the strategy.
previous_rows = benchmark_df[benchmark_df['coding strategy'] != coding_strategy]
# pandas warns when an empty frame takes part in a concatenation, so leave the
# table out while it has no rows; pd.concat replaces the private DataFrame._append.
frames = [new_row] if previous_rows.empty else [previous_rows, new_row]
benchmark_df = pd.concat(frames, ignore_index=True)
print(benchmark_df)

  coding strategy    abundance  differentiation     time cost can be pooled?  \
0         One-Hot     4.000000         0.003992  7.958231e-07             No   
1        DNABert2   469.537725         9.314390  1.075537e-03            Yes   
2             GPN   512.000000         1.308631  5.871583e-03            Yes   
3           hyena   128.510978         0.523388  6.818338e-05            Yes   
4              NT  1277.445110         4.602806  1.372732e-02            Yes   
5        enformer  9501.892216         0.024141  8.842315e-02            Yes   

  positional interpretability functional interpretability  
0                         Yes                          No  
1                          No                         Yes  
2                         Yes                         Yes  
3                         Yes                         Yes  
4                         Yes                         Yes  
5                          No                         Yes  
