In [5]:
import os

# Restrict the process to one physical GPU. This is set before TensorFlow is
# imported so the restriction is guaranteed to be in place when CUDA is
# initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = '7'

import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Grow the GPU memory allocation on demand instead of reserving it upfront.
    tf.config.experimental.set_memory_growth(gpus[0], True)
else:
    # Previously this case crashed with an IndexError on gpus[0].
    print("No visible GPU found (CUDA_VISIBLE_DEVICES={}); running on CPU.".format(
        os.environ["CUDA_VISIBLE_DEVICES"]))

import numpy as np
from MAESeqModule.MAESeq_utils import dataloader, get_dict, seq_data_to_onehot

# --- Configuration -----------------------------------------------------------
# DICT_EXIST selects between loading the cached char<->int dictionaries from
# disk (True) and rebuilding them from the loaded sequences and saving them
# (False).
DICT_EXIST = True
DICT_PATH_CHAR2INT = 'dict/char2int.npy'
DICT_PATH_INT2CHAR = 'dict/int2char.npy'
DATA_PATH = 'dataset/scop_fa_represeq_lib_latest.fa'

# --- Load sequences ----------------------------------------------------------
# NOTE: the keyword spelling `max_len_percintile` follows the project's
# dataloader signature and must not be "corrected" here.
seq_list, max_len = dataloader(file=DATA_PATH, len_data=10000, max_len_percintile=75)
# Override the length returned by dataloader with one fixed value so the
# sequence length is unified here.
max_len = 300

# --- Character dictionaries --------------------------------------------------
if not DICT_EXIST:
    dict_char2int, dict_int2char = get_dict(seq_list)
    np.save(DICT_PATH_CHAR2INT, dict_char2int)
    np.save(DICT_PATH_INT2CHAR, dict_int2char)
else:
    # The dicts were written with np.save, i.e. as pickled object arrays, so
    # allow_pickle=True is needed and .item() unwraps the stored dict.
    # NOTE(review): unpickling executes arbitrary code - only load trusted files.
    dict_char2int, dict_int2char = (
        np.load(dict_path, allow_pickle=True).item()
        for dict_path in (DICT_PATH_CHAR2INT, DICT_PATH_INT2CHAR)
    )

# --- One-hot encoding --------------------------------------------------------
onehot_data = seq_data_to_onehot(seq_list, dict_char2int, max_len)
dimension = len(dict_char2int)  # number of entries in the char -> int dictionary

print("Length of seq is {}".format(max_len))
print("Dimension is {}".format(dimension))


Length of seq is 300
Dimension is 25


In [6]:
import numpy as np

# Sequential (unshuffled) split along the first axis: the cut points sit at
# 80% and 95% of the data, giving train / validation / test parts in order.
onehot_train, onehot_val, onehot_test = np.split(
    onehot_data,
    [int(len(onehot_data) * fraction) for fraction in (0.8, 0.95)],
)
print(onehot_train.shape, onehot_val.shape, onehot_test.shape, sep="\n")



(8000, 300, 25)
(1500, 300, 25)
(500, 300, 25)


In [7]:
from MAESeqModule.MAESeq_utils import mask_onehot_matrix

# Sanity check: mask the training data with a mask rate of 0, then compare the
# total of one sample before and after masking (the recorded output shows the
# two totals are equal).
onehot_train_mask = mask_onehot_matrix(onehot_train, 0)

i = 40  # index of the sample to inspect
print(np.sum(onehot_train[i]), np.sum(onehot_train_mask[i]), sep="\n")

117.0
117.0


In [8]:
import os

from MAESeqModule.MAESeq_model import AutoencoderGRU, ReconstructRateVaried,my_loss_entropy
import keras
import tensorflow as tf

# Location of the saved model. The default is the original hard-coded path;
# set the MAESEQ_MODEL_DIR environment variable to load the model from another
# location without editing the notebook.
MODEL_DIR = os.environ.get(
    'MAESEQ_MODEL_DIR', '/geniusland/home/qufuchuan/trained_model/')

# The project-specific loss and metric are passed as custom_objects so Keras
# can resolve them by name while deserialising the model.
autoencoder = keras.models.load_model(MODEL_DIR, custom_objects={
    'my_loss_entropy':my_loss_entropy,
    'ReconstructRateVaried':ReconstructRateVaried})


2023-03-01 17:36:38.831923: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-01 17:36:41.380621: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 51342 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:c2:00.0, compute capability: 8.0


In [11]:
# from MAESeqModule.MAESeq_utils import mask_onehot_matrix, evaluate_per_mask_rate
import numpy as np
import pandas as pd
def mask_onehot_matrix(onehot_data, mask_rate = 0.2):
    """Zero out one random contiguous window in every one-hot encoded sequence.

    This local definition shadows the ``mask_onehot_matrix`` imported from
    ``MAESeqModule.MAESeq_utils`` earlier in the notebook.

    For each sample, the number of set entries (``val_len``) is counted and a
    window of ``int(val_len * mask_rate)`` consecutive positions, starting at
    a random offset inside ``[0, val_len)``, is set to zero.

    NOTE(review): the window is placed relative to position 0, so this assumes
    the non-zero rows of each sample form a prefix (padding at the end) -
    confirm against ``seq_data_to_onehot``.

    Args:
        onehot_data: 3-D array indexed as (sample, position, symbol). It is
            not modified.
        mask_rate: Fraction of each sample's valid positions to mask, in
            ``[0, 1]``.

    Returns:
        A masked copy of ``onehot_data`` with the same shape and dtype.

    Raises:
        ValueError: If ``mask_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= mask_rate <= 1:
        raise ValueError(
            "mask_rate must be within [0, 1], got {!r}".format(mask_rate))

    res = onehot_data.copy()

    # Per-sample count of set entries, i.e. the unpadded sequence length.
    val_len = np.sum(onehot_data, axis=(1, 2))
    mask_len = (val_len * mask_rate).astype(np.int32)
    # Largest start offset that still keeps the window inside the sequence.
    right_limit = val_len.astype(np.int64) - mask_len

    # randint's upper bound is exclusive, so +1 makes `right_limit` itself a
    # possible start. Without it the last valid position could never be masked
    # and a limit of 0 (mask_rate == 1 or an empty sequence) raised ValueError.
    start_point = np.random.randint(right_limit + 1)
    end_point = start_point + mask_len

    for row, (start, stop) in enumerate(zip(start_point, end_point)):
        res[row, start:stop, :] = 0.
    return res

def evaluate_per_mask_rate(onehot_test, autoencoder, mask_rates=None):
    """Score the autoencoder's reconstruction over a sweep of mask rates.

    For every rate the test data is masked with ``mask_onehot_matrix``, passed
    through ``autoencoder.predict`` and scored with
    ``ReconstructRateVaried(onehot_test, prediction)``.

    Args:
        onehot_test: One-hot encoded test data; used both as the source for
            masking and as the reference passed to ``ReconstructRateVaried``.
        autoencoder: Model exposing ``predict``.
        mask_rates: Iterable of mask rates to evaluate. Defaults to
            0.00, 0.05, ..., 0.95 (the original hard-coded grid).

    Returns:
        pandas.Series of float64 scores indexed by ``'Mask <rate>'`` labels
        (rate formatted with two decimals), in evaluation order.
    """
    if mask_rates is None:
        mask_rates = np.linspace(0,1,21)[:-1]

    # Collect into a dict and build the Series once, instead of growing a
    # Series one label at a time.
    scores = {}
    for rate in mask_rates:
        print('start evaluate mask rate = %.2f'%rate)
        onehot_test_mask = mask_onehot_matrix(onehot_test, rate)
        test_res = autoencoder.predict(onehot_test_mask)
        reconst_rate = ReconstructRateVaried(onehot_test, test_res)
        scores['Mask %.2f' % rate] = float(reconst_rate)
    return pd.Series(scores, dtype='float64')

# Run the mask-rate sweep on the held-out test split with the loaded model and
# display the resulting Series as the cell output.
res = evaluate_per_mask_rate(onehot_test=onehot_test, autoencoder=autoencoder)
res



start evaluate mask rate = 0.00
start evaluate mask rate = 0.05
start evaluate mask rate = 0.10
start evaluate mask rate = 0.15
start evaluate mask rate = 0.20
start evaluate mask rate = 0.25
start evaluate mask rate = 0.30
start evaluate mask rate = 0.35
start evaluate mask rate = 0.40
start evaluate mask rate = 0.45
start evaluate mask rate = 0.50
start evaluate mask rate = 0.55
start evaluate mask rate = 0.60
start evaluate mask rate = 0.65
start evaluate mask rate = 0.70
start evaluate mask rate = 0.75
start evaluate mask rate = 0.80
start evaluate mask rate = 0.85
start evaluate mask rate = 0.90
start evaluate mask rate = 0.95


Mask 0.00    1.000000
Mask 0.05    0.941650
Mask 0.10    0.889630
Mask 0.15    0.842708
Mask 0.20    0.795932
Mask 0.25    0.750709
Mask 0.30    0.704728
Mask 0.35    0.659785
Mask 0.40    0.613877
Mask 0.45    0.568385
Mask 0.50    0.521059
Mask 0.55    0.477462
Mask 0.60    0.431603
Mask 0.65    0.385548
Mask 0.70    0.341840
Mask 0.75    0.295700
Mask 0.80    0.251112
Mask 0.85    0.205021
Mask 0.90    0.160189
Mask 0.95    0.115870
dtype: float64