### generate DRSP test set

In [2]:
# generate feature dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
maxasa_dict = {
'C':167, 'D':193, 'S':155, 'Q':225, 'K':236,
'I':197, 'P':159, 'T':172, 'F':240, 'N':195,
'G':104, 'H':224, 'L':201, 'R':274, 'W':285,
'A':129, 'V':174, 'E':223, 'Y':263, 'M':224
}

eyes = np.eye(3)
ss_dict = {
'H':eyes[0],'G':eyes[0],'I':eyes[0],
'B':eyes[1],'E':eyes[1],
'T':eyes[2]
}

rootpath = '../datasets/DRSP/'

In [3]:
# calculate rASA and padding
pdb_name_list = os.listdir(rootpath + 'AF2pdb/')
pdb_name_list = [x.split('.')[0] for x in pdb_name_list]

zero_list = []
for pdb in pdb_name_list:
    try:
        with open(rootpath + 'fasta/' + pdb +'.fasta') as file:
            fasta_file = file.readlines()
            fasta_len = len(fasta_file[1])
        if(pdb in zero_list): # large protein
            ss_matrix = np.zeros([fasta_len, 3], int)
            with open(rootpath + 'SS/' + pdb + '.ss','w+') as out_file:
                np.savetxt(out_file, ss_matrix, fmt='%.1f')
            rasa_matrix = np.zeros([fasta_len, 1], float)
            with open(rootpath + 'rASA/' + pdb + '.rasa','w+') as out_file:
                np.savetxt(out_file, rasa_matrix, fmt='%.3f')
        else:
            ss_matrix = np.zeros([fasta_len, 3], int)
            rasa_matrix = np.zeros([fasta_len, 1], float)
            with open(rootpath + 'dssp/' + pdb +'.dssp') as dssp_file:
                line = dssp_file.readline()
                while line:
                    if(line.split()[0] == '#'):
                        break
                    line = dssp_file.readline()
                line = dssp_file.readline()
                index = 0
                while line:
                    if(len(line.split()) > 0):
                        SS = line[16]
                        ACC = int(line[35:38].strip())
                        AA = line[13]
                        # check ss
                        if(SS != ' '):
                            if(SS not in ss_dict.keys()):
                                ss_matrix[index][0:3] = eyes[2]
                            else:
                                ss_matrix[index][0:3] = ss_dict[SS]
                        # check rasa
                        rasa_matrix[index][0] = float(ACC)/maxasa_dict[AA]
                        index += 1                    
                    line = dssp_file.readline()
            with open(rootpath + 'SS/' + pdb + '.ss','w+') as out_file:
                np.savetxt(out_file, ss_matrix, fmt='%.1f')
            with open(rootpath + 'rASA/' + pdb + '.rasa','w+') as out_file:
                np.savetxt(out_file, rasa_matrix, fmt='%.3f')
    except IndexError:
        print(pdb)
        continue

In [4]:
# mapping pubchem fingerprint
import warnings
warnings.filterwarnings('ignore')
import pubchempy as pcp
from tqdm import tqdm

new_drug_table = pd.DataFrame(columns=['drugname', 'smile', 'molecular_weight', 'molecular_formula', 'atom', 'fingerprint', 'cactvs_fingerprint'])
drug_name_list = ['vemurafenib', 'PD0325901', 'crizotinib']
for i in tqdm(range(3)):
    drugname = drug_name_list[i]
    compound = pcp.get_compounds(drugname,'name')[0]
    try:
        smile = compound.isomeric_smiles
    except AttributeError:
        smile = np.nan
    try:
        molecular_weight = compound.molecular_weight
    except AttributeError:
        molecular_weight = np.nan   
    try:
        molecular_formula = compound.molecular_formula
    except AttributeError:
        molecular_formula = np.nan
    try: 
        atom = compound.atoms
    except AttributeError:
        atom = np.nan
    try:
        fingerprint = compound.fingerprint
    except AttributeError:
        fingerprint = np.nan
    try:
        cactvs_fingerprint = compound.cactvs_fingerprint
    except AttributeError:
        cactvs_fingerprint = np.nan
    new_drug_table = new_drug_table.append([{'drugname':drugname, 'smile':smile, 'molecular_weight':molecular_weight, 'molecular_formula':molecular_formula, 
                                    'atom':atom, 'fingerprint':fingerprint, 'cactvs_fingerprint':cactvs_fingerprint}], ignore_index=True)
print(new_drug_table)
new_drug_table.to_csv(rootpath + 'drsp_drug_table_fpfixed.csv', index=None)

100%|██████████| 3/3 [00:04<00:00,  1.34s/it]

      drugname                                              smile  \
0  vemurafenib  CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CNC3=C2C...   
1    PD0325901  C1=CC(=C(C=C1I)F)NC2=C(C=CC(=C2F)F)C(=O)NOC[C@...   
2   crizotinib  C[C@H](C1=C(C=CC(=C1Cl)F)Cl)OC2=C(N=CC(=C2)C3=...   

  molecular_weight molecular_formula  \
0            489.9   C23H18ClF2N3O3S   
1           482.19     C16H14F3IN2O4   
2            450.3     C21H22Cl2FN5O   

                                                atom  \
0  [Atom(1, Cl), Atom(2, S), Atom(3, F), Atom(4, ...   
1  [Atom(1, I), Atom(2, F), Atom(3, F), Atom(4, F...   
2  [Atom(1, Cl), Atom(2, Cl), Atom(3, F), Atom(4,...   

                                         fingerprint  \
0  00000371E07B3180440000000000000000000000000160...   
1  00000371C07B3980000200000000000000000000000000...   
2  00000371E07BA100060000000000000000000000000160...   

                                  cactvs_fingerprint  
0  1110000001111011001100011000000001000100000000...  
1 




In [5]:
# smile dict from DeepDTA (source: https://github.com/hkmztrk/DeepDTA/blob/master/source/datahelper.py)
CHARISOSMISET = {"#": 29, "%": 30, ")": 31, "(": 1, "+": 32, "-": 33, "/": 34, ".": 2, 
				"1": 35, "0": 3, "3": 36, "2": 4, "5": 37, "4": 5, "7": 38, "6": 6, 
				"9": 39, "8": 7, "=": 40, "A": 41, "@": 8, "C": 42, "B": 9, "E": 43, 
				"D": 10, "G": 44, "F": 11, "I": 45, "H": 12, "K": 46, "M": 47, "L": 13, 
				"O": 48, "N": 14, "P": 15, "S": 49, "R": 16, "U": 50, "T": 17, "W": 51, 
				"V": 18, "Y": 52, "[": 53, "Z": 19, "]": 54, "\\": 20, "a": 55, "c": 56, 
				"b": 21, "e": 57, "d": 22, "g": 58, "f": 23, "i": 59, "h": 24, "m": 60, 
				"l": 25, "o": 61, "n": 26, "s": 62, "r": 27, "u": 63, "t": 28, "y": 64}

# encode smile
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
new_drug_table = pd.read_csv(rootpath + 'drsp_drug_table_fpfixed.csv')
new_drug_table['smile_array'] = 0
new_drug_table['smile_array'] = new_drug_table['smile_array'].astype(object)
for i in tqdm(range(new_drug_table.shape[0])):
    drugname = new_drug_table['drugname'][i]
    smile = new_drug_table['smile'][i]
    smile_num_list = []
    for strr in smile:
        smile_num_list.append(CHARISOSMISET[strr])
    #smile_num = np.array(smile_num_list)
    #new_drug_table['smile_array'][i] = new_drug_table['smile_array'][i].apply(lambda x: smile_num_list)
    new_drug_table.loc[:,'smile_array'].loc[i] = np.array(smile_num_list)
print(new_drug_table)
new_drug_table.to_pickle(rootpath + 'drsp_drug_table_fpfixed_smilenum.pkl')

100%|██████████| 3/3 [00:00<00:00, 56.77it/s]

      drugname                                              smile  \
0  vemurafenib  CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CNC3=C2C...   
1    PD0325901  C1=CC(=C(C=C1I)F)NC2=C(C=CC(=C2F)F)C(=O)NOC[C@...   
2   crizotinib  C[C@H](C1=C(C=CC(=C1Cl)F)Cl)OC2=C(N=CC(=C2)C3=...   

   molecular_weight molecular_formula  \
0            489.90   C23H18ClF2N3O3S   
1            482.19     C16H14F3IN2O4   
2            450.30     C21H22Cl2FN5O   

                                                atom  \
0  [Atom(1, Cl), Atom(2, S), Atom(3, F), Atom(4, ...   
1  [Atom(1, I), Atom(2, F), Atom(3, F), Atom(4, F...   
2  [Atom(1, Cl), Atom(2, Cl), Atom(3, F), Atom(4,...   

                                         fingerprint  \
0  00000371E07B3180440000000000000000000000000160...   
1  00000371C07B3980000200000000000000000000000000...   
2  00000371E07BA100060000000000000000000000000160...   

                                  cactvs_fingerprint  \
0  1110000001111011001100011000000001000100000000...




In [1]:
# label processing
label_list = [0, 0, 0]

In [4]:
# onehot coding dict
import numpy as np
eyes = np.eye(20)
protein_dict = {'C':eyes[0], 'D':eyes[1], 'S':eyes[2], 'Q':eyes[3], 'K':eyes[4],
    'I':eyes[5], 'P':eyes[6], 'T':eyes[7], 'F':eyes[8], 'N':eyes[9],
    'G':eyes[10], 'H':eyes[11], 'L':eyes[12], 'R':eyes[13], 'W':eyes[14],
    'A':eyes[15], 'V':eyes[16], 'E':eyes[17], 'Y':eyes[18], 'M':eyes[19]}
fingerprint_dict = {'1':1,'0':0}

In [5]:
# feature coding & dataset generate
import warnings
warnings.filterwarnings('ignore')

window_len = 30

drsp_evidence = pd.read_pickle(rootpath + 'drsp_drug_table_fpfixed_smilenum.pkl')
dataset_feature = pd.DataFrame(columns=['gene', 'uniprotac', 'variant', 'drug', 
                                        'smile', 'smile_num' ,'cactvs_fingerprint', 'molecular_weight', # drug features
                                        'onehot_before', 'onehot_after', 'hhm_before', 'hhm_after', 'ss', 'rasa', # sequence features
                                        'label', 'source'])

In [14]:
# evidence 1: 
gene = 'BRAF'
uniprotac = 'P15056'
variant = 'L505H'
drug = drsp_evidence['drugname'][0]
smile = drsp_evidence['smile'][0]
smile_num = drsp_evidence['smile_array'][0]
cactvs_fingerprint_str = drsp_evidence['cactvs_fingerprint'][0]
molecular_weight = drsp_evidence['molecular_weight'][0]
source = 'DRSP'
label = 0

smile_array = np.zeros(shape=(85,))
for j in range(min(85, smile_num.shape[0])):
    smile_array[j] = smile_num[j]
cactvs_fingerprint = []
for strr in cactvs_fingerprint_str:
    cactvs_fingerprint.append(fingerprint_dict[strr])
cactvs_fingerprint = np.array(cactvs_fingerprint)

# sequence features from file
pos = int(variant[1:-1])
pos_after = variant[-1]

# fasta before
with open(rootpath + 'fasta/' + uniprotac + '.fasta') as file:
    fasta_file = file.readlines()
    fasta_full = fasta_file[1]
    seq_len = len(fasta_full)
    onehot_before = []
    if(pos <= window_len - 1):
        for j in range(window_len-pos+1): # padding head
            onehot_before.append(np.zeros(20))
        for strr in fasta_full[0:pos+window_len]:
            onehot_before.append(protein_dict[strr])
    elif(seq_len - pos < window_len):
        for strr in fasta_full[pos-window_len-1:]:
            onehot_before.append(protein_dict[strr])
        for j in range(window_len-seq_len+pos): # padding end
            onehot_before.append(np.zeros(20))
    else:        
        for strr in fasta_full[pos-window_len-1:pos+window_len]:
            onehot_before.append(protein_dict[strr])
onehot_before = np.array(onehot_before)

# fasta after
with open(rootpath + 'fasta/' + uniprotac + '_' + variant + '.fasta') as file:
    fasta_file = file.readlines()
    fasta_full = fasta_file[1]
    #fasta_after = fasta_full[pos-window_len-1:pos+window_len]
    onehot_after = []
    if(pos <= window_len - 1):
        for j in range(window_len-pos+1): # padding head
            onehot_after.append(np.zeros(20))
        for strr in fasta_full[0:pos+window_len]:
            onehot_after.append(protein_dict[strr])
    elif(seq_len - pos < window_len):
        for strr in fasta_full[pos-window_len-1:]:
            onehot_after.append(protein_dict[strr])
        for j in range(window_len-seq_len+pos): # padding end
            onehot_after.append(np.zeros(20))
    else:        
        for strr in fasta_full[pos-window_len-1:pos+window_len]:
            onehot_after.append(protein_dict[strr])
fasta_len = len(fasta_full)
onehot_after = np.array(onehot_after)

# hhm before
with open(rootpath + 'hhm/' + uniprotac + '.hhm') as hhm_file:     
    hhm_matrix = np.zeros([fasta_len, 30], float)
    hhm_line = hhm_file.readline()
    idxx = 0
    while(hhm_line[0] != '#'):
        hhm_line = hhm_file.readline()
    for i in range(0,5):
        hhm_line = hhm_file.readline()
    while hhm_line:
        if(len(hhm_line.split()) == 23):
            idxx += 1
            if(idxx == fasta_len + 1):
                break
            each_item = hhm_line.split()[2:22]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                            
            for j in range(0, 20):
                hhm_matrix[idxx - 1, j] = int(each_item[j])
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j])/2000))                                              
        elif(len(hhm_line.split()) == 10):
            each_item = hhm_line.split()[0:10]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                             
            for j in range(20, 30):
                hhm_matrix[idxx - 1, j] = int(each_item[j - 20]) 
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j - 20])/2000))                                                               
        hhm_line = hhm_file.readline()
    if(pos <= window_len - 1):
        padding = np.zeros(shape=[window_len-pos+1,30])
        hhm_before_array = np.vstack((padding,hhm_matrix[0:pos+window_len, :])) 
    elif(seq_len - pos < window_len):
        padding = np.zeros(shape=[window_len-seq_len+pos,30])
        hhm_before_array = np.vstack((hhm_matrix[pos-window_len-1:, :],padding))
    else:
        hhm_before_array = hhm_matrix[pos-window_len-1:pos+window_len, :]
    #hhm_before = hhm_before_array.tolist()

# hhm after
with open(rootpath + 'hhm/' + uniprotac + '_' + variant + '.hhm') as hhm_file:     
    hhm_matrix = np.zeros([fasta_len, 30], float)
    hhm_line = hhm_file.readline()
    idxx = 0
    while(hhm_line[0] != '#'):
        hhm_line = hhm_file.readline()
    for i in range(0,5):
        hhm_line = hhm_file.readline()
    while hhm_line:
        if(len(hhm_line.split()) == 23):
            idxx += 1
            if(idxx == fasta_len + 1):
                break
            each_item = hhm_line.split()[2:22]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                            
            for j in range(0, 20):
                hhm_matrix[idxx - 1, j] = int(each_item[j])
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j])/2000))                                              
        elif(len(hhm_line.split()) == 10):
            each_item = hhm_line.split()[0:10]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                             
            for j in range(20, 30):
                hhm_matrix[idxx - 1, j] = int(each_item[j - 20]) 
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j - 20])/2000))                                                                        
        hhm_line = hhm_file.readline()
    if(pos <= window_len - 1):
        padding = np.zeros(shape=[window_len-pos+1,30])
        hhm_after_array = np.vstack((padding,hhm_matrix[0:pos+window_len, :])) 
    elif(seq_len - pos < window_len):
        padding = np.zeros(shape=[window_len-seq_len+pos,30])
        hhm_after_array = np.vstack((hhm_matrix[pos-window_len-1:, :],padding))
    else:
        hhm_after_array = hhm_matrix[pos-window_len-1:pos+window_len, :]
    #hhm_after = hhm_after_array.tolist()

# rasa
rasa_array = np.loadtxt(rootpath + 'rASA/' + uniprotac + '.rasa')
if(pos <= window_len - 1):
    padding = np.zeros(window_len-pos+1)
    rasa_array = np.append(padding,rasa_array[0:pos+window_len])
elif(seq_len - pos < window_len):
    padding = np.zeros(window_len-seq_len+pos)
    rasa_array = np.append(rasa_array[pos-window_len-1:],padding)
else:
    rasa_array = rasa_array[pos-window_len-1:pos+window_len]
#rasa = rasa_array.tolist()
# ss
ss_array = np.loadtxt(rootpath + 'SS/' + uniprotac + '.ss')
if(pos <= window_len - 1):
    padding = np.zeros(shape=[window_len-pos+1,3])
    ss_array = np.vstack((padding,ss_array[0:pos+window_len]))
elif(seq_len - pos < window_len):
    padding = np.zeros(shape=[window_len-seq_len+pos,3])
    ss_array = np.vstack((ss_array[pos-window_len-1:],padding))
else:
    ss_array = ss_array[pos-window_len-1:pos+window_len, :]
#ss = ss_array.tolist()

dataset_feature = dataset_feature.append([{'gene': gene, 'uniprotac': uniprotac, 'variant': variant, 'drug': drug, 
                                        'smile': smile, 'smile_num':smile_array, 'cactvs_fingerprint': cactvs_fingerprint, 'molecular_weight': molecular_weight, 
                                        'onehot_before':onehot_before, 'onehot_after':onehot_after, 
                                        'hhm_before': hhm_before_array, 'hhm_after': hhm_after_array, 'ss': ss_array, 'rasa': rasa_array, 
                                        'label': label, 'source':source}], ignore_index=True)

In [15]:
# evidence 2: 
gene = 'MAP2K2'
uniprotac = 'P36507'
variant = 'V215E'
drug = drsp_evidence['drugname'][1]
smile = drsp_evidence['smile'][1]
smile_num = drsp_evidence['smile_array'][1]
cactvs_fingerprint = drsp_evidence['cactvs_fingerprint'][1]
molecular_weight = drsp_evidence['molecular_weight'][1]
source = 'DRSP'
label = 0

smile_array = np.zeros(shape=(85,))
for j in range(min(85, smile_num.shape[0])):
    smile_array[j] = smile_num[j]
cactvs_fingerprint = []
for strr in cactvs_fingerprint_str:
    cactvs_fingerprint.append(fingerprint_dict[strr])
cactvs_fingerprint = np.array(cactvs_fingerprint)

# sequence features from file
pos = int(variant[1:-1])
pos_after = variant[-1]

# fasta before
with open(rootpath + 'fasta/' + uniprotac + '.fasta') as file:
    fasta_file = file.readlines()
    fasta_full = fasta_file[1]
    seq_len = len(fasta_full)
    onehot_before = []
    if(pos <= window_len - 1):
        for j in range(window_len-pos+1): # padding head
            onehot_before.append(np.zeros(20))
        for strr in fasta_full[0:pos+window_len]:
            onehot_before.append(protein_dict[strr])
    elif(seq_len - pos < window_len):
        for strr in fasta_full[pos-window_len-1:]:
            onehot_before.append(protein_dict[strr])
        for j in range(window_len-seq_len+pos): # padding end
            onehot_before.append(np.zeros(20))
    else:        
        for strr in fasta_full[pos-window_len-1:pos+window_len]:
            onehot_before.append(protein_dict[strr])
onehot_before = np.array(onehot_before)

# fasta after
with open(rootpath + 'fasta/' + uniprotac + '_' + variant + '.fasta') as file:
    fasta_file = file.readlines()
    fasta_full = fasta_file[1]
    #fasta_after = fasta_full[pos-window_len-1:pos+window_len]
    onehot_after = []
    if(pos <= window_len - 1):
        for j in range(window_len-pos+1): # padding head
            onehot_after.append(np.zeros(20))
        for strr in fasta_full[0:pos+window_len]:
            onehot_after.append(protein_dict[strr])
    elif(seq_len - pos < window_len):
        for strr in fasta_full[pos-window_len-1:]:
            onehot_after.append(protein_dict[strr])
        for j in range(window_len-seq_len+pos): # padding end
            onehot_after.append(np.zeros(20))
    else:        
        for strr in fasta_full[pos-window_len-1:pos+window_len]:
            onehot_after.append(protein_dict[strr])
fasta_len = len(fasta_full)
onehot_after = np.array(onehot_after)

# hhm before
with open(rootpath + 'hhm/' + uniprotac + '.hhm') as hhm_file:     
    hhm_matrix = np.zeros([fasta_len, 30], float)
    hhm_line = hhm_file.readline()
    idxx = 0
    while(hhm_line[0] != '#'):
        hhm_line = hhm_file.readline()
    for i in range(0,5):
        hhm_line = hhm_file.readline()
    while hhm_line:
        if(len(hhm_line.split()) == 23):
            idxx += 1
            if(idxx == fasta_len + 1):
                break
            each_item = hhm_line.split()[2:22]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                            
            for j in range(0, 20):
                hhm_matrix[idxx - 1, j] = int(each_item[j])
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j])/2000))                                              
        elif(len(hhm_line.split()) == 10):
            each_item = hhm_line.split()[0:10]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                             
            for j in range(20, 30):
                hhm_matrix[idxx - 1, j] = int(each_item[j - 20]) 
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j - 20])/2000))                                                               
        hhm_line = hhm_file.readline()
    if(pos <= window_len - 1):
        padding = np.zeros(shape=[window_len-pos+1,30])
        hhm_before_array = np.vstack((padding,hhm_matrix[0:pos+window_len, :])) 
    elif(seq_len - pos < window_len):
        padding = np.zeros(shape=[window_len-seq_len+pos,30])
        hhm_before_array = np.vstack((hhm_matrix[pos-window_len-1:, :],padding))
    else:
        hhm_before_array = hhm_matrix[pos-window_len-1:pos+window_len, :]
    #hhm_before = hhm_before_array.tolist()

# hhm after
with open(rootpath + 'hhm/' + uniprotac + '_' + variant + '.hhm') as hhm_file:     
    hhm_matrix = np.zeros([fasta_len, 30], float)
    hhm_line = hhm_file.readline()
    idxx = 0
    while(hhm_line[0] != '#'):
        hhm_line = hhm_file.readline()
    for i in range(0,5):
        hhm_line = hhm_file.readline()
    while hhm_line:
        if(len(hhm_line.split()) == 23):
            idxx += 1
            if(idxx == fasta_len + 1):
                break
            each_item = hhm_line.split()[2:22]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                            
            for j in range(0, 20):
                hhm_matrix[idxx - 1, j] = int(each_item[j])
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j])/2000))                                              
        elif(len(hhm_line.split()) == 10):
            each_item = hhm_line.split()[0:10]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                             
            for j in range(20, 30):
                hhm_matrix[idxx - 1, j] = int(each_item[j - 20]) 
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j - 20])/2000))                                                                        
        hhm_line = hhm_file.readline()
    if(pos <= window_len - 1):
        padding = np.zeros(shape=[window_len-pos+1,30])
        hhm_after_array = np.vstack((padding,hhm_matrix[0:pos+window_len, :])) 
    elif(seq_len - pos < window_len):
        padding = np.zeros(shape=[window_len-seq_len+pos,30])
        hhm_after_array = np.vstack((hhm_matrix[pos-window_len-1:, :],padding))
    else:
        hhm_after_array = hhm_matrix[pos-window_len-1:pos+window_len, :]
    #hhm_after = hhm_after_array.tolist()

# rasa
rasa_array = np.loadtxt(rootpath + 'rASA/' + uniprotac + '.rasa')
if(pos <= window_len - 1):
    padding = np.zeros(window_len-pos+1)
    rasa_array = np.append(padding,rasa_array[0:pos+window_len])
elif(seq_len - pos < window_len):
    padding = np.zeros(window_len-seq_len+pos)
    rasa_array = np.append(rasa_array[pos-window_len-1:],padding)
else:
    rasa_array = rasa_array[pos-window_len-1:pos+window_len]
#rasa = rasa_array.tolist()
# ss
ss_array = np.loadtxt(rootpath + 'SS/' + uniprotac + '.ss')
if(pos <= window_len - 1):
    padding = np.zeros(shape=[window_len-pos+1,3])
    ss_array = np.vstack((padding,ss_array[0:pos+window_len]))
elif(seq_len - pos < window_len):
    padding = np.zeros(shape=[window_len-seq_len+pos,3])
    ss_array = np.vstack((ss_array[pos-window_len-1:],padding))
else:
    ss_array = ss_array[pos-window_len-1:pos+window_len, :]
#ss = ss_array.tolist()

dataset_feature = dataset_feature.append([{'gene': gene, 'uniprotac': uniprotac, 'variant': variant, 'drug': drug, 
                                        'smile': smile, 'smile_num':smile_array, 'cactvs_fingerprint': cactvs_fingerprint, 'molecular_weight': molecular_weight, 
                                        'onehot_before':onehot_before, 'onehot_after':onehot_after, 
                                        'hhm_before': hhm_before_array, 'hhm_after': hhm_after_array, 'ss': ss_array, 'rasa': rasa_array, 
                                        'label': label, 'source':source}], ignore_index=True)


In [17]:
# evidence 3: 
gene = 'ROS1'
uniprotac = 'P08922'
variant = 'G2032R'
drug = drsp_evidence['drugname'][2]
smile = drsp_evidence['smile'][2]
smile_num = drsp_evidence['smile_array'][2]
cactvs_fingerprint = drsp_evidence['cactvs_fingerprint'][2]
molecular_weight = drsp_evidence['molecular_weight'][2]
source = 'DRSP'
label = 0

smile_array = np.zeros(shape=(85,))
for j in range(min(85, smile_num.shape[0])):
    smile_array[j] = smile_num[j]
cactvs_fingerprint = []
for strr in cactvs_fingerprint_str:
    cactvs_fingerprint.append(fingerprint_dict[strr])
cactvs_fingerprint = np.array(cactvs_fingerprint)

# sequence features from file
pos = int(variant[1:-1])
pos_after = variant[-1]

# fasta before
with open(rootpath + 'fasta/' + uniprotac + '.fasta') as file:
    fasta_file = file.readlines()
    fasta_full = fasta_file[1]
    seq_len = len(fasta_full)
    onehot_before = []
    if(pos <= window_len - 1):
        for j in range(window_len-pos+1): # padding head
            onehot_before.append(np.zeros(20))
        for strr in fasta_full[0:pos+window_len]:
            onehot_before.append(protein_dict[strr])
    elif(seq_len - pos < window_len):
        for strr in fasta_full[pos-window_len-1:]:
            onehot_before.append(protein_dict[strr])
        for j in range(window_len-seq_len+pos): # padding end
            onehot_before.append(np.zeros(20))
    else:        
        for strr in fasta_full[pos-window_len-1:pos+window_len]:
            onehot_before.append(protein_dict[strr])
onehot_before = np.array(onehot_before)

# fasta after
with open(rootpath + 'fasta/' + uniprotac + '_' + variant + '.fasta') as file:
    fasta_file = file.readlines()
    fasta_full = fasta_file[1]
    #fasta_after = fasta_full[pos-window_len-1:pos+window_len]
    onehot_after = []
    if(pos <= window_len - 1):
        for j in range(window_len-pos+1): # padding head
            onehot_after.append(np.zeros(20))
        for strr in fasta_full[0:pos+window_len]:
            onehot_after.append(protein_dict[strr])
    elif(seq_len - pos < window_len):
        for strr in fasta_full[pos-window_len-1:]:
            onehot_after.append(protein_dict[strr])
        for j in range(window_len-seq_len+pos): # padding end
            onehot_after.append(np.zeros(20))
    else:        
        for strr in fasta_full[pos-window_len-1:pos+window_len]:
            onehot_after.append(protein_dict[strr])
fasta_len = len(fasta_full)
onehot_after = np.array(onehot_after)

# hhm before
with open(rootpath + 'hhm/' + uniprotac + '.hhm') as hhm_file:     
    hhm_matrix = np.zeros([fasta_len, 30], float)
    hhm_line = hhm_file.readline()
    idxx = 0
    while(hhm_line[0] != '#'):
        hhm_line = hhm_file.readline()
    for i in range(0,5):
        hhm_line = hhm_file.readline()
    while hhm_line:
        if(len(hhm_line.split()) == 23):
            idxx += 1
            if(idxx == fasta_len + 1):
                break
            each_item = hhm_line.split()[2:22]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                            
            for j in range(0, 20):
                hhm_matrix[idxx - 1, j] = int(each_item[j])
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j])/2000))                                              
        elif(len(hhm_line.split()) == 10):
            each_item = hhm_line.split()[0:10]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                             
            for j in range(20, 30):
                hhm_matrix[idxx - 1, j] = int(each_item[j - 20]) 
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j - 20])/2000))                                                               
        hhm_line = hhm_file.readline()
    if(pos <= window_len - 1):
        padding = np.zeros(shape=[window_len-pos+1,30])
        hhm_before_array = np.vstack((padding,hhm_matrix[0:pos+window_len, :])) 
    elif(seq_len - pos < window_len):
        padding = np.zeros(shape=[window_len-seq_len+pos,30])
        hhm_before_array = np.vstack((hhm_matrix[pos-window_len-1:, :],padding))
    else:
        hhm_before_array = hhm_matrix[pos-window_len-1:pos+window_len, :]
    #hhm_before = hhm_before_array.tolist()

# hhm after
with open(rootpath + 'hhm/' + uniprotac + '_' + variant + '.hhm') as hhm_file:     
    hhm_matrix = np.zeros([fasta_len, 30], float)
    hhm_line = hhm_file.readline()
    idxx = 0
    while(hhm_line[0] != '#'):
        hhm_line = hhm_file.readline()
    for i in range(0,5):
        hhm_line = hhm_file.readline()
    while hhm_line:
        if(len(hhm_line.split()) == 23):
            idxx += 1
            if(idxx == fasta_len + 1):
                break
            each_item = hhm_line.split()[2:22]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                            
            for j in range(0, 20):
                hhm_matrix[idxx - 1, j] = int(each_item[j])
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j])/2000))                                              
        elif(len(hhm_line.split()) == 10):
            each_item = hhm_line.split()[0:10]
            for idx, s in enumerate(each_item):
                if(s == '*'):
                    each_item[idx] = '99999'                             
            for j in range(20, 30):
                hhm_matrix[idxx - 1, j] = int(each_item[j - 20]) 
                #hhm_matrix[idxx - 1, j] = 10/(1 + math.exp(-1 * int(each_item[j - 20])/2000))                                                                        
        hhm_line = hhm_file.readline()
    if(pos <= window_len - 1):
        padding = np.zeros(shape=[window_len-pos+1,30])
        hhm_after_array = np.vstack((padding,hhm_matrix[0:pos+window_len, :])) 
    elif(seq_len - pos < window_len):
        padding = np.zeros(shape=[window_len-seq_len+pos,30])
        hhm_after_array = np.vstack((hhm_matrix[pos-window_len-1:, :],padding))
    else:
        hhm_after_array = hhm_matrix[pos-window_len-1:pos+window_len, :]
    #hhm_after = hhm_after_array.tolist()

# rasa
rasa_array = np.loadtxt(rootpath + 'rASA/' + uniprotac + '.rasa')
if(pos <= window_len - 1):
    padding = np.zeros(window_len-pos+1)
    rasa_array = np.append(padding,rasa_array[0:pos+window_len])
elif(seq_len - pos < window_len):
    padding = np.zeros(window_len-seq_len+pos)
    rasa_array = np.append(rasa_array[pos-window_len-1:],padding)
else:
    rasa_array = rasa_array[pos-window_len-1:pos+window_len]
#rasa = rasa_array.tolist()
# ss
ss_array = np.loadtxt(rootpath + 'SS/' + uniprotac + '.ss')
if(pos <= window_len - 1):
    padding = np.zeros(shape=[window_len-pos+1,3])
    ss_array = np.vstack((padding,ss_array[0:pos+window_len]))
elif(seq_len - pos < window_len):
    padding = np.zeros(shape=[window_len-seq_len+pos,3])
    ss_array = np.vstack((ss_array[pos-window_len-1:],padding))
else:
    ss_array = ss_array[pos-window_len-1:pos+window_len, :]
#ss = ss_array.tolist()

dataset_feature = dataset_feature.append([{'gene': gene, 'uniprotac': uniprotac, 'variant': variant, 'drug': drug, 
                                        'smile': smile, 'smile_num':smile_array, 'cactvs_fingerprint': cactvs_fingerprint, 'molecular_weight': molecular_weight, 
                                        'onehot_before':onehot_before, 'onehot_after':onehot_after, 
                                        'hhm_before': hhm_before_array, 'hhm_after': hhm_after_array, 'ss': ss_array, 'rasa': rasa_array, 
                                        'label': label, 'source':source}], ignore_index=True)

In [18]:
dataset_feature

Unnamed: 0,gene,uniprotac,variant,drug,smile,smile_num,cactvs_fingerprint,molecular_weight,onehot_before,onehot_after,hhm_before,hhm_after,ss,rasa,label,source
0,BRAF,P15056,L505H,vemurafenib,CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CNC3=C2C...,"[42.0, 42.0, 42.0, 49.0, 1.0, 40.0, 48.0, 31.0...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...",489.9,"[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...","[[4985.0, 6446.0, 4990.0, 3375.0, 5454.0, 5675...","[[4975.0, 6741.0, 5123.0, 3372.0, 5453.0, 5641...","[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...","[0.449, 0.182, 0.768, 0.375, 0.285, 0.0, 0.085...",0,DRSP
1,MAP2K2,P36507,V215E,PD0325901,C1=CC(=C(C=C1I)F)NC2=C(C=CC(=C2F)F)C(=O)NOC[C@...,"[42.0, 35.0, 40.0, 42.0, 42.0, 1.0, 40.0, 42.0...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...",482.19,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[5049.0, 6722.0, 7112.0, 5144.0, 7659.0, 6624...","[[5118.0, 7134.0, 6991.0, 5129.0, 7256.0, 7172...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[0.274, 0.502, 0.432, 0.156, 0.64, 0.102, 0.19...",0,DRSP
2,ROS1,P08922,G2032R,crizotinib,C[C@H](C1=C(C=CC(=C1Cl)F)Cl)OC2=C(N=CC(=C2)C3=...,"[42.0, 53.0, 42.0, 8.0, 12.0, 54.0, 1.0, 42.0,...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...",450.3,"[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[3071.0, 6101.0, 5203.0, 4504.0, 6802.0, 4387...","[[3023.0, 6237.0, 5214.0, 4457.0, 6729.0, 4350...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, ...","[0.335, 0.369, 0.046, 0.61, 0.156, 0.491, 0.02...",0,DRSP


In [20]:
print(dataset_feature.shape)
print(dataset_feature)

(3, 16)
     gene uniprotac variant         drug  \
0    BRAF    P15056   L505H  vemurafenib   
1  MAP2K2    P36507   V215E    PD0325901   
2    ROS1    P08922  G2032R   crizotinib   

                                               smile  \
0  CCCS(=O)(=O)NC1=C(C(=C(C=C1)F)C(=O)C2=CNC3=C2C...   
1  C1=CC(=C(C=C1I)F)NC2=C(C=CC(=C2F)F)C(=O)NOC[C@...   
2  C[C@H](C1=C(C=CC(=C1Cl)F)Cl)OC2=C(N=CC(=C2)C3=...   

                                           smile_num  \
0  [42.0, 42.0, 42.0, 49.0, 1.0, 40.0, 48.0, 31.0...   
1  [42.0, 35.0, 40.0, 42.0, 42.0, 1.0, 40.0, 42.0...   
2  [42.0, 53.0, 42.0, 8.0, 12.0, 54.0, 1.0, 42.0,...   

                                  cactvs_fingerprint  molecular_weight  \
0  [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...            489.90   
1  [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...            482.19   
2  [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...            450.30   

                                       onehot_before  \
0  [[0.0, 0.

In [21]:
dataset_feature.to_pickle(rootpath + 'drsp_dataset_featurecode.dataset')