### CIViC and PharmGKB feature engineering

In [1]:
import pandas as pd
import numpy as np

In [3]:
# split fasta
# Write one single-record FASTA file per row of the CIViC gene table,
# named after the row's `uniprotac` value.
civic_gene_table = pd.read_csv('../datasets/middlefile/civic_gene_table_fixed.csv')
for accession, sequence in zip(civic_gene_table['uniprotac'], civic_gene_table['fasta']):
    fasta_path = '../datasets/middlefile/fasta/' + accession + '.fasta'
    with open(fasta_path, 'w+') as handle:
        # Header line first, then the sequence as stored in the table
        # (no line wrapping, no trailing newline).
        handle.write('>' + accession + '\n')
        handle.write(sequence)

In [2]:
# Write one single-record FASTA file per row of the PharmGKB gene table,
# named after the row's `uniprotac` value.
pharmgkb_gene_table = pd.read_csv('../datasets/middlefile/pharmgkb_gene_table_fixed.csv')
for accession, sequence in zip(pharmgkb_gene_table['uniprotac'], pharmgkb_gene_table['fasta']):
    fasta_path = '../datasets/middlefile/fasta/' + accession + '.fasta'
    with open(fasta_path, 'w+') as handle:
        # Header line first, then the sequence as stored in the table
        # (no line wrapping, no trailing newline).
        handle.write('>' + accession + '\n')
        handle.write(sequence)

In [5]:
# alphafold database fetch
import requests

# Download the AlphaFold PDB file (fragment F1, model_v4) for every `uniprotac`
# in the CIViC gene table. Only successful (HTTP 200) responses are saved:
# previously the body of an error response was written under a .pdb name,
# leaving files on disk that look like structures but are not.
civic_af_missing = []  # accessions for which no PDB file was saved
for uniprotac in civic_gene_table['uniprotac']:
    url = 'https://alphafold.ebi.ac.uk/files/AF-' + uniprotac + '-F1-model_v4.pdb'
    try:
        # The timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(url, timeout=60)
    except requests.RequestException as err:
        print('AlphaFold request failed for ' + uniprotac + ': ' + str(err))
        civic_af_missing.append(uniprotac)
        continue
    if response.status_code != 200:
        print('AlphaFold returned HTTP ' + str(response.status_code) + ' for ' + uniprotac)
        civic_af_missing.append(uniprotac)
        continue
    with open('../datasets/middlefile/AF2pdb/' + uniprotac + '.pdb', 'wb') as out:
        out.write(response.content)
if civic_af_missing:
    print('No AlphaFold model saved for: ' + ', '.join(civic_af_missing))

In [3]:
import requests

# Download the AlphaFold PDB file (fragment F1, model_v4) for every `uniprotac`
# in the PharmGKB gene table. Only successful (HTTP 200) responses are saved:
# previously the body of an error response was written under a .pdb name,
# leaving files on disk that look like structures but are not.
pharmgkb_af_missing = []  # accessions for which no PDB file was saved
for uniprotac in pharmgkb_gene_table['uniprotac']:
    url = 'https://alphafold.ebi.ac.uk/files/AF-' + uniprotac + '-F1-model_v4.pdb'
    try:
        # The timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(url, timeout=60)
    except requests.RequestException as err:
        print('AlphaFold request failed for ' + uniprotac + ': ' + str(err))
        pharmgkb_af_missing.append(uniprotac)
        continue
    if response.status_code != 200:
        print('AlphaFold returned HTTP ' + str(response.status_code) + ' for ' + uniprotac)
        pharmgkb_af_missing.append(uniprotac)
        continue
    with open('../datasets/middlefile/AF2pdb/' + uniprotac + '.pdb', 'wb') as out:
        out.write(response.content)
if pharmgkb_af_missing:
    print('No AlphaFold model saved for: ' + ', '.join(pharmgkb_af_missing))

In [6]:
# generate fasta with mutation
# For every CIViC evidence row, apply the row's single-residue substitution to
# the gene's sequence and write the result to '<gene>_<variant>.fasta'.
civic_evidence_table = pd.read_csv('../datasets/middlefile/civic_evidence_table.csv')

# Resolve each gene's sequence once instead of filtering the gene table on
# every evidence row. Keeping the first row per gene matches the previous
# `.values[0]` lookup. A gene absent from the gene table raises KeyError here.
civic_fasta_by_gene = civic_gene_table.drop_duplicates(subset='gene').set_index('gene')['fasta']

civic_suspect_variants = []  # (gene, variant, reason) for substitutions that look wrong
for gene, variant in zip(civic_evidence_table['gene'], civic_evidence_table['variant']):
    fasta = civic_fasta_by_gene.loc[gene]
    # The variant string is parsed as <one char><1-based position><new residue>.
    pos = int(variant[1:-1])
    pos_after = variant[-1]
    # Assumes the first character is the reference residue (e.g. 'V600E') — TODO confirm.
    ref = variant[0]
    if not 1 <= pos <= len(fasta):
        # Slicing does not fail for such positions; it silently yields a
        # sequence that is not a substitution (e.g. one residue longer).
        civic_suspect_variants.append((gene, variant, 'position outside sequence of length ' + str(len(fasta))))
    elif fasta[pos - 1] != ref:
        civic_suspect_variants.append((gene, variant, 'sequence has ' + fasta[pos - 1] + ' at this position'))
    # The file is written exactly as before even for suspect variants, so
    # downstream file expectations are unchanged; suspects are reported below.
    fasta = fasta[:pos-1] + pos_after + fasta[pos:]
    with open('../datasets/middlefile/fasta/' + gene + '_' + variant + '.fasta', 'w+') as out:
        out.write('>' + gene + '_' + variant + '\n')
        out.write(fasta)

for gene, variant, reason in civic_suspect_variants:
    print('WARNING: ' + gene + '_' + variant + ': ' + reason)

In [4]:
# For every PharmGKB evidence row, apply the row's single-residue substitution
# to the gene's sequence and write the result to '<gene>_<variant>.fasta'.
pharmgkb_evidence_table = pd.read_csv('../datasets/middlefile/pharmgkb_evidence_table.csv')

# Resolve each gene's sequence once instead of filtering the gene table on
# every evidence row. Keeping the first row per gene matches the previous
# `.values[0]` lookup. A gene absent from the gene table raises KeyError here.
pharmgkb_fasta_by_gene = pharmgkb_gene_table.drop_duplicates(subset='gene').set_index('gene')['fasta']

pharmgkb_suspect_variants = []  # (gene, variant, reason) for substitutions that look wrong
for gene, variant in zip(pharmgkb_evidence_table['gene'], pharmgkb_evidence_table['variant']):
    fasta = pharmgkb_fasta_by_gene.loc[gene]
    # The variant string is parsed as <one char><1-based position><new residue>.
    pos = int(variant[1:-1])
    pos_after = variant[-1]
    # Assumes the first character is the reference residue (e.g. 'V600E') — TODO confirm.
    ref = variant[0]
    if not 1 <= pos <= len(fasta):
        # Slicing does not fail for such positions; it silently yields a
        # sequence that is not a substitution (e.g. one residue longer).
        pharmgkb_suspect_variants.append((gene, variant, 'position outside sequence of length ' + str(len(fasta))))
    elif fasta[pos - 1] != ref:
        pharmgkb_suspect_variants.append((gene, variant, 'sequence has ' + fasta[pos - 1] + ' at this position'))
    # The file is written exactly as before even for suspect variants, so
    # downstream file expectations are unchanged; suspects are reported below.
    fasta = fasta[:pos-1] + pos_after + fasta[pos:]
    with open('../datasets/middlefile/fasta/' + gene + '_' + variant + '.fasta', 'w+') as out:
        out.write('>' + gene + '_' + variant + '\n')
        out.write(fasta)

for gene, variant, reason in pharmgkb_suspect_variants:
    print('WARNING: ' + gene + '_' + variant + ': ' + reason)

In [None]:
# calculate rASA and padding
# NOTE(review): presumably the accessions whose rASA features are zero-padded
# rather than computed — confirm against the code that consumes `zero_list`.
zero_list = [
    'P51587',
    'Q13315',
    'P04114',
    'O60673',
    'P21817',
    'P42858',
    'P98164',
    'Q8WXI7',
    'Q9H251',
]


In [8]:
# mapping pubchem fingerprint
# Look up every drug name on PubChem and store a fixed set of compound
# properties in '../datasets/middlefile/civic_drug_table_fpfixed.csv'.
import warnings
warnings.filterwarnings('ignore')
import pubchempy as pcp
from tqdm import tqdm

drug_table = pd.read_csv('../datasets/middlefile/civic_drug_table.csv')

# (output column, attribute read from the pubchempy compound) in output order.
compound_fields = [
    ('smile', 'isomeric_smiles'),
    ('molecular_weight', 'molecular_weight'),
    ('molecular_formula', 'molecular_formula'),
    ('atom', 'atoms'),
    ('fingerprint', 'fingerprint'),
    ('cactvs_fingerprint', 'cactvs_fingerprint'),
]

# Collect plain records and build the frame once: `DataFrame.append` was
# removed in pandas 2.0 and re-copies the whole frame on every iteration.
drug_records = []
for drugname in tqdm(drug_table['drugname']):
    matches = pcp.get_compounds(drugname, 'name')
    # An unknown name yields an empty list; indexing it with [0] used to raise
    # IndexError, which the per-attribute `except AttributeError` never caught.
    compound = matches[0] if matches else None
    record = {'drugname': drugname}
    for column, attribute in compound_fields:
        # Falls back to NaN when there is no compound or the attribute lookup
        # raises AttributeError, like the former try/except blocks.
        record[column] = getattr(compound, attribute, np.nan)
    drug_records.append(record)

new_drug_table = pd.DataFrame(
    drug_records,
    columns=['drugname', 'smile', 'molecular_weight', 'molecular_formula', 'atom', 'fingerprint', 'cactvs_fingerprint'],
)
print(new_drug_table)
new_drug_table.to_csv('../datasets/middlefile/civic_drug_table_fpfixed.csv', index=None)

100%|██████████| 97/97 [02:11<00:00,  1.36s/it]

            drugname                                              smile  \
0        Selumetinib  CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)Cl)C(=...   
1           Imatinib  CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...   
2          Sorafenib  CNC(=O)C1=NC=CC(=C1)OC2=CC=C(C=C2)NC(=O)NC3=CC...   
3         Fedratinib  CC1=CN=C(N=C1NC2=CC(=CC=C2)S(=O)(=O)NC(C)(C)C)...   
4       Tanespimycin  C[C@H]1C[C@@H]([C@@H]([C@H](/C=C(/[C@@H]([C@H]...   
..               ...                                                ...   
92  Arsenic Trioxide                    [O-2].[O-2].[O-2].[As+3].[As+3]   
93        Venetoclax  CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC...   
94      Gilteritinib  CCC1=C(N=C(C(=N1)C(=O)N)NC2=CC(=C(C=C2)N3CCC(C...   
95      Tazemetostat  CCN(C1CCOCC1)C2=CC(=CC(=C2C)C(=O)NCC3=C(C=C(NC...   
96       Pemigatinib  CCN1C2=C3C=C(NC3=NC=C2CN(C1=O)C4=C(C(=CC(=C4F)...   

   molecular_weight molecular_formula  \
0             457.7   C17H15BrClFN4O3   
1             493




In [3]:
# smile dict from DeepDTA (source: https://github.com/hkmztrk/DeepDTA/blob/master/source/datahelper.py)
# Maps each supported SMILES character to an integer label in 1..64.
CHARISOSMISET = {
    "#": 29, "%": 30, ")": 31, "(": 1,
    "+": 32, "-": 33, "/": 34, ".": 2,
    "1": 35, "0": 3, "3": 36, "2": 4,
    "5": 37, "4": 5, "7": 38, "6": 6,
    "9": 39, "8": 7, "=": 40, "A": 41,
    "@": 8, "C": 42, "B": 9, "E": 43,
    "D": 10, "G": 44, "F": 11, "I": 45,
    "H": 12, "K": 46, "M": 47, "L": 13,
    "O": 48, "N": 14, "P": 15, "S": 49,
    "R": 16, "U": 50, "T": 17, "W": 51,
    "V": 18, "Y": 52, "[": 53, "Z": 19,
    "]": 54, "\\": 20, "a": 55, "c": 56,
    "b": 21, "e": 57, "d": 22, "g": 58,
    "f": 23, "i": 59, "h": 24, "m": 60,
    "l": 25, "o": 61, "n": 26, "s": 62,
    "r": 27, "u": 63, "t": 28, "y": 64,
}

In [5]:
# encode smile
# Add a `smile_array` column holding, for each drug, the list of integer
# labels (looked up in CHARISOSMISET) of the characters of its SMILES string.
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
new_drug_table = pd.read_csv('../datasets/middlefile/civic_drug_table_fpfixed.csv')

smile_arrays = []
for smile in tqdm(new_drug_table['smile']):
    if isinstance(smile, str):
        # A character missing from CHARISOSMISET raises KeyError on purpose:
        # an unsupported symbol must not be dropped silently.
        smile_arrays.append([CHARISOSMISET[symbol] for symbol in smile])
    else:
        # Missing SMILES is read back from the CSV as NaN (a float); encode it
        # as an empty list instead of failing on iteration.
        smile_arrays.append([])

# Assign the whole column in one step. The former chained
# `.loc[:, 'smile_array'].loc[i] = ...` wrote through an intermediate object,
# which is not guaranteed to update the frame (and never does under pandas
# Copy-on-Write); the warning about it was hidden by filterwarnings('ignore').
new_drug_table['smile_array'] = pd.Series(smile_arrays, index=new_drug_table.index, dtype=object)
print(new_drug_table)
new_drug_table.to_csv('../datasets/middlefile/civic_drug_table_fpfixed_smilenum.csv', index=None)

100%|██████████| 97/97 [00:00<00:00, 3469.83it/s]

            drugname                                              smile  \
0        Selumetinib  CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)Cl)C(=...   
1           Imatinib  CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...   
2          Sorafenib  CNC(=O)C1=NC=CC(=C1)OC2=CC=C(C=C2)NC(=O)NC3=CC...   
3         Fedratinib  CC1=CN=C(N=C1NC2=CC(=CC=C2)S(=O)(=O)NC(C)(C)C)...   
4       Tanespimycin  C[C@H]1C[C@@H]([C@@H]([C@H](/C=C(/[C@@H]([C@H]...   
..               ...                                                ...   
92  Arsenic Trioxide                    [O-2].[O-2].[O-2].[As+3].[As+3]   
93        Venetoclax  CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC...   
94      Gilteritinib  CCC1=C(N=C(C(=N1)C(=O)N)NC2=CC(=C(C=C2)N3CCC(C...   
95      Tazemetostat  CCN(C1CCOCC1)C2=CC(=CC(=C2C)C(=O)NCC3=C(C=C(NC...   
96       Pemigatinib  CCN1C2=C3C=C(NC3=NC=C2CN(C1=O)C4=C(C(=CC(=C4F)...   

    molecular_weight molecular_formula  \
0            457.700   C17H15BrClFN4O3   
1            49




In [6]:
# encode smile
# Add a `smile_array` column holding, for each drug, the list of integer
# labels (looked up in CHARISOSMISET) of the characters of its SMILES string.
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
new_drug_table = pd.read_csv('../datasets/middlefile/pharmgkb_drug_table_fpfixed.csv')

smile_arrays = []
for smile in tqdm(new_drug_table['smile']):
    if isinstance(smile, str):
        # A character missing from CHARISOSMISET raises KeyError on purpose:
        # an unsupported symbol must not be dropped silently.
        smile_arrays.append([CHARISOSMISET[symbol] for symbol in smile])
    else:
        # Missing SMILES is read back from the CSV as NaN (a float); encode it
        # as an empty list instead of failing on iteration.
        smile_arrays.append([])

# Assign the whole column in one step. The former chained
# `.loc[:, 'smile_array'].loc[i] = ...` wrote through an intermediate object,
# which is not guaranteed to update the frame (and never does under pandas
# Copy-on-Write); the warning about it was hidden by filterwarnings('ignore').
new_drug_table['smile_array'] = pd.Series(smile_arrays, index=new_drug_table.index, dtype=object)
print(new_drug_table)
new_drug_table.to_csv('../datasets/middlefile/pharmgkb_drug_table_fpfixed_smilenum.csv', index=None)

100%|██████████| 223/223 [00:00<00:00, 2693.92it/s]

         drugname                                              smile  \
0    capecitabine  CCCCCOC(=O)NC1=NC(=O)N(C=C1F)[C@H]2[C@@H]([C@@...   
1    fluorouracil                               C1=C(C(=O)NC(=O)N1)F   
2        warfarin       CC(=O)CC(C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O   
3       gefitinib  COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OC...   
4       efavirenz   C1CC1C#C[C@]2(C3=C(C=CC(=C3)Cl)NC(=O)O2)C(F)(F)F   
..            ...                                                ...   
218      thiotepa                             C1CN1P(=S)(N2CC2)N3CC3   
219      levodopa                  C1=CC(=C(C=C1C[C@@H](C(=O)O)N)O)O   
220    naltrexone  C1CC1CN2CC[C@]34[C@@H]5C(=O)CC[C@]3([C@H]2CC6=...   
221    folic acid  C1=CC(=CC=C1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCC2=...   
222  Glucarpidase                        [Zn+2].[Zn+2].[Zn+2].[Zn+2]   

     molecular_weight molecular_formula  \
0              359.35       C15H22FN3O6   
1              130.08         C4H3FN2O2   
2     


