## Building a dataframe of fingerprints

This notebook will create a dataframe containing several types of fingerprints for 6935 molecules contained in a dataframe obtained from this research work: Meyer, J.G., Liu, S., Miller, I.J., Coon, J.J., Gitter, A., 2019. Learning Drug Functions from Chemical Structures with Convolutional Neural Networks and Random Forests. J. Chem. Inf. Model. 59, 4438–4449. https://doi.org/10.1021/acs.jcim.9b00236


#### If installing from requirements.txt does not work

In [2]:
# %pip install rdkit-pypi  # uncomment only if installing from requirements.txt failed

In [3]:
# %pip install PubChemPy  # uncomment only if installing from requirements.txt failed

### Imports and loads

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from rdkit.Chem import AllChem, MACCSkeys,rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import PandasTools as pt
import pubchempy as pcp

In [3]:
# Turn off pandas' chained-assignment warning for the whole notebook.
# Every assignment below maps a function over an entire column, so the
# warning is not expected to point at a real problem here.
pd.set_option('mode.chained_assignment', None)

In [4]:
# Load the CID/property table from res/raw_data.
drugs_art = pd.read_csv(
    os.path.join('res', 'raw_data', 'CID_properties_nr.csv'),
)

In [5]:
# Distinct drug classes present in the table.
pd.unique(drugs_art['drug_class'])

array(['hematologic', 'cardio', 'antiinfective', 'cns', 'antineoplastic',
       'reproductivecontrol', 'dermatologic', 'antiinflammatory',
       'respiratorysystem', 'gastrointestinal', 'lipidregulating',
       'urological'], dtype=object)

In [9]:
# Preview the first five rows.
drugs_art.head(n=5)

Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,XLogP,drug_class,desalted_SMILES
0,24769,2,0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,hematologic,BrCCCBr.CN(C)CCCCCCN(C)C
1,134694070,9,6,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,cardio,Brc1c(NC2=NCCN2)ccc2nccnc12
2,5121,2,0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,antiinfective,Brc1ccc(C2CN3CCSC3=N2)cc1
3,4660557,1,1,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,cns,Brc1ccc(NC2C3CC4CC(C3)CC2C4)cc1
4,122175,2,2,CC(CCC(C#C)N)N,126.203,-0.4,antineoplastic,C#CC(N)CCC(C)N


In [72]:
# Load the DrugBank export and list the columns it provides.
drugs = pd.read_csv(
    os.path.join('res', 'raw_data', 'drugbank.csv'),
)
drugs.columns

Index(['Unnamed: 0', '@type', '@created', '@updated', 'drugbank-id', 'name',
       'cas-number', 'unii', 'average-mass', 'monoisotopic-mass', 'groups',
       'mixtures', 'manufacturers', 'experimental-properties', 'snp-effects',
       'snp-adverse-drug-reactions', 'targets', 'enzymes', 'carriers',
       'transporters', 'logP', 'logS', 'Water Solubility', 'IUPAC Name',
       'Traditional IUPAC Name', 'Molecular Weight', 'Monoisotopic Weight',
       'SMILES', 'Molecular Formula', 'InChI', 'InChIKey',
       'Polar Surface Area (PSA)', 'Refractivity', 'Polarizability',
       'Rotatable Bond Count', 'H Bond Acceptor Count', 'H Bond Donor Count',
       'pKa (strongest acidic)', 'Physiological Charge', 'Number of Rings',
       'Bioavailability', 'Rule of Five', 'Ghose Filter', 'MDDR-Like Rule',
       'atc_code', 'atc_code_0', 'atc_code_0_$', 'atc_code_1', 'atc_code_1_$',
       'atc_code_2', 'atc_code_2_$', 'atc_code_3', 'atc_code_3_$'],
      dtype='object')

In [104]:
def get_cid_from_inchi(inchi):
    """Function that obtains the CID of a molecule from its InChI
    Input: molecule's InChi
    Output: CID of the first compound returned by PubChem, or None when the
            lookup raises or returns no compound
    """
    try:
        compounds = pcp.get_compounds(inchi, 'inchi')
    except Exception:
        # Exception rather than a bare except, so Ctrl-C can still stop a
        # long .map() over thousands of network requests.
        print('Something went wrong obtaining the CID')
        return None
    if not compounds:
        # An empty result would make compounds[0] raise IndexError outside the
        # try block and abort the whole column mapping.
        print('Something went wrong obtaining the CID')
        return None
    return compounds[0].cid

In [105]:
# Try the lookup on a single hard-coded InChI; this sends one request to PubChem.
get_cid_from_inchi('InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122-39-15-23-70(122)92(149)114-60(30-34-79(134)135)85(142)111-59(29-33-78(132)133)86(143)116-64(43-55-24-26-56(123)27-25-55)89(146)118-67(97(154)155)40-51(2)3)119-87(144)61(31-35-80(136)137)112-84(141)58(28-32-77(130)131)113-88(145)63(42-54-18-10-7-11-19-54)117-90(147)66(45-81(138)139)110-76(129)50-107-83(140)65(44-71(100)124)109-75(128)49-106-73(126)47-104-72(125)46-105-74(127)48-108-91(148)68-21-13-38-121(68)95(152)62(20-12-36-103-98(101)102)115-93(150)69-22-14-37-120(69)94(151)57(99)41-53-16-8-6-9-17-53/h6-11,16-19,24-27,51-52,57-70,82,123H,5,12-15,20-23,28-50,99H2,1-4H3,(H2,100,124)(H,104,125)(H,105,127)(H,106,126)(H,107,140)(H,108,148)(H,109,128)(H,110,129)(H,111,142)(H,112,141)(H,113,145)(H,114,149)(H,115,150)(H,116,143)(H,117,147)(H,118,146)(H,119,144)(H,130,131)(H,132,133)(H,134,135)(H,136,137)(H,138,139)(H,154,155)(H4,101,102,103)/t52-,57+,58-,59-,60-,61-,62-,63-,64-,65-,66-,67-,68-,69-,70-,82-/m0/s1')


101041682

In [101]:
# Keep only the DrugBank columns this notebook uses.
sub_drugs_bank = drugs[['InChI', 'H Bond Acceptor Count', 'H Bond Donor Count', 'SMILES', 'Molecular Weight', 'logP', 'Rule of Five', 'atc_code_3', 'atc_code_3_$']]
# Drop rows missing the structure (SMILES / InChI) or the ATC code.
# .copy() makes clean_drug_bank an independent frame: later cells add columns
# to it, and doing that on a filtered slice is chained assignment.
clean_drug_bank = sub_drugs_bank.dropna(subset=['SMILES', 'atc_code_3', 'InChI']).copy()
display(clean_drug_bank.sample(10))
print(len(clean_drug_bank))
print(clean_drug_bank.iloc[0]['InChI'])



Unnamed: 0,InChI,H Bond Acceptor Count,H Bond Donor Count,SMILES,Molecular Weight,logP,Rule of Five,atc_code_3,atc_code_3_$
7862,"InChI=1S/C19H14Cl2N2O3S/c1-27(25,26)13-6-7-14(...",4.0,1.0,CS(=O)(=O)C1=CC(Cl)=C(C=C1)C(=O)NC1=CC=C(Cl)C(...,421.297,4.22,1.0,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
10041,InChI=1S/C19H22F3N5O2S/c1-10-14(11-6-7-24-13(9...,4.0,2.0,CC1=C(SC(NC(=O)N2CCC[C@H]2C(N)=O)=N1)C1=CC(=NC...,441.47,2.76,1.0,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
1157,"InChI=1S/C11H12ClNO3S/c1-13-10(14)6-7-17(15,16...",3.0,0.0,CN1C(C2=CC=C(Cl)C=C2)S(=O)(=O)CCC1=O,273.736,0.84,1.0,M,MUSCULO-SKELETAL SYSTEM
1041,InChI=1S/C22H31NO3/c1-3-23(4-2)17-11-12-18-26-...,3.0,1.0,CCN(CC)CC#CCOC(=O)C(O)(C1CCCCC1)C1=CC=CC=C1,357.4864,4.36,1.0,G,GENITO URINARY SYSTEM AND SEX HORMONES
9450,InChI=1S/C16H19N3O5S/c1-24-9-7-25-15-11(14(21)...,6.0,3.0,[H][C@]12SCC(OC)=C(N1C(=O)[C@H]2NC(=O)[C@H](N)...,365.4,0.65,1.0,J,ANTIINFECTIVES FOR SYSTEMIC USE
8102,InChI=1S/C26H41BrNO4/c1-26(2)21-6-5-19(22(26)1...,4.0,0.0,COC1=C(OC)C=C(C[N+]2(CCOCCC3CCC4CC3C4(C)C)CCOC...,511.52,3.71,0.0,A,ALIMENTARY TRACT AND METABOLISM
11517,InChI=1S/C17H18Br2N4O2/c18-12-8-10(16(20)21)2-...,6.0,4.0,NC(=N)C1=CC=C(OCCCOC2=CC=C(C=C2Br)C(N)=N)C(Br)=C1,470.158,2.36,1.0,D,DERMATOLOGICALS
5912,InChI=1S/C11H6ClN3O6/c12-7-5(14-8(16)10(18)19)...,7.0,4.0,OC(=O)C(=O)NC1=CC(=CC(NC(=O)C(O)=O)=C1Cl)C#N,311.63,0.72,1.0,S,SENSORY ORGANS
224,InChI=1S/C12H9N3O/c1-8-11(9-2-4-14-5-3-9)6-10(...,3.0,1.0,CC1=C(C=C(C#N)C(=O)N1)C1=CC=NC=C1,211.2194,1.04,1.0,C,CARDIOVASCULAR SYSTEM
5552,InChI=1S/C38H47N5O7S2/c1-21(2)30-20-51-35(40-3...,9.0,2.0,[H][C@]12C[C@]1(NC(=O)[C@]1([H])C[C@H](C[C@@]1...,749.939,4.69,0.0,G,GENITO URINARY SYSTEM AND SEX HORMONES


2938
InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122-39-15-23-70(122)92(149)114-60(30-34-79(134)135)85(142)111-59(29-33-78(132)133)86(143)116-64(43-55-24-26-56(123)27-25-55)89(146)118-67(97(154)155)40-51(2)3)119-87(144)61(31-35-80(136)137)112-84(141)58(28-32-77(130)131)113-88(145)63(42-54-18-10-7-11-19-54)117-90(147)66(45-81(138)139)110-76(129)50-107-83(140)65(44-71(100)124)109-75(128)49-106-73(126)47-104-72(125)46-105-74(127)48-108-91(148)68-21-13-38-121(68)95(152)62(20-12-36-103-98(101)102)115-93(150)69-22-14-37-120(69)94(151)57(99)41-53-16-8-6-9-17-53/h6-11,16-19,24-27,51-52,57-70,82,123H,5,12-15,20-23,28-50,99H2,1-4H3,(H2,100,124)(H,104,125)(H,105,127)(H,106,126)(H,107,140)(H,108,148)(H,109,128)(H,110,129)(H,111,142)(H,112,141)(H,113,145)(H,114,149)(H,115,150)(H,116,143)(H,117,147)(H,118,146)(H,119,144)(H,130,131)(H,132,133)(H,134,135)(H,136,137)(H,138,139)(H,154,155)(H4,101,102,103)/t52-,57+,58-,59-,60-,61-,62-,63-,64-,65-,66-,67-,68-,69-,70-,82-/m0/s1


In [106]:
# Look up a CID for every InChI (get_cid_from_inchi is called once per row),
# save the result to disk, then show a random sample of rows.
clean_drug_bank['CID'] = clean_drug_bank['InChI'].map(get_cid_from_inchi)
clean_drug_bank.to_csv(
    os.path.join('res', 'raw_data', 'drugbank_cured.csv'),
)
display(clean_drug_bank.sample(10))

Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID
Something went wrong obtaining the CID


Unnamed: 0,InChI,H Bond Acceptor Count,H Bond Donor Count,SMILES,Molecular Weight,logP,Rule of Five,atc_code_3,atc_code_3_$,CID
8128,InChI=1S/CN2.Ca/c2-1-3;/q-2;+2,2.0,0.0,[Ca++].[N-]=C=[N-],80.103,0.04,1.0,N,NERVOUS SYSTEM,56955933.0
10205,InChI=1S/C19H29N5O2/c1-19(2)14-16(25)24(17(26)...,6.0,0.0,CC1(C)CC(=O)N(CCCCN2CCN(CC2)C2=NC=CC=N2)C(=O)C1,359.474,2.09,1.0,N,NERVOUS SYSTEM,55191.0
11191,InChI=1S/C14H19N3O/c1-3-17(4-2)11-10-13-15-14(...,3.0,0.0,CCN(CC)CCC1=NC(=NO1)C1=CC=CC=C1,245.326,2.7,1.0,R,RESPIRATORY SYSTEM,13738.0
11890,InChI=1S/C187H291N45O59/c1-18-105(10)154(180(2...,67.0,57.0,CC[C@H](C)[C@H](NC(=O)[C@H](CC1=CC=CC=C1)NC(=O...,4113.641,-18.0,0.0,A,ALIMENTARY TRACT AND METABOLISM,
8085,InChI=1S/C24H29N7O2/c1-15-19-14-27-24(28-20-8-...,8.0,2.0,CC(=O)C1=C(C)C2=CN=C(NC3=NC=C(C=C3)N3CCNCC3)N=...,447.5328,2.12,1.0,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,5330286.0
11431,InChI=1S/C17H19N5O6S2/c1-2-3-8(9-6-30-16(18)20...,7.0,4.0,CC\C=C(/C(=O)N[C@H]1[C@H]2SCC(COC(N)=O)=C(N2C1...,453.49,0.52,1.0,J,ANTIINFECTIVES FOR SYSTEMIC USE,6436055.0
213,"InChI=1S/C36H47N5O4/c1-36(2,3)39-35(45)31-24-4...",7.0,4.0,CC(C)(C)NC(=O)[C@@H]1CN(CC2=CN=CC=C2)CCN1C[C@@...,613.7895,3.26,0.0,J,ANTIINFECTIVES FOR SYSTEMIC USE,5362440.0
406,InChI=1S/C10H21NO4/c1-2-3-4-11-5-8(13)10(15)9(...,5.0,4.0,CCCCN1C[C@H](O)[C@@H](O)[C@H](O)[C@H]1CO,219.278,-1.1,1.0,A,ALIMENTARY TRACT AND METABOLISM,51634.0
11178,InChI=1S/C20H27N5O3/c1-4-24(12-13-26)10-11-25-...,5.0,1.0,CCN(CCO)CCN1C(CC2=CC=CC=C2)=NC2=C1C(=O)N(C)C(=...,385.468,1.71,1.0,R,RESPIRATORY SYSTEM,16229.0
4137,"InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)",3.0,2.0,CC(O)C(O)=O,90.0779,-0.79,1.0,G,GENITO URINARY SYSTEM AND SEX HORMONES,612.0


### Functions

The functions below compute fingerprints either from an RDKit molecule or, via PubChemPy, from the molecule's PubChem CID.

In [19]:
def compute_connectivity_invariants(mol):
    """Function that obtains the connectivity invariants of a molecule
    Input: RDKit molecule
    Output: Numpy array, or None if RDKit raises (e.g. mol is None)
    """
    try:
        con_inv_fp = rdMolDescriptors.GetConnectivityInvariants(mol)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing Connectivity Invariants')
        return None
    return np.array(con_inv_fp)

In [20]:
def compute_feature_invariants(mol):
    """Function that obtains the feature invariants of a molecule
    Input: RDKit molecule
    Output: Numpy array, or None if RDKit raises (e.g. mol is None)
    """
    try:
        inv_fp = rdMolDescriptors.GetFeatureInvariants(mol)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing Feature Invariants')
        return None
    return np.array(inv_fp)

In [21]:
def compute_morgan_fp(mol, depth=2, nBits=2048):
    """Function that obtains the Morgan fingerprints of a molecule
    Input: RDKit molecule; depth is passed as the Morgan radius and nBits as
           the bit-vector length
    Output: Numpy array, or None if RDKit raises (e.g. mol is None)
    """
    try:
        # nBits is passed by keyword so it cannot land on another parameter.
        mor_fp = AllChem.GetMorganFingerprintAsBitVect(mol, depth, nBits=nBits)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing Morgan fingerprints')
        return None
    return np.array(mor_fp)

In [22]:
def compute_maccskeys(mol):
    """Function that obtains the MACCSKeys of a molecule
    Input: RDKit molecule
    Output: Numpy array, or None if RDKit raises (e.g. mol is None)
    """
    try:
        mkeys = MACCSkeys.GenMACCSKeys(mol)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing MACCSKeys')
        return None
    return np.array(mkeys)

In [23]:
def compute_atom_pair_fp(mol, nBits=2048):
    """Function that obtains the atom pair Fingerprints of a molecule
    Input: RDKit molecule; nBits is the length of the hashed bit vector
    Output: Numpy array, or None if RDKit raises (e.g. mol is None)
    """
    try:
        atom_pair_fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=nBits)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing Atom Pair fingerprints')
        return None
    return np.array(atom_pair_fp)

In [24]:
def compute_topological_torsion_fp(mol, nBits=2048):
    """Function that obtains the topological torsion fingerprints of a molecule
    Input: RDKit molecule; nBits is the length of the hashed bit vector
    Output: Numpy array, or None if RDKit raises (e.g. mol is None)
    """
    try:
        # Forward nBits: it was previously accepted but never passed on, so
        # callers asking for another length silently got RDKit's default.
        tt_fp = rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=nBits)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing Topological Torsion fingerprints')
        return None
    return np.array(tt_fp)
    

In [25]:
def compute_avalon_fp(mol, nBits=2048):
    """Function that obtains the Avalon fingerprints of a molecule
    Input: RDKit molecule; nBits is the length of the bit vector
    Output: Numpy array, or None if the Avalon toolkit raises (e.g. mol is None)
    """
    try:
        av_fp = pyAvalonTools.GetAvalonFP(mol, nBits=nBits)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing Avalon fingerprints')
        return None
    return np.array(av_fp)

In [26]:
def compute_rdkit_fp(mol, maxPath=5, fpSize=2048):
    """Function that obtains the RDKit fingerprints of a molecule
    Input: RDKit molecule; maxPath is the longest bond path enumerated and
           fpSize the length of the bit vector
    Output: Numpy array, or None if RDKit raises (e.g. mol is None)
    """
    try:
        # Keywords are required here: RDKFingerprint's positional order is
        # (mol, minPath, maxPath, fpSize, ...), so passing maxPath and fpSize
        # positionally set minPath=5 and maxPath=2048, which makes path
        # enumeration extremely slow.
        rdkit_fp = AllChem.RDKFingerprint(mol, maxPath=maxPath, fpSize=fpSize)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
        # still propagate and a long .map() can be interrupted.
        print('Something went wrong computing RDKit fingerprints')
        return None
    return np.array(rdkit_fp)

In [27]:
def compute_pubchem_fingerprints(cid):
    """Function that obtains the PubChem fingerprints of a molecule
    Input: molecules's CID
    Output: Numpy array of 0/1 integers, or None if the CID cannot be
            converted to int (e.g. NaN) or the PubChem lookup raises
    """
    try:
        comp = pcp.Compound.from_cid(int(cid))
        # Hex string -> binary digits. bin() drops leading zero bits, so the
        # array starts at the highest set bit of the hex value.
        fp_bin = bin(int(comp.fingerprint, 16))[2:]
    except Exception:
        # Exception rather than a bare except, so Ctrl-C can still stop a
        # mapping that issues one network request per row.
        print('Something went wrong computing Pubchem fingerprints')
        return None
    return np.array(list(fp_bin)).astype('int')

In [28]:
def compute_cactvs_fingerprints(cid):
    """Function that obtains the Cactvs fingerprints of a molecule
    Input: molecule's CID
    Output: Numpy array of 0/1 integers, or None if the CID cannot be
            converted to int (e.g. NaN) or the PubChem lookup raises
    """
    # NOTE(review): this reads the same comp.fingerprint hex string as
    # compute_pubchem_fingerprints, so both return the same bits. PubChemPy
    # also exposes Compound.cactvs_fingerprint - confirm which was intended.
    try:
        comp = pcp.Compound.from_cid(int(cid))
        # Hex string -> binary digits. bin() drops leading zero bits.
        cactvs_fp_bin = bin(int(comp.fingerprint, 16))[2:]
    except Exception:
        # Exception rather than a bare except, so Ctrl-C can still stop a
        # mapping that issues one network request per row.
        print('Something went wrong computing Cactvs fingerprints')
        return None
    return np.array(list(cactvs_fp_bin)).astype('int')

### Build a dataframe of fingerprints

Add a column with the RDKit Molecule to the Dataframe

In [109]:
# Build an RDKit molecule from each SMILES string and store it in a new
# 'Molecule' column of clean_drug_bank, then preview the first rows.
pt.AddMoleculeColumnToFrame(clean_drug_bank, smilesCol='SMILES', molCol='Molecule')
clean_drug_bank.head(n=5)

[17:14:51] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:14:51] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:14:51] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:14:51] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[17:14:51] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'


Unnamed: 0,InChI,H Bond Acceptor Count,H Bond Donor Count,SMILES,Molecular Weight,logP,Rule of Five,atc_code_3,atc_code_3_$,CID,Molecule
5,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,37.0,28.0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,2180.2853,-0.76,0.0,B,BLOOD AND BLOOD FORMING ORGANS,101041682.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E44...
6,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,16.0,16.0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,1209.3983,1.04,0.0,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,657181.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E44...
13,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,18.0,17.0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,1269.4105,0.3,0.0,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,5311128.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E44...
25,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,16.0,20.0,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,1811.253,4.38,0.0,R,RESPIRATORY SYSTEM,,<rdkit.Chem.rdchem.Mol object at 0x000001D9E44...
33,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,15.0,14.0,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,1069.22,-1.0,0.0,H,"SYSTEMIC HORMONAL PREPARATIONS, EXCL. SEX HORM...",5311065.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E44...


Select columns of interest

In [110]:
display(clean_drug_bank.sample(5))
# Keep the molecule, its ATC code and its CID. .copy() makes drug_ids an
# independent frame: the following cells add many columns to it, and doing
# that on a column slice of clean_drug_bank is chained assignment.
drug_ids = clean_drug_bank[['Molecule', 'atc_code_3', 'CID']].copy()
display(drug_ids.sample(5))

Unnamed: 0,InChI,H Bond Acceptor Count,H Bond Donor Count,SMILES,Molecular Weight,logP,Rule of Five,atc_code_3,atc_code_3_$,CID,Molecule
8021,"InChI=1S/C14H22ClNO/c1-11(10-16(3)4)14(2,17)9-...",2.0,1.0,CC(CN(C)C)C(C)(O)CC1=CC=C(Cl)C=C1,255.784,3.15,1.0,R,RESPIRATORY SYSTEM,26937.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...
7927,"InChI=1S/C32H31BrN2O2/c1-35(2)19-18-32(36,28-1...",4.0,1.0,COC1=NC2=C(C=C(Br)C=C2)C=C1[C@@H](C1=CC=CC=C1)...,555.505,6.37,0.0,J,ANTIINFECTIVES FOR SYSTEMIC USE,5388906.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...
1344,InChI=1S/C22H27NO2/c1-4-22(24)10-8-18-16-6-5-1...,2.0,1.0,[H][C@@]12CC[C@@](O)(C#C)[C@@]1(C)CC[C@@]1([H]...,337.4553,3.62,1.0,G,GENITO URINARY SYSTEM AND SEX HORMONES,28417.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...
11436,InChI=1S/C23H27NO/c1-24-18-12-13-19(24)15-20(1...,2.0,0.0,CN1[C@H]2CC[C@@H]1C[C@@H](C2)OC1C2=CC=CC=C2CCC...,333.475,5.05,1.0,R,RESPIRATORY SYSTEM,203911.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...
11495,"InChI=1S/C5H12O4/c6-1-5(2-7,3-8)4-9/h6-9H,1-4H2",4.0,4.0,OCC(CO)(CO)CO,136.147,-1.9,1.0,A,ALIMENTARY TRACT AND METABOLISM,8285.0,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...


Unnamed: 0,Molecule,atc_code_3,CID
143,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...,A,5280793.0
373,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...,L,454216.0
616,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...,L,119182.0
570,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...,A,4513.0
595,<rdkit.Chem.rdchem.Mol object at 0x000001D9E84...,C,5906.0


Encode the `atc_code_3` column; the encoded column (`atc_code_#`) will be our label

In [2]:
# Fit a label encoder on the ATC codes and store the integer codes in a new
# 'atc_code_#' column; `le` stays available for decoding later.
le = preprocessing.LabelEncoder().fit(drug_ids['atc_code_3'])
drug_ids['atc_code_#'] = le.transform(drug_ids['atc_code_3'])
display(drug_ids.head())


NameError: name 'preprocessing' is not defined

#### Using the functions described above to add columns containing the fingerprints

In [112]:
# One column per fingerprint type, each computed from the 'Molecule' column.
# The tuple keeps the original column order.
for fp_column, fp_function in (
    ('FeatInvariants', compute_feature_invariants),
    ('ConnInvariants', compute_connectivity_invariants),
    ('Morgan2FP', compute_morgan_fp),
    ('MACCSKeys', compute_maccskeys),
    ('AtomPairFP', compute_atom_pair_fp),
    ('TopTorFP', compute_topological_torsion_fp),
    ('AvalonFP', compute_avalon_fp),
):
    drug_ids[fp_column] = drug_ids['Molecule'].map(fp_function)

Something went wrong computing Feature Invariants
Something went wrong computing Feature Invariants
Something went wrong computing Feature Invariants
Something went wrong computing Feature Invariants
Something went wrong computing Connectivity Invariants
Something went wrong computing Connectivity Invariants
Something went wrong computing Connectivity Invariants
Something went wrong computing Connectivity Invariants
Something went wrong computing Morgan fingerprints
Something went wrong computing Morgan fingerprints
Something went wrong computing Morgan fingerprints
Something went wrong computing Morgan fingerprints
Something went wrong computing MACCSKeys
Something went wrong computing MACCSKeys
Something went wrong computing MACCSKeys
Something went wrong computing MACCSKeys
Something went wrong computing Atom Pair fingerprints
Something went wrong computing Atom Pair fingerprints
Something went wrong computing Atom Pair fingerprints
Something went wrong computing Atom Pair fingerpri

In [113]:
# These mappings can take a very long time: each one was reported to run for
# over an hour on the author's computer.
drug_ids['PubchemFP'] = drug_ids['CID'].map(compute_pubchem_fingerprints)
drug_ids['CactvsFP'] = drug_ids['CID'].map(compute_cactvs_fingerprints)
# Disabled: this mapping ran so long that it crashed the author's computer,
# and no workaround was found.
#drug_ids['RDKitFP']= drug_ids['Molecule'].map(compute_rdkit_fp)

Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something went wrong computing Pubchem fingerprints
Something we

### Saving the dataframe

In [114]:
# Persist the fingerprint table under res/pickles.
drug_ids.to_pickle(
    os.path.join('res', 'pickles', 'drugbank_fp.pkl'),
)

In [4]:
# Show five random rows of the finished table.
drug_ids.sample(n=5)

NameError: name 'drug_ids' is not defined

In [116]:
# Summarise columns, non-null counts and dtypes of the finished table.
drug_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2938 entries, 5 to 14995
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Molecule        2934 non-null   object 
 1   atc_code_3      2938 non-null   object 
 2   CID             2900 non-null   float64
 3   atc_code_#      2938 non-null   int32  
 4   FeatInvariants  2934 non-null   object 
 5   ConnInvariants  2934 non-null   object 
 6   Morgan2FP       2934 non-null   object 
 7   MACCSKeys       2934 non-null   object 
 8   AtomPairFP      2934 non-null   object 
 9   TopTorFP        2934 non-null   object 
 10  AvalonFP        2934 non-null   object 
 11  PubchemFP       2900 non-null   object 
 12  CactvsFP        2900 non-null   object 
dtypes: float64(1), int32(1), object(11)
memory usage: 309.9+ KB


In [1]:
# Reload the fingerprint table from the file written by drug_ids.to_pickle.
# The previous path ended in 'drug', which is not the file name that was saved.
n = pd.read_pickle(os.path.join('res', 'pickles', 'drugbank_fp.pkl'))

NameError: name 'drug_ids' is not defined