In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools as pt
from rdkit.Chem import rdMolDescriptors
from rdkit import DataStructs
from rdkit import rdBase
import pubchempy as pcp
import pandas as pd
import sklearn
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [6]:
drugs = pd.read_csv('CID_properties_nr.csv')
pt.AddMoleculeColumnToFrame(frame=drugs,smilesCol='IsomericSMILES', molCol='Molecule')
drugs.head()



Unnamed: 0,CID,HBondAcceptorCount,HBondDonorCount,IsomericSMILES,MolecularWeight,XLogP,drug_class,desalted_SMILES,Molecule
0,24769,2,0,CN(C)CCCCCCN(C)C.C(CBr)CBr,374.205,,hematologic,BrCCCBr.CN(C)CCCCCCN(C)C,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...
1,134694070,9,6,C1CN=C(N1)NC2=C(C3=NC=CN=C3C=C2)Br.[C@@H](C(C(...,442.226,,cardio,Brc1c(NC2=NCCN2)ccc2nccnc12,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...
2,5121,2,0,C1CSC2=NC(CN21)C3=CC=C(C=C3)Br,283.187,2.5,antiinfective,Brc1ccc(C2CN3CCSC3=N2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...
3,4660557,1,1,C1C2CC3CC1CC(C2)C3NC4=CC=C(C=C4)Br,306.247,5.0,cns,Brc1ccc(NC2C3CC4CC(C3)CC2C4)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...
4,122175,2,2,CC(CCC(C#C)N)N,126.203,-0.4,antineoplastic,C#CC(N)CCC(C)N,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...


In [11]:
first_mol = drugs.iloc[3]['Molecule']
first_mol_cid = drugs.iloc[3]['CID']
int(first_mol_cid)

4660557

In [48]:
""" 
GetFeatureInvariants,
GetConnectivityInvariants
"""

'                                          GetAtomPairFingerprint,\n                                         GetTopologicalTorsionFingerprint,\n                                         GetMACCSKeysFingerprint,\n                                         GetFeatureInvariants,\n                                         GetConnectivityInvariants '

In [24]:
def compute_morgan_fp(mol, depth=2, nBits=2048):
    try:
        mor_fp = AllChem.GetMorganFingerprintAsBitVect(mol,depth,nBits)
    except:
        print('Something went wrong computing Morgan fingerprints')
        return None
    return np.array(mor_fp)

In [25]:
def compute_maccskeys(mol):
    try:
        mkeys = MACCSkeys.GenMACCSKeys(mol)   
    except:
        print('Something went wrong computing MACCSKeys')
        return None
    return np.array(mkeys)

In [26]:
def compute_atom_pair_fp(mol, nBits=2048):
    try:
        atom_pair_fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(first_mol, nBits)
    except:
        print('Something went wrong computing Atom Pair fingerprints')
        return None
    return np.array(atom_pair_fp)

In [27]:
def compute_topological_torsion_fp(mol, nBits=2048):
    try:
        tt_fp = rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol)
    except:
        print('Something went wrong computing Topological Torsion fingerprints')
        return None
    return np.array(tt_fp)
    

In [28]:
def compute_morgan_circular_fp(mol, depth=2, nBits=2048):
    try:
        mc_fp = AllChem.GetMorganFingerprintAsBitVect(mol, depth, nBits)
    except:
        print('Something went wrong computin Morgan circular fingerprints')
        return None
    return np.array(mc_fp)

In [29]:
def compute_rdkit_fp(mol, maxPath=5, fpSize=2048):
    try:
        rdkit_fp = AllChem.RDKFingerprint(mol, maxPath, fpSize)
    except:
        print('Something went wrong computin Morgan circular fingerprints')
        return None
    return np.array(rdkit_fp)

In [30]:
def compute_pubchem_fingerprints(cid):
    comp = pcp.get_compounds(cid, 'cid')
    return np.array(comp.fingerprint)

In [31]:
def compute_pubchem_fingerprints(cid):
    comp = pcp.Compound.from_cid(int(cid))
    fp_bin = bin(int(comp.fingerprint, 16))[2:]
    return np.array(list(fp_bin)).astype('int')

In [32]:
def compute_cactvs_fingerprints(cid):
    comp = pcp.Compound.from_cid(int(cid))
    cactvs_fp_bin = bin(int(comp.fingerprint, 16))[2:]
    return np.array(list(cactvs_fp_bin)).astype('int')

In [33]:
compute_pubchem_fingerprints(first_mol_cid)

array([1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,

In [34]:
drug_ids = drugs[['CID','Molecule','drug_class']]
drug_ids.head()

Unnamed: 0,CID,Molecule,drug_class
0,24769,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...,hematologic
1,134694070,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...,cardio
2,5121,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...,antiinfective
3,4660557,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...,cns
4,122175,<rdkit.Chem.rdchem.Mol object at 0x000002576FF...,antineoplastic


In [35]:
le = preprocessing.LabelEncoder()
le = le.fit(drug_ids['drug_class'])
drug_ids['drug_class_code'] = le.transform(drug_ids['drug_class'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_ids['drug_class_code'] = le.transform(drug_ids['drug_class'])


In [36]:
drug_ids['Morgan2FP'] = drug_ids['Molecule'].map(compute_morgan_fp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_ids['Morgan2FP'] = drug_ids['Molecule'].map(compute_morgan_fp)


In [37]:
drug_ids['MACCSKeys'] = drug_ids['Molecule'].map(compute_maccskeys)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_ids['MACCSKeys'] = drug_ids['Molecule'].map(compute_maccskeys)


In [38]:
drug_ids['AtomPairFP'] = drug_ids['Molecule'].map(compute_atom_pair_fp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_ids['AtomPairFP'] = drug_ids['Molecule'].map(compute_atom_pair_fp)


In [39]:
drug_ids['TopTorFP'] = drug_ids['Molecule'].map(compute_topological_torsion_fp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drug_ids['TopTorFP'] = drug_ids['Molecule'].map(compute_topological_torsion_fp)


In [40]:
drug_ids['MorganCircFP'] = drug_ids['Molecule'].map(compute_morgan_circular_fp)

In [44]:
drug_ids['PubchemFP']= drug_ids['CID'].map(compute_pubchem_fingerprints)

In [None]:
drug_ids['CactvsFP']= drug_ids['CID'].map(compute_cactvs_fingerprints)

In [None]:
drug_ids['RDKitFP']= drug_ids['CID'].map(compute_rdkit_fp)

0    <rdkit.Chem.rdchem.Mol object at 0x000002576FF...
1    <rdkit.Chem.rdchem.Mol object at 0x000002576FF...
2    <rdkit.Chem.rdchem.Mol object at 0x000002576FF...
3    <rdkit.Chem.rdchem.Mol object at 0x000002576FF...
4    <rdkit.Chem.rdchem.Mol object at 0x000002576FF...
Name: Molecule, dtype: object

In [None]:
drug_ids.sort_values('drug_class_code')

Unnamed: 0,CID,Molecule,drug_class,Morgan2FP,MACCSKeys,AtomPairFP,TopTorFP,MorganCircFP,drug_class_code
0,24769,<rdkit.Chem.rdchem.Mol object at 0x0000029A5DB...,hematologic,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7
1,134694070,<rdkit.Chem.rdchem.Mol object at 0x0000029A702...,cardio,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3
2,5121,<rdkit.Chem.rdchem.Mol object at 0x0000029A702...,antiinfective,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,4660557,<rdkit.Chem.rdchem.Mol object at 0x0000029A702...,cns,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4
4,122175,<rdkit.Chem.rdchem.Mol object at 0x0000029A702...,antineoplastic,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2


In [142]:
drug_ids.to_pickle('morgan_and_mac.pkl')