In [139]:
import pickle
import pandas as pd

from rdkit.Chem import MolFromSmiles
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
import numpy as np
import pandas as pd
from rdkit.Chem.rdmolops import RDKFingerprint
from functools import partial
from collections import defaultdict
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray

# Feature definitions in RDKit's fdef syntax. generate_FPs passes this string to
# ChemicalFeatures.BuildFeatureFactoryFromString when building 'pharma2d' fingerprints.
# It declares five feature families: Donor, Acceptor, NegIonizable, PosIonizable, Aromatic.
# TODO: check whether this is the standard fdef definition shipped with RDKit
# (it looks like RDKit's MinimalFeatures.fdef -- confirm).
fdefstr = '''
AtomType NDonor [N&!H0&v3,N&!H0&+1&v4,n&H1&+0]
AtomType ChalcDonor [O,S;H1;+0]
DefineFeature SingleAtomDonor [{NDonor},{ChalcDonor},!$([D1]-[C;D3]=[O,S,N])]
  Family Donor
  Weights 1
EndFeature

AtomType NAcceptor [$([N&v3;H1,H2]-[!$(*=[O,N,P,S])])]
Atomtype NAcceptor [$([N;v3;H0])]
AtomType NAcceptor [$([n;+0])]
AtomType ChalcAcceptor [$([O,S;H1;v2]-[!$(*=[O,N,P,S])])]
AtomType ChalcAcceptor [O,S;H0;v2]
Atomtype ChalcAcceptor [O,S;-]
Atomtype ChalcAcceptor [o,s;+0]
AtomType HalogenAcceptor [F]
DefineFeature SingleAtomAcceptor [{NAcceptor},{ChalcAcceptor},{HalogenAcceptor}]
  Family Acceptor
  Weights 1
EndFeature

# this one is delightfully easy:
DefineFeature AcidicGroup [C,S](=[O,S,P])-[O;H1,H0&-1]
  Family NegIonizable
  Weights 1.0,1.0,1.0
EndFeature

AtomType CarbonOrArom_NonCarbonyl [$([C,a]);!$([C,a](=O))]
AtomType BasicNH2 [$([N;H2&+0][{CarbonOrArom_NonCarbonyl}])]
AtomType BasicNH1 [$([N;H1&+0]([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
AtomType BasicNH0 [$([N;H0&+0]([{CarbonOrArom_NonCarbonyl}])([{CarbonOrArom_NonCarbonyl}])[{CarbonOrArom_NonCarbonyl}])]
AtomType BasicNakedN [N,n;X2;+0]
DefineFeature BasicGroup [{BasicNH2},{BasicNH1},{BasicNH0},{BasicNakedN}]
  Family PosIonizable
  Weights 1.0
EndFeature

# aromatic rings of various sizes:
DefineFeature Arom5 a1aaaa1
  Family Aromatic
  Weights 1.0,1.0,1.0,1.0,1.0
EndFeature
DefineFeature Arom6 a1aaaaa1
  Family Aromatic
  Weights 1.0,1.0,1.0,1.0,1.0,1.0
EndFeature
DefineFeature Arom7 a1aaaaaa1
  Family Aromatic
  Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0
EndFeature
DefineFeature Arom8 a1aaaaaaa1
  Family Aromatic
  Weights 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
EndFeature
'''

# Read the pickled SMILES collection that every fingerprint cell below takes as input.
# NOTE: pickle.load can execute arbitrary code -- only point this at files you trust.
with open('/tf/notebooks/code_for_pub/smiles_files/smiles_drugcomb.pickle', mode='rb') as smiles_file:
    sms = pickle.load(smiles_file)

In [145]:
def generate_FPs(smiles, radius=2, length=300, name='morgan', fdef=fdefstr):
    '''
    Compute one binary fingerprint per molecule and stack them into a matrix.

    Parameters
    ----------
    smiles : pandas.Series (or any sized iterable) of str
        SMILES strings, one per molecule. Row ``i`` of the result belongs to
        the i-th value in iteration order.
    radius : int
        Morgan radius; only used when ``name == 'morgan'``.
    length : int
        Number of bits for the 'morgan' and the fallback RDKit fingerprint.
        Ignored for 'pharma2d', where the signature factory dictates the size.
    name : str
        'morgan' -> GetMorganFingerprintAsBitVect,
        'pharma2d' -> Pharm2D Gen2DFingerprint,
        anything else -> RDKFingerprint.
    fdef : str
        Feature-definition string used to build the feature factory for
        'pharma2d'. Defaults to the module-level ``fdefstr``.

    Returns
    -------
    numpy.ndarray
        uint8 array of shape ``(len(smiles), n_bits)``.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    '''
    from rdkit.Chem import MolFromSmiles
    if name == 'morgan':
        from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
        f = partial(GetMorganFingerprintAsBitVect, radius=radius, nBits=length)

    elif name == 'pharma2d':
        from rdkit.Chem import ChemicalFeatures
        from rdkit.Chem.Pharm2D import Generate
        from rdkit.Chem.Pharm2D.SigFactory import SigFactory

        # Honour the caller-supplied definitions (previously the global
        # fdefstr was always used and the `fdef` argument was ignored).
        featFactory = ChemicalFeatures.BuildFeatureFactoryFromString(fdef)
        sigFactory_2point = SigFactory(featFactory, minPointCount=2, maxPointCount=3, trianglePruneBins=False)
        sigFactory_2point.SetBins([(0, 2), (2, 5), (5, 8)])
        sigFactory_2point.Init()
        f = partial(Generate.Gen2DFingerprint, sigFactory=sigFactory_2point)
        # The pharmacophore signature has its own fixed size; it overrides `length`.
        length = sigFactory_2point.GetSigSize()

    else:
        f = partial(RDKFingerprint, fpSize=length)

    holder = np.zeros((len(smiles), length), dtype=np.uint8)
    # Iterating a Series yields its values; this replaces Series.iteritems(),
    # which no longer exists in pandas >= 2.0.
    for row, smi in enumerate(smiles):
        mol = MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError('Could not parse SMILES at position %d: %r' % (row, smi))
        holder[row] = np.asarray(f(mol), dtype=np.uint8)

    return holder

In [151]:
# Smoke test: fingerprint only the first two molecules and show the matrix shape.
generate_FPs(sms.head(2), name='pharma2d').shape

(2, 990)

In [33]:
# Build the topological fingerprint table: the SMILES column followed by one column per bit.
smiles = pd.DataFrame(sms)

# The variable name and the output file (fps_topo_300bit.pickle) both say "topo", but
# generate_FPs defaults to name='morgan'. Request the RDKit topological fingerprint
# explicitly: any name other than 'morgan'/'pharma2d' selects RDKFingerprint.
# NOTE(review): if Morgan fingerprints were intended, pass name='morgan' and rename the outputs.
fps_topo = generate_FPs(sms, name='topological')
df = pd.DataFrame(fps_topo, index=sms.index)
fin = pd.concat((smiles, df), axis=1)
fin.shape

(4153, 301)

In [11]:
#fin.to_csv('/tf/notebooks/code_for_pub/smiles_file/fps_morga.csv', header=True, index=True)

In [11]:
#fin.to_csv('/tf/notebooks/code_for_pub/fp_files/fps_topo.csv', header=True, index=True)

In [25]:
# Keep only the fingerprint bit columns. Rebinding (rather than inplace=True) together
# with errors='ignore' makes this cell safe to re-run once 'smiles' is already gone.
fin = fin.drop(columns=['smiles'], errors='ignore')

In [27]:
# Write the `fin` table to disk as a pickle.
with open('/tf/notebooks/code_for_pub/fp_files/fps_topo_300bit.pickle', mode='wb') as out_file:
    pickle.dump(fin, out_file)

In [20]:
# Re-read the SMILES pickle into `test` (this is the same file `sms` was loaded from).
# NOTE: pickle.load can execute arbitrary code -- only point this at files you trust.
with open('/tf/notebooks/code_for_pub/smiles_files/smiles_drugcomb.pickle', mode='rb') as smiles_file:
    test = pickle.load(smiles_file)

In [None]:
# Build the 2D-pharmacophore fingerprint table: the SMILES column followed by one column per bit.
# (The two-molecule smoke-test call that used to open this cell was removed: as a
# mid-cell expression its result was discarded, so it only cost computation.)
smiles = pd.DataFrame(sms)

fps_pharma2D = generate_FPs(sms, name='pharma2d')
df_pharma = pd.DataFrame(fps_pharma2D, index=sms.index)
fin_pharma = pd.concat((smiles, df_pharma), axis=1)
# Last expression of the cell, so the notebook displays it (matches the `fin.shape` cell).
fin_pharma.shape

In [158]:
# Keep only the fingerprint bit columns. Rebinding (rather than inplace=True) together
# with errors='ignore' makes this cell safe to re-run once 'smiles' is already gone.
fin_pharma = fin_pharma.drop(columns=['smiles'], errors='ignore')

In [162]:
# Preview the first five rows of the pharmacophore fingerprint table.
fin_pharma.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,980,981,982,983,984,985,986,987,988,989
1,0,1,1,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,1,0,0,1,0
4,0,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,1,0,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
# Write the `fin_pharma` table to disk as a pickle.
with open('/tf/notebooks/code_for_pub/fp_files/fps_pharma2D_990bit.pickle', mode='wb') as out_file:
    pickle.dump(fin_pharma, out_file)