In [38]:
import pandas as pd
import numpy as np
import time
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [39]:
def get_mols(smiles):
    """Convert one SMILES value into an RDKit molecule object.

    Parameters
    ----------
    smiles :
        Value handed to ``Chem.MolFromSmiles`` unchanged.

    Returns
    -------
    Whatever ``Chem.MolFromSmiles`` returns, or ``None`` when that call
    raises. In the failure case the exception and the offending input are
    printed so the row can be identified.
    """
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception as err:
        # Best effort: report the problem and let the caller carry on.
        print(err,'\t',smiles,'\n')
        return None

In [40]:
# Collect the name (first element) of every entry in RDKit's descriptor list;
# `nms` is reused by the descriptor helpers and the column renaming below.
nms = [entry[0] for entry in Descriptors._descList]
print(len(nms))

# Smoke test: run the full descriptor set on one example molecule and show
# the container type that CalcDescriptors hands back.
calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
mol = Chem.MolFromSmiles('CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21')  # example molecule
type(calc.CalcDescriptors(mol))

208


tuple

In [41]:
def get_descriptors(m):
    """Compute every descriptor named in the module-level ``nms`` for ``m``.

    Parameters
    ----------
    m :
        Molecule object passed straight to ``CalcDescriptors``.

    Returns
    -------
    The result of ``MolecularDescriptorCalculator(nms).CalcDescriptors(m)``
    (a tuple in the notebook's example call), or ``None`` if building the
    calculator or running it raises. On failure the exception and the
    molecule are printed.
    """
    try:
        calculator = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
        return calculator.CalcDescriptors(m)
    except Exception as err:
        # Best effort: report the failing molecule and signal it with None.
        print(err,'\t',m,'\n')
        return None

In [42]:
def process(df,label):
    """Attach molecule objects and descriptor columns to a SMILES table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Canonical_SMILES' column. The frame is not modified;
        all work happens on a copy.
    label :
        Unused. Kept so existing calls such as ``process(df, 'DILI')`` keep
        working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a 'Mol_Objects' column (``get_mols`` applied to
        each SMILES) followed by one integer-labelled column (0, 1, ...) per
        value returned by ``get_descriptors``. Rows keep the index of ``df``.
        Rows for which ``get_descriptors`` returned ``None`` are filled with
        NaN in the descriptor columns.

    Side effects
    ------------
    Prints the elapsed wall-clock time in seconds.
    """
    begin = time.time()

    # Copy first so the caller's frame does not silently gain a column.
    result = df.copy()

    # Series.map keeps df's own index, so the assignment lines up row by row
    # even when the index is not 0..n-1 (e.g. after filtering).
    result['Mol_Objects'] = df['Canonical_SMILES'].map(get_mols)

    descrs_list = [get_descriptors(mol) for mol in result['Mol_Objects']]

    # get_descriptors signals failure with None; swap in a NaN row of the
    # right width so the frame stays rectangular.
    width = max((len(d) for d in descrs_list if d is not None), default=0)
    missing = (float('nan'),) * width
    records = [missing if d is None else d for d in descrs_list]

    # Reuse the input index: concat(axis=1) aligns on index labels, and a
    # fresh RangeIndex would misalign rows for any non-default index.
    df_temp = pd.DataFrame(records, index=result.index)
    result = pd.concat([result, df_temp], axis=1)

    elapsed = time.time() - begin
    print(elapsed)
    return result

In [43]:
# Load the DILI table and append one column per RDKit descriptor.
df = pd.read_csv('./DILI/final/DILI_Final.csv')
df = process(df,'DILI')

# Identifier / label columns that are carried over unchanged.
df_a=df[['ID','Canonical_SMILES','DILI']]

# `process` labels the descriptor columns 0, 1, ..., one per name in `nms`.
# Derive the count from `nms` instead of hard-coding 208, so the cell keeps
# working when the installed RDKit release ships a different descriptor set;
# a missing column raises KeyError rather than being silently skipped.
recolumn = [str(name) for name in nms]
df_b = df[list(range(len(recolumn)))].copy()  # copy: renamed independently of df
df_b.columns = recolumn

df = pd.concat([df_a,df_b],axis=1)
df

56.619192361831665


Unnamed: 0,ID,Canonical_SMILES,DILI,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1,6.193380,0.092361,6.193380,0.092361,0.848995,255.361,234.193,...,0,0,0,0,0,0,0,0,0,0
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0,6.482112,-0.455333,6.482112,0.455333,0.722715,343.898,317.690,...,0,0,0,0,0,0,0,0,0,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1,6.271931,0.815501,6.271931,0.815501,0.817874,314.860,291.676,...,0,0,0,0,0,0,0,0,0,0
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1,6.176540,0.706466,6.176540,0.706466,0.800122,326.831,307.679,...,0,0,0,0,0,0,0,0,0,0
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0,13.195604,-0.817283,13.195604,0.137241,0.622271,426.560,396.320,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,14.188117,-2.026666,14.188117,0.171241,0.433136,583.689,546.393,...,0,0,0,0,0,0,0,0,0,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0,9.480602,-0.669907,9.480602,0.180178,0.510157,183.207,170.103,...,0,0,0,0,0,0,0,0,0,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0,10.324075,-0.664234,10.324075,0.331507,0.290517,282.468,248.196,...,0,0,0,0,0,0,0,0,10,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0,5.273144,0.205596,5.273144,0.205596,0.513143,198.314,176.138,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Persist the ID / SMILES / DILI columns plus the named descriptor columns.
# NOTE(review): no `index=` argument is given, so pandas also writes the row
# index as an unnamed first column — confirm downstream readers expect that.
df.to_csv('./DILI/final/DILI_DESCRS.csv')

In [45]:
# Read the fingerprint table and the descriptor table written earlier.
mfps = pd.read_csv('./DILI/final/DILI_MFPS.csv')
descrs = pd.read_csv('./DILI/final/DILI_DESCRS.csv')

# No `on=` is given, so the merge joins on the columns the two frames share;
# the label slice then keeps only the columns from 'ID' through 'fr_urea'.
df_final = mfps.merge(descrs).loc[:, 'ID':'fr_urea']
df_final

Unnamed: 0,ID,Canonical_SMILES,DILI,MFPS_1,MFPS_2,MFPS_3,MFPS_4,MFPS_5,MFPS_6,MFPS_7,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,10,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Write the merged feature table (columns 'ID' through 'fr_urea') to disk.
# NOTE(review): `index=` is left at its default, so the row index is written
# as an extra unnamed first column — confirm that is intended.
df_final.to_csv('./DILI/final/DILI_Features.csv')