In [16]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator
from mordred import Calculator, descriptors
import numpy as np

In [2]:
# Open the SDF dataset; `data` is the supplier that later cells index and iterate
# to reach the individual molecules. The path is relative to the working directory.
data = Chem.SDMolSupplier("data/chin-qspr-dataset.sdf")

In [3]:
# Show which SDF properties are attached to the first molecule.
[*data[0].GetPropNames()]

['pLC50', 'compound_id']

RDKit descriptor generation - only ~40 available

In [24]:
# Ask RDKit for the name of every property it can compute, then build a single
# calculator that evaluates all of them.
available_descriptors = [*rdMolDescriptors.Properties.GetAvailableProperties()]
get_descriptors = rdMolDescriptors.Properties(available_descriptors)
# One list of property values per molecule, in the order `data` yields them.
data_descriptors = [
    list(values) for values in map(get_descriptors.ComputeProperties, data)
]



#### Build a pandas DataFrame from the descriptors

In [12]:
# Pull the identifier and the target value out of each molecule's SDF properties.
compound_ids = [cur_mol.GetProp("compound_id") for cur_mol in data]
# GetProp returns text; convert pLC50 to float so the target column is numeric
# (rather than object dtype) once it is inserted into a DataFrame.
compound_toxicity = [float(cur_mol.GetProp("pLC50")) for cur_mol in data]



In [10]:
# Assemble the RDKit descriptor table: one row per molecule, one column per property.
pd_data = pd.DataFrame(data_descriptors, columns=available_descriptors)
# The identifier becomes the first column and the toxicity target the last one.
pd_data.insert(loc=0, column="compound_id", value=compound_ids)
pd_data.insert(loc=len(pd_data.columns), column="compound_toxicity", value=compound_toxicity)
# to_csv keeps its default index=True, so the row index is written as the first CSV column.
pd_data.to_csv("molecules_with_descriptors.csv")



NameError: name 'data_descriptors' is not defined

Switch to mordred, which provides 1600+ descriptors

In [29]:
# Build a mordred calculator over all registered descriptors, skipping the 3D ones,
# and evaluate it on every molecule in `data`.
calc = Calculator(descriptors, ignore_3D=True)
df = calc.pandas(data)
print(df.shape)
# Keep only numeric columns; this is how descriptors whose calculation
# produced errors are filtered out.
df = df.select_dtypes(include="number")
print(df.shape)
# Drop constant columns, i.e. those with exactly one distinct non-null value.
df = df.loc[:, df.nunique() != 1]
print(df.shape)
# The identifier becomes the first column and the toxicity target the last one.
df.insert(loc=0, column="compound_id", value=compound_ids)
df.insert(loc=len(df.columns), column="compound_toxicity", value=compound_toxicity)
# to_csv keeps its default index=True, so the row index is written as the first CSV column.
df.to_csv("molecules_descriptors_mordred.csv")

 78%|███████▊  | 291/375 [00:09<00:02, 36.06it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 375/375 [00:12<00:00, 31.12it/s]


(375, 1613)
(375, 915)
(375, 732)


In [28]:
# Display whatever DataFrame is currently bound to `df` as the cell's output.
# NOTE(review): this cell's execution count (28) precedes the mordred cell's (29),
# so the output shown may come from an earlier `df` — re-run top to bottom to confirm.
df

Unnamed: 0,compound_id,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nBridgehead,nHetero,nH,...,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,compound_toxicity
0,compound-1,0,0,0,0,14,6,0,2,8,...,29.439488,88.052429,6.289459,32,3,20.0,18.0,3.611111,1.583333,2.583
1,compound-2,0,0,6,6,16,9,0,2,7,...,37.289972,121.052764,7.565798,88,9,40.0,43.0,3.472222,2.111111,2.263
2,compound-3,0,0,0,0,28,10,0,1,18,...,36.593370,142.135765,5.076277,149,8,36.0,35.0,4.611111,2.666667,3.662
3,compound-4,0,0,6,6,21,11,0,2,10,...,40.148993,150.068080,7.146099,170,12,48.0,52.0,3.972222,2.750000,3.728
4,compound-5,0,0,0,0,23,9,0,1,14,...,34.994205,126.104465,5.482803,110,7,32.0,31.0,4.361111,2.416667,5.485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,compound-371,0,0,0,0,11,6,0,3,5,...,29.753427,145.945683,13.267789,31,4,20.0,19.0,3.611111,1.666667,3.408
371,compound-372,0,0,0,0,14,6,0,2,8,...,28.105124,126.000306,9.000022,35,3,18.0,16.0,3.000000,1.750000,3.391
372,compound-373,0,0,6,6,22,12,0,3,10,...,42.029522,166.062994,7.548318,202,15,54.0,60.0,4.833333,2.944444,3.278
373,compound-374,0,0,0,0,9,6,0,4,3,...,31.665095,100.013599,11.112622,28,3,24.0,22.0,4.312500,1.375000,2.924
