In [1]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator
from mordred import Calculator, descriptors
import numpy as np

In [2]:
# Open the SD file holding the QSPR dataset. The cells below both index into
# `data` (data[0]) and iterate over it once per feature set they build.
# NOTE(review): records that fail to parse may come back as None from the
# supplier - confirm the file is clean before relying on the loops below.
data = Chem.SDMolSupplier("data/chin-qspr-dataset.sdf")

In [3]:
# Show which property names are attached to the first record of the supplier.
list(data[0].GetPropNames())

['pLC50', 'compound_id']

RDKit descriptor generation - only ~40 descriptors are available

In [43]:
# Names of every property RDKit's Properties calculator can compute.
available_descriptors = list(rdMolDescriptors.Properties.GetAvailableProperties())
# One calculator configured for all of those names, shared by every molecule.
get_descriptors = rdMolDescriptors.Properties(available_descriptors)
# One row of values per molecule, in the same order as `available_descriptors`.
data_descriptors = [
    list(values) for values in map(get_descriptors.ComputeProperties, data)
]



#### Filter out duplicate molecules

In [25]:
# SMILES string for every molecule, kept in dataset order so positions line up
# with the rows of the feature tables built later.
cannonical_smiles = np.array(list(map(Chem.rdmolfiles.MolToSmiles, data)))



In [36]:
# (unique SMILES, occurrence count) pair as returned by np.unique.
smiles_counts = np.unique(cannonical_smiles, return_counts=True)
# Keep only the SMILES strings that occur more than once.
duplicate_smiles = smiles_counts[0][smiles_counts[1] > 1]

In [38]:
# Positions of every molecule whose SMILES is in the duplicated set. All copies
# are flagged, not just the repeats. The result stays a 1-tuple of index arrays
# because later cells read rows_to_remove[0].
rows_to_remove = np.where(np.isin(cannonical_smiles, duplicate_smiles))

#### Build a tidy pandas DataFrame

In [44]:
# Identifier of every record, in dataset order.
compound_ids = [cur_mol.GetProp("compound_id") for cur_mol in data]
# GetProp returns the property as text; convert it so the toxicity column is
# numeric in every DataFrame it is inserted into, not an object column of
# strings.
compound_toxicity = [float(cur_mol.GetProp("pLC50")) for cur_mol in data]



In [50]:
# One row per molecule, one column per RDKit descriptor.
pd_data = pd.DataFrame(data_descriptors, columns=available_descriptors)
# Identifier goes first, the toxicity target goes last.
pd_data.insert(0, "compound_id", compound_ids)
pd_data.insert(len(pd_data.columns), "compound_toxicity", compound_toxicity)
# Remove every molecule flagged as duplicated, then export.
pd_data = pd_data.drop(index=rows_to_remove[0])
pd_data.to_csv("molecules_with_descriptors.csv")

Switch to Mordred descriptors - 1600+ available

In [53]:
# Mordred calculator over its full descriptor set, with 3D descriptors skipped.
calc = Calculator(descriptors, ignore_3D=True)
df = calc.pandas(data)
print(df.shape)

# Keep numeric columns only; the original note says this is how features with
# errors are filtered out.
df = df.select_dtypes(include="number")
print(df.shape)

# Discard columns that hold exactly one distinct value.
df = df.loc[:, df.nunique() != 1]
print(df.shape)

# Identifier first, toxicity target last, then drop the duplicated molecules.
df.insert(0, "compound_id", compound_ids)
df.insert(len(df.columns), "compound_toxicity", compound_toxicity)
df = df.drop(index=rows_to_remove[0])
print(df.shape)

df.to_csv("molecules_descriptors_mordred.csv")

 77%|███████▋  | 290/375 [00:09<00:01, 45.79it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 375/375 [00:12<00:00, 31.13it/s]


(375, 1613)
(375, 915)
(375, 732)
(366, 734)


### Generating Morgan fingerprints

In [55]:
# Morgan fingerprint generator (radius 5, fingerprint size 4096); the next cell
# applies it to every molecule.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=5,fpSize=4096)

In [56]:
# One fingerprint per molecule, expanded into a plain Python list so it can be
# loaded straight into a DataFrame.
mol_fp = [list(fingerprint) for fingerprint in map(mfpgen.GetFingerprint, data)]



In [57]:
# Largest per-molecule sum of fingerprint entries across the dataset.
max(sum(fingerprint) for fingerprint in mol_fp)

106

In [58]:
# Load the per-molecule fingerprint lists into a DataFrame: one row per
# molecule, one integer-labelled column per fingerprint position.
fp_dataframe = pd.DataFrame(mol_fp)

In [59]:
# Start from a fresh table built from `mol_fp` so this cell does not consume
# its own output. Previously, re-running the cell on its own failed because
# "compound_id" had already been inserted into `fp_dataframe`.
fp_dataframe = pd.DataFrame(mol_fp)

# throw out columns with constant values
fp_dataframe = fp_dataframe.loc[:, fp_dataframe.nunique() != 1]
print(fp_dataframe.shape)

# Identifier first, toxicity target last.
fp_dataframe.insert(0, "compound_id", compound_ids)
fp_dataframe.insert(fp_dataframe.shape[1], "compound_toxicity", compound_toxicity)

# Remove every molecule flagged as duplicated, then export.
fp_dataframe = fp_dataframe.drop(index=rows_to_remove[0])
fp_dataframe.to_csv("fp_dataframe.csv")

(375, 2208)
