In [1]:
# List the contents of the ../data/ directory (IPython line magic).
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_ecfp4_features.csv
esol_original_ecfp6_features.csv
esol_original_extra_features.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
freesolv_original_ecfp4_features.csv
freesolv_original_ecfp6_features.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv
lipophilicity_original_ecfp4_features.csv
lipophilicity_original_ecfp6_features.csv
lipophilicity_original_smiles.smi
lipophilicity_original_smiles20.smi
lipophilicity_protonated_smiles20_messy.txt


# Import modules

In [2]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit import RDLogger
from rdkit.Chem import Descriptors

# Load Data

In [3]:
# Configuration: these two names are interpolated into the input path
# ../data/{dataset}_{smile_type}_IdSmileTarget.csv and into the output path
# ../data/{dataset}_{smile_type}_rdkit_features.csv.
dataset, smile_type = 'freesolv', 'original'

In [4]:
# Read the id / SMILES / target table for the configured dataset.
# The first CSV column becomes the DataFrame index.
input_csv = f'../data/{dataset}_{smile_type}_IdSmileTarget.csv'
data = pd.read_csv(input_csv, index_col=0)

# Preview the first rows, then show (n_rows, n_columns) as the cell result.
print(data.head(), '\n')
data.shape

                                                   smile  target
id                                                              
4-methoxy-N,N-dimethyl-benzamide  CN(C)C(=O)c1ccc(cc1)OC  -11.01
methanesulfonyl chloride                    CS(=O)(=O)Cl   -4.87
3-methylbut-1-ene                               CC(C)C=C    1.83
2-ethylpyrazine                               CCc1cnccn1   -5.45
heptan-1-ol                                     CCCCCCCO   -4.21 



(642, 2)

In [5]:
# Pull out the SMILES column as a Series; it keeps the index of `data`.
smiles = data.loc[:, 'smile']
print(smiles.shape[0])  # number of molecules

# Cell result: the first five SMILES strings.
smiles.head(5)

642


id
4-methoxy-N,N-dimethyl-benzamide    CN(C)C(=O)c1ccc(cc1)OC
methanesulfonyl chloride                      CS(=O)(=O)Cl
3-methylbut-1-ene                                 CC(C)C=C
2-ethylpyrazine                                 CCc1cnccn1
heptan-1-ol                                       CCCCCCCO
Name: smile, dtype: object

# Get RDKit Molecular descriptors

In [6]:
# Compute every RDKit descriptor for every molecule in `smiles`.
# Result: `features`, one row per successfully processed molecule (labelled
# with the molecule's index label from `smiles`) and one float column per
# entry of Descriptors.descList.
descriptors = dict(Descriptors.descList)  # descriptor name -> function taking a Mol

records = []   # one {descriptor name: value} dict per processed molecule
kept_ids = []  # index labels matching `records`, in input order
skipped = []   # (index label, reason) for every molecule left out

# Iterate over (label, SMILES) pairs instead of looking up smiles.loc[label]:
# .loc returns a Series rather than a string when a label occurs more than once.
for index, smile in smiles.items():

    # MolFromSmiles returns None (it does not raise) when the SMILES cannot be
    # parsed; passing None on to AddHs would abort the whole loop.
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        skipped.append((index, f'could not parse SMILES {smile!r}'))
        continue

    # AddHs turns the implicit hydrogens into explicit atoms of the molecular
    # graph, so the descriptors below are evaluated on the H-explicit molecule.
    # NOTE(review): some descriptors presumably give different values for
    # H-implicit vs. H-explicit input -- confirm that H-explicit is intended.
    mol = Chem.AddHs(mol)

    try:
        row = {name: func(mol) for name, func in descriptors.items()}
    except ValueError as e:
        # Best effort: leave this molecule out, but remember which one and why.
        skipped.append((index, f'descriptor calculation failed: {e}'))
        continue

    kept_ids.append(index)
    records.append(row)

# Report dropped molecules explicitly so missing rows are never silent.
for index, reason in skipped:
    print(f'skipped {index!r}: {reason}')

# Build the frame directly in (molecule x descriptor) orientation. The explicit
# column list fixes the column order, and dtype=float keeps every column float
# (integer-valued descriptors such as counts are stored as e.g. 0.0).
features = pd.DataFrame(records, index=kept_ids, columns=list(descriptors), dtype=float)
features.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
"4-methoxy-N,N-dimethyl-benzamide",12.42817,-3.458874,12.42817,0.519264,0.68636,179.219,166.115,179.094629,70.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
methanesulfonyl chloride,9.85571,-4.368056,9.85571,3.074846,0.421427,114.553,111.529,113.954228,32.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-methylbut-1-ene,7.349537,-3.289005,7.349537,1.280324,0.412737,70.135,60.055,70.07825,30.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2-ethylpyrazine,7.363796,-3.006484,7.363796,0.587878,0.536795,108.144,100.08,108.068748,42.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
heptan-1-ol,7.58125,-4.173333,7.58125,3.423878,0.544191,116.204,100.076,116.120115,50.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Write the descriptor table next to the input data, keeping the row index
# as the first CSV column.
output_csv = f'../data/{dataset}_{smile_type}_rdkit_features.csv'
features.to_csv(output_csv, index=True)