In [1]:
# List the contents of ../data before this notebook reads or writes anything there.
%ls ../data

ESOL_README                          esol_original_extra.csv
FreeSolv_README                      freesolv_original.csv
Lipo_README                          freesolv_original_IdSmileTarget.csv
esol_original.csv                    lipophilicity_original.csv
esol_original_IdSmileTarget.csv


In [2]:
import os
import sys

import pandas as pd

In [3]:
# Dataset key; it is interpolated into the input and output CSV file names used in this notebook.
dataset = 'esol'

In [4]:
# Read the original table for the selected dataset from the data directory.
raw_csv_path = f'../data/{dataset}_original.csv'
df = pd.read_csv(raw_csv_path)

In [5]:
# (rows, columns) of the table as loaded.
df.shape

(1128, 10)

In [6]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1


# Save the [id, smile, target] columns of the dataset to a CSV file

In [7]:
# Keep only the compound identifier, the SMILES string and the measured solubility.
id_smile_target_columns = [
    'Compound ID',
    'smiles',
    'measured log solubility in mols per litre',
]
subset_df = df[id_smile_target_columns]
subset_df.head()

Unnamed: 0,Compound ID,smiles,measured log solubility in mols per litre
0,Amigdalin,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.77
1,Fenfuram,Cc1occc1C(=O)Nc2ccccc2,-3.3
2,citral,CC(C)=CCCC(C)=CC(=O),-2.06
3,Picene,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.87
4,Thiophene,c1ccsc1,-1.33


In [8]:
# Old column label -> short label used in the saved id/smile/target file.
columns_mapper = dict([
    ('Compound ID', 'id'),
    ('smiles', 'smile'),
    ('measured log solubility in mols per litre', 'target'),
])

In [9]:
# Relabel the columns; rename returns a new frame, so subset_df keeps its original labels.
ready_df = subset_df.rename(columns_mapper, axis='columns')
ready_df.head()

Unnamed: 0,id,smile,target
0,Amigdalin,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.77
1,Fenfuram,Cc1occc1C(=O)Nc2ccccc2,-3.3
2,citral,CC(C)=CCCC(C)=CC(=O),-2.06
3,Picene,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.87
4,Thiophene,c1ccsc1,-1.33


In [10]:
# Write the three-column table to the data directory; index=False keeps the row index out of the file.
id_smile_target_path = f'../data/{dataset}_original_IdSmileTarget.csv'
ready_df.to_csv(id_smile_target_path, index=False)

In [11]:
# (rows, columns) of the table that was written out.
ready_df.shape

(1128, 3)

In [12]:
# Column labels of the table that was written out.
ready_df.columns

Index(['id', 'smile', 'target'], dtype='object')

# Save extra features present in the dataset

In [13]:
# Column labels of the loaded table, displayed for reference.
original_feature_names = df.columns
original_feature_names

Index(['Compound ID', 'ESOL predicted log solubility in mols per litre',
       'Minimum Degree', 'Molecular Weight', 'Number of H-Bond Donors',
       'Number of Rings', 'Number of Rotatable Bonds', 'Polar Surface Area',
       'measured log solubility in mols per litre', 'smiles'],
      dtype='object')

In [14]:
# Identifier plus the descriptor columns to export as extra features.
extra_feature_columns = [
    'Compound ID',
    'Minimum Degree',
    'Molecular Weight',
    'Number of H-Bond Donors',
    'Number of Rings',
    'Number of Rotatable Bonds',
    'Polar Surface Area',
]
extra_features = df[extra_feature_columns]

# Only the identifier column is relabelled; the descriptor columns keep their names.
id_rename = {'Compound ID': 'id'}
ready_extra_features = extra_features.rename(columns=id_rename)
ready_extra_features.head()

Unnamed: 0,id,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area
0,Amigdalin,1,457.432,7,3,7,202.32
1,Fenfuram,1,201.225,1,2,2,42.24
2,citral,1,152.237,0,0,4,17.07
3,Picene,2,278.354,0,5,0,0.0
4,Thiophene,2,84.143,0,1,0,0.0


In [15]:
# Write the extra-feature table to the data directory without the row index.
extra_features_path = f'../data/{dataset}_original_extra.csv'
ready_extra_features.to_csv(extra_features_path, index=False)

# Check that the original files are intact and the new files were written

In [16]:
# List the current working directory.
%ls

esol_get_original_id_smile_target.ipynb
freeesolv_get_original_id_smile_target.ipynb
get_fingerprints.ipynb
get_fingerprints_local.ipynb
get_protenated_from_canonical.ipynb
rf_grid_search_on_fingerprints.ipynb


In [17]:
# List the contents of ../data/ again, now that the CSV files have been written.
%ls ../data/

ESOL_README                          esol_original_extra.csv
FreeSolv_README                      freesolv_original.csv
Lipo_README                          freesolv_original_IdSmileTarget.csv
esol_original.csv                    lipophilicity_original.csv
esol_original_IdSmileTarget.csv


In [18]:
# Snapshot of the entry names in the data directory, taken after the CSV writes
# earlier in the notebook. `os` is imported in the notebook's top import cell.
present = set(os.listdir('../data/'))

present

{'.DS_Store',
 '.ipynb_checkpoints',
 'ESOL_README',
 'FreeSolv_README',
 'Lipo_README',
 'esol_original.csv',
 'esol_original_IdSmileTarget.csv',
 'esol_original_extra.csv',
 'freesolv_original.csv',
 'freesolv_original_IdSmileTarget.csv',
 'lipophilicity_original.csv'}

In [19]:
# Names of the original input files whose presence in the data directory is verified.
original_files = {
    'esol_original.csv',
    'freesolv_original.csv',
    'lipophilicity_original.csv',
}
original_files

{'esol_original.csv', 'freesolv_original.csv', 'lipophilicity_original.csv'}

In [20]:
# check that we have not deleted original files
# Same pass/fail condition as original_files.issubset(present), but a failure
# now names the files that are missing.
missing_originals = original_files - present
assert not missing_originals, f'original file(s) missing from ../data/: {sorted(missing_originals)}'

In [21]:
# check that we produced the needed file(s)
# Both expected outputs are checked together so a failure lists every file
# that is absent from the `present` directory snapshot, not just the first.
expected_outputs = [
    f'{dataset}_original_IdSmileTarget.csv',
    f'{dataset}_original_extra.csv',
]
missing_outputs = [name for name in expected_outputs if name not in present]
assert not missing_outputs, f'expected output file(s) not found in ../data/: {missing_outputs}'