In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize

## Parameters
a number of parameters are used to control the data preparation, definitions are included in the respective cells

#### specifying data set features

In [2]:
# True: complete data set with MorganFP and rdkit descriptors
# False: complete data set only with MorganFP

AllData = True
#AllData = False

In [3]:
# True: add example descriptor data to the data set
# False: don't use example descriptor data

use_example = True
#use_example = False

#### radius of morgan finger prints
the radius of the morgan finger prints is a parameter and can be set to integer values. Different values will affect the model.

In [4]:
mfp_r = 1

#### test run

In [5]:
# True: use the first 10 rows of the data
# False: use the entire data set

test_run = False
#test_run = True

#### removing columns with unique values
warning: deleting columns with identical values had a negative effect on the prediction score for some reason

In [6]:
# True: delete all columns with unique values
# False: keep the data set without removing any columns

delun = False
#delun = True

#### save as csv
data will only be saved if 'save' is set to true AND 'test_run' is set to false

In [7]:
# True: save training and testing data sets as csv files

save = True
#save = False

## Function collection

#### Renaming function
renames columns of a data set by adding a prefix 'new_name' in front of it

In [8]:
def rename(df_name, new_name):
    df = df_name.copy(deep=True)
    for i in range(len(df_name.columns)):
        name = new_name + str(df_name.columns[i])
        df.rename(columns={df_name.columns[i] : name}, inplace=True)
    return df

#### Molecule standardizer
standardizes molecules by removing counterions and charges

In [9]:
def transform(smiles, neutralize=True):
        mol = Chem.MolFromSmiles(smiles)
        block = BlockLogs() # Block all RDkit logging
        
        # Normalizing functional groups
        # https://molvs.readthedocs.io/en/latest/guide/standardize.html
        clean_mol = rdMolStandardize.Cleanup(mol) 
        # Get parents fragments
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # Neutralise
        if neutralize:
            uncharger = rdMolStandardize.Uncharger()
            uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
        else:
            uncharged_parent_clean_mol = parent_clean_mol
        del block # Release logging block to previous state
        return(uncharged_parent_clean_mol)

#### MorganFP generator
generates morgan finger prints with a certain radius 'mfp_r' to set in the Parameter section

In [10]:
def MFP_generator(mol):
    return(AllChem.GetMorganFingerprintAsBitVect(mol, radius=mfp_r, nBits=4096))

#### Descriptor calculator
calculates rdkit descriptors of a molecule

In [11]:
def desc_generator(mol):
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][1](mol))
    return desc_list

#### Descriptor name generator

In [12]:
def desc_name_generator():
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][0])
    return desc_list

## 0. Reading data files from csv

#### reading training data set

In [13]:
df_train = pd.read_csv("Data/train.csv")

#### reading example predictor data set

In [14]:
example = pd.read_csv("Data/example_predictors.csv")
example.drop(columns='smiles', inplace=True)

df_example = rename(example, 'example_')

df_example.rename(columns={df_example.columns[0] : 'Id'}, inplace=True)

#### reading test data set

In [15]:
df_test = pd.read_csv("Data/test.csv")

# dummy value for the solubility category
df_test["sol_category"] = 5

## 1. Combining train and test data sets
combines training and testing set to simplify the feature generation. The 'apppend' command might need to be replaced for further versions of pandas since the method is deprecated

In [16]:
# partition (top 10)
if test_run:
    df_comb = pd.DataFrame(df_train.append(df_test))
    df_combined = df_comb.head(10)

# full data set
else:
    df_combined = pd.DataFrame(df_train.append(df_test))

# resetting index
df_combined = df_combined.reset_index()

  df_combined = pd.DataFrame(df_train.append(df_test))


## 2. Canonicalizing smiles from molecules
rdkit MolToSmiles creates canonical smiles from a molecules

In [17]:
# creating molecules from smiles
df_combined['molecule'] = df_combined['smiles'].apply(Chem.MolFromSmiles)

# creating canonical smiles from molecules
df_combined['smiles'] = df_combined['molecule'].apply(Chem.MolToSmiles)

## 3. Standardising molecules

In [18]:
df_combined['molecule'] = df_combined['smiles'].apply(transform)

## 4. Extraction of MorganFP and rdkit descriptors

In [19]:
df_combined_copy = df_combined.copy(deep=True)

#### generating MorganFP

In [20]:
df_combined_copy["FP"] = df_combined_copy["molecule"].apply(MFP_generator)
#df_combined_copy

#### reformating morgan finger print bits into seperate columns

In [21]:
# generate a Numpy array of sample size and fps
fp_arr = np.stack(df_combined_copy["FP"])

#convert to df
df_fp = pd.DataFrame(fp_arr)

#### Descriptors
descriptors are only calculated if the parameter 'AllData' is set to true to save time if 'AllData' is set to false

In [22]:
if AllData:
    df_combined_copy["desc"] = df_combined_copy["molecule"].apply(desc_generator)
    
    # generate a Numpy array of sample size and fps
    desc_arr = np.stack(df_combined_copy["desc"])

    #convert to df
    df_desc = pd.DataFrame(desc_arr)

#### resetting index and labeling, optional : caculating descriptors

In [23]:
# adding MorganFP
df_fp = df_fp.reset_index(drop=True)
df_fp = rename(df_fp, 'ecfp_')

# adding rdkit descriptors
if AllData:
    df_desc = df_desc.reset_index(drop=True)
    df_desc.columns=desc_name_generator()
    df_desc = rename(df_desc, 'rdkit_desc_')
#df_desc

## 5. Combining data sets
adds feature data sets to the combined training and testing data set, the composition depends on the parameters 'use_example' and 'AllData'

In [24]:
# Combined data with only MorganFP
df_combined = pd.concat([df_combined, df_fp], axis=1)

# Adding example descriptors
if use_example:
    #df_combined = pd.concat([df_combined, df_example], axis=1)
    df_combined = pd.merge(df_combined, df_example, on='Id', how='inner')

# complete data set by adding rdkit descriptors
if AllData:
    df_AllData = pd.concat([df_combined, df_desc], axis=1)
    df_AllData.drop(columns=['index','molecule'], inplace=True)

#### delete columns with unique values
columns with only identical values are deleted if the parameter 'delun' is set to true,
this had a slight negative effect on the prediction score for some reason

In [25]:
if delun:
    if AllData:
        len_un = len(df_AllData.columns)
        df_AllData = df_AllData.loc[:,df_AllData.nunique()!=1]
        num_drop_unique = len_un - len(df_AllData.columns)
        print(f'number of dropped collumns with identical values: {num_drop_unique}')
    else:
        len_un = len(df_combined.columns)
        df_combined = df_combined.loc[:,df_combined.nunique()!=1]
        num_drop_unique = len_un - len(df_combined.columns)
        print(f'number of dropped collumns with identical values: {num_drop_unique}')

## 6. Separating train and test data sets on dummy value
the main use of the dummy value for the testing set is to allow the seperation of the data sets according to the 'sol_category' value. Since the training set only has values of 0, 1 and 2, every molecule with values smaller then the dummy value 5 is part of the training set while every molecule with a value of 5 is part of the testing set.

In [26]:
df_train_set = pd.DataFrame()
df_test_set = pd.DataFrame()

# complete data set with MorganFP and rdkit descriptors
if AllData == True:
    df_train_set = df_AllData[df_AllData['sol_category'] <= 4]
    df_test_set = df_AllData[df_AllData['sol_category'] == 5]

# data set with only MorganFP    
elif AllData == False:
    df_train_set = df_combined[df_combined['sol_category'] <= 4]
    df_test_set = df_combined[df_combined['sol_category'] == 5]

# Restructuring test data
df_test_set.drop(columns='sol_category', inplace=True)
df_test_set.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_set.drop(columns='sol_category', inplace=True)


Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS10000,Cc1n[nH]nc1C(=O)N(C)CC1CCN(Cc2ccccc2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EOS100001,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EOS100004,Cc1ccc(-c2ccc(F)cc2COc2ccc(CCC(=O)O)cc2)cc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EOS100005,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS100008,Cl.c1ccc2c(CC3=NCCN3)cccc2c1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 7. Saving data sets as csv files
the training and testing data set will be saved as csv files for 'save' set to true and 'test_run' set to false

In [27]:
if save:
    if test_run == False:
        df_train_set.to_csv('output/df_train_set.csv', index=False)
        df_test_set.to_csv('output/df_test_set.csv', index=False)
        print('data sets saved')
    else:
        print('attempted to save a test run')
else:
    print('unsaved test run')