# Data curation

In [1]:
# to do
# 0. combine train and test set - done
# 1. canonicalize smiles from Molecule - done
# 2. standardise molecules - done
# 3. extract MorganFP - done
# 4. calculate rdkit descriptors - done
# 5. export pandas dataframe as .csv, split train, test back - done

In [2]:
# 2 csvs: Train and test - done
# Columns: smiles, (example_...), (ecfp_...), (rdkit_desc_...) - done
# Export csv to Temp

In [3]:
# Remove columns Index, Id and molecule
# reducing copy DataFrames

In [4]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize

In [5]:
# google how to supress the warning below ('A value is trying to be set on a copy of a slice from a DataFrame')

# removing warning
pd.options.mode.chained_assignment = None  # default='warn'

## Parameters

In [6]:
# True: complete data set with MorganFP and rdkit descriptors
# False: complete data set only with MorganFP

AllData = True
#AllData = False

In [7]:
# True: first 10 rows of the data
# False: everything

#test_run = True
test_run = False

In [8]:
# True: add example data to df_AllData

use_example = True
#use_example = False

In [9]:
# True: save data sets as csv files

save = True
#save = False

## Function collection

#### Renaming function

In [10]:
def rename(df_name, new_name):
    df = df_name.copy(deep=True)
    for i in range(len(df_name.columns)):
        name = new_name + str(df_name.columns[i])
        df.rename(columns={df_name.columns[i] : name}, inplace=True)
    return df

#### Molecule standardizer

In [11]:
def transform(smiles, neutralize=True):
        mol = Chem.MolFromSmiles(smiles)
        block = BlockLogs() # Block all RDkit logging
        
        # Normalizing functional groups
        # https://molvs.readthedocs.io/en/latest/guide/standardize.html
        clean_mol = rdMolStandardize.Cleanup(mol) 
        # Get parents fragments
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # Neutralise
        if neutralize:
            uncharger = rdMolStandardize.Uncharger()
            uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
        else:
            uncharged_parent_clean_mol = parent_clean_mol
        del block # Release logging block to previous state
        return(uncharged_parent_clean_mol)

#### MorganFP generator

In [12]:
def MFP_generator(mol):
    return(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=4096))

#### Descriptor calculator

In [13]:
def desc_generator(mol):
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][1](mol))
    return desc_list

#### Descriptor name generator

In [14]:
def desc_name_generator():
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][0])
    return desc_list

## 0. Reading data files

In [15]:
df_train = pd.read_csv("Data/train.csv")

In [16]:
df_train["sol_category"].value_counts()

2    65835
1     2835
0     2041
Name: sol_category, dtype: int64

In [17]:
#df_train

In [18]:
example = pd.read_csv("Data/example_predictors.csv")
example.drop(columns='smiles', inplace=True)

# partition (top 10)
#if test_run:
#    df_example = rename(example.head(10), 'example_')

# full data set
#else:
df_example = rename(example, 'example_')

df_example.rename(columns={df_example.columns[0] : 'Id'}, inplace=True)

In [19]:
#df_example

In [20]:
df_test = pd.read_csv("Data/test.csv")
df_test["sol_category"] = 5

In [21]:
#df_test

## 1. Combining train and test set

In [22]:
# partition (top 10)
if test_run:
    df_comb = pd.DataFrame(df_train.append(df_test))
    df_combined = df_comb.head(10)

# full data set
else:
    df_combined = pd.DataFrame(df_train.append(df_test))

# resetting index
df_combined = df_combined.reset_index()

  df_combined = pd.DataFrame(df_train.append(df_test))


In [23]:
df_combined

Unnamed: 0,index,Id,smiles,sol_category
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0
...,...,...,...,...
101013,30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,5
101014,30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,5
101015,30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,5
101016,30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,5


## 2. Canonicalizing smiles from molecules

In [24]:
# creating molecules
df_combined['molecule'] = df_combined['smiles'].apply(Chem.MolFromSmiles)

# canonical smiles
df_combined['smiles'] = df_combined['molecule'].apply(Chem.MolToSmiles)

## 3. Standardising molecules

In [25]:
df_combined['molecule'] = df_combined['smiles'].apply(transform)

In [26]:
#df_standard['molecule'][1]

## 4. Extraction of MorganFP and rdkit descriptors

In [27]:
df_combined_copy = df_combined.copy(deep=True)

#### MorganFP

In [28]:
df_combined_copy["FP"] = df_combined_copy["molecule"].apply(MFP_generator)
df_combined_copy

Unnamed: 0,index,Id,smiles,sol_category,molecule,FP
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdcf0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdd60>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,<rdkit.Chem.rdchem.Mol object at 0x1877fddd0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdeb0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdf20>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
101013,30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,5,<rdkit.Chem.rdchem.Mol object at 0x1882f69e0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
101014,30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,5,<rdkit.Chem.rdchem.Mol object at 0x1882f6a50>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
101015,30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,5,<rdkit.Chem.rdchem.Mol object at 0x1882f6ac0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
101016,30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,5,<rdkit.Chem.rdchem.Mol object at 0x1882f6b30>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [29]:
# generate a Numpy array of sample size and fps
fp_arr = np.stack(df_combined_copy["FP"])

#convert to df
df_fp = pd.DataFrame(fp_arr)
df_fp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101014,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
101015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Descriptors

In [30]:
if AllData:
    df_combined_copy["desc"] = df_combined_copy["molecule"].apply(desc_generator)
    df_combined_copy

In [31]:
df_desc = pd.DataFrame()
if AllData:
    # generate a Numpy array of sample size and fps
    desc_arr = np.stack(df_combined_copy["desc"])

    #convert to df
    df_desc = pd.DataFrame(desc_arr)

#### resetting index and labeling

In [32]:
# MorganFP
df_fp = df_fp.reset_index(drop=True)
df_fp = rename(df_fp, 'ecfp_')

# rdkit descriptors
if AllData:
    df_desc = df_desc.reset_index(drop=True)
    df_desc.columns=desc_name_generator()
    df_desc = rename(df_desc, 'rdkit_desc_')
df_desc

Unnamed: 0,rdkit_desc_MaxEStateIndex,rdkit_desc_MinEStateIndex,rdkit_desc_MaxAbsEStateIndex,rdkit_desc_MinAbsEStateIndex,rdkit_desc_qed,rdkit_desc_MolWt,rdkit_desc_HeavyAtomMolWt,rdkit_desc_ExactMolWt,rdkit_desc_NumValenceElectrons,rdkit_desc_NumRadicalElectrons,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,13.751480,-0.805113,13.751480,0.046085,0.732517,399.405,380.253,399.161915,150.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,12.524029,0.234919,12.524029,0.234919,0.807124,323.506,294.274,323.203134,124.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,12.317465,-0.223009,12.317465,0.019061,0.899102,291.376,274.240,291.104148,106.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,12.357761,-0.382235,12.357761,0.149177,0.698158,364.942,339.742,364.137612,130.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12.704152,-0.151369,12.704152,0.131009,0.750601,336.395,316.235,336.158626,128.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,8.792413,0.730413,8.792413,0.730413,0.835277,300.450,272.226,300.231397,120.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101014,12.599120,-0.572386,12.599120,0.115993,0.879151,317.389,294.205,317.173942,124.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101015,12.929550,-0.388142,12.929550,0.150145,0.715329,390.443,368.267,390.169191,148.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101016,12.916304,-0.513227,12.916304,0.025342,0.684949,362.429,340.253,362.163043,138.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 5. Combining data sets

In [33]:
# Combined data with only MorganFP
df_combined = pd.concat([df_combined, df_fp], axis=1)

# Adding example descriptors
if use_example:
    #df_combined = pd.concat([df_combined, df_example], axis=1)
    df_combined = pd.merge(df_combined, df_example, on='Id', how='inner')

# complete data set by adding rdkit descriptors
if AllData:
    df_AllData = pd.concat([df_combined, df_desc], axis=1)
    df_AllData.drop(columns=['index','molecule'], inplace=True)


In [34]:
df_combined

Unnamed: 0,index,Id,smiles,sol_category,molecule,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,...,example_VABC Volume Descriptor,example_Largest Chain,example_Largest Pi Chain,example_Petitjean Number,example_Lipinski's Rule of Five,example_Topological Polar Surface Area,example_Vertex adjacency information magnitude,example_XLogP,example_Zagreb Index,example_Rotatable Bonds Count (non terminal)
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdcf0>,0,0,0,0,0,...,329.228869,3.0,12.0,0.500000,0.0,79.18,6.000000,2.599,154.0,3.0
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdd60>,0,0,0,0,0,...,319.355435,5.0,5.0,0.500000,0.0,64.68,5.523562,1.417,110.0,6.0
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,<rdkit.Chem.rdchem.Mol object at 0x1877fddd0>,0,0,0,0,0,...,257.448332,7.0,13.0,0.500000,0.0,91.37,5.392317,0.645,100.0,4.0
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdeb0>,0,0,0,0,0,...,350.252336,3.0,7.0,0.461538,1.0,57.97,5.643856,6.034,126.0,5.0
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,<rdkit.Chem.rdchem.Mol object at 0x1877fdf20>,0,0,0,0,0,...,305.250156,6.0,14.0,0.500000,0.0,69.04,5.754888,3.222,126.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,5,<rdkit.Chem.rdchem.Mol object at 0x1882f69e0>,0,0,0,0,0,...,315.076012,4.0,8.0,0.500000,0.0,42.30,5.523562,1.701,104.0,7.0
101014,30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,5,<rdkit.Chem.rdchem.Mol object at 0x1882f6a50>,0,0,0,0,0,...,300.797488,3.0,12.0,0.461538,0.0,56.25,5.643856,0.283,118.0,5.0
101015,30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,5,<rdkit.Chem.rdchem.Mol object at 0x1882f6ac0>,0,0,0,0,0,...,361.835420,3.0,23.0,0.500000,0.0,83.03,6.000000,3.557,152.0,4.0
101016,30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,5,<rdkit.Chem.rdchem.Mol object at 0x1882f6b30>,0,0,0,0,0,...,342.478358,6.0,15.0,0.500000,0.0,69.64,5.906891,4.446,146.0,5.0


In [35]:
df_AllData

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,5,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101014,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,5,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101015,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,5,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101016,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,5,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 6. Separating train and test data sets

In [36]:
df_train_set = pd.DataFrame()
df_test_set = pd.DataFrame()

In [37]:
# complete data set with MorganFP and rdkit descriptors
if AllData == True:
    df_train_set = df_AllData[df_AllData['sol_category'] <= 4]
    df_test_set = df_AllData[df_AllData['sol_category'] == 5]

# data set with only MorganFP    
elif AllData == False:
    df_train_set = df_combined[df_combined['sol_category'] <= 4]
    df_test_set = df_combined[df_combined['sol_category'] == 5]

# Restructuring test data
df_test_set.drop(columns='sol_category', inplace=True)
df_test_set.reset_index(drop=True)

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS10000,Cc1n[nH]nc1C(=O)N(C)CC1CCN(Cc2ccccc2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EOS100001,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EOS100004,Cc1ccc(-c2ccc(F)cc2COc2ccc(CCC(=O)O)cc2)cc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EOS100005,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS100008,Cl.c1ccc2c(CC3=NCCN3)cccc2c1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Printing data files

In [38]:
print('df_train_set')
df_train_set

df_train_set


Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70706,EOS37839,O=C(NCCCc1nc(=O)[nH][nH]1)[C@H]1CCC(F)(F)C1,2,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
70707,EOS2088,Cc1ccc(C(=O)NC2CCCC2)cc1S(=O)(=O)N1CCOCC1,2,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70708,EOS10587,COCCN1CCC(CN(C)S(=O)(=O)c2cccc(C(F)(F)F)c2)C1,2,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70709,EOS40533,O=C(Nc1ccc(F)cc1)NC1CCN(C(=O)Cc2cnn(-c3ccccc3)...,2,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
print('df_test_set')
df_test_set

df_test_set


Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
70711,EOS10000,Cc1n[nH]nc1C(=O)N(C)CC1CCN(Cc2ccccc2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70712,EOS100001,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70713,EOS100004,Cc1ccc(-c2ccc(F)cc2COc2ccc(CCC(=O)O)cc2)cc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70714,EOS100005,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70715,EOS100008,Cl.c1ccc2c(CC3=NCCN3)cccc2c1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101014,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101015,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101016,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 7. Saving data sets as csv files

In [40]:
if save:
    if test_run:
        df_train_set.to_csv('Temp/df_train_set10.csv', index=False)
        df_test_set.to_csv('Temp/df_test_set10.csv', index=False)
    else:
        df_train_set.to_csv('Temp/df_train_set.csv', index=False)
        df_test_set.to_csv('Temp/df_test_set.csv', index=False)

#### Reading saved csv files

In [41]:
if save:
    if test_run:
        df_train_read = pd.read_csv("Temp/df_train_set10.csv")
    else:
        df_train_read = pd.read_csv("Temp/df_train_set.csv")
df_train_read

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70706,EOS37839,O=C(NCCCc1nc(=O)[nH][nH]1)[C@H]1CCC(F)(F)C1,2,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
70707,EOS2088,Cc1ccc(C(=O)NC2CCCC2)cc1S(=O)(=O)N1CCOCC1,2,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70708,EOS10587,COCCN1CCC(CN(C)S(=O)(=O)c2cccc(C(F)(F)F)c2)C1,2,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70709,EOS40533,O=C(Nc1ccc(F)cc1)NC1CCN(C(=O)Cc2cnn(-c3ccccc3)...,2,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [42]:
if save:
    if test_run:
        df_test_read = pd.read_csv("Temp/df_test_set10.csv")
    else:
        df_test_read = pd.read_csv("Temp/df_test_set.csv")
df_test_read

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea
0,EOS10000,Cc1n[nH]nc1C(=O)N(C)CC1CCN(Cc2ccccc2)C1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,EOS100001,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,EOS100004,Cc1ccc(-c2ccc(F)cc2COc2ccc(CCC(=O)O)cc2)cc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EOS100005,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,EOS100008,Cl.c1ccc2c(CC3=NCCN3)cccc2c1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30302,EOS97822,CN(C)CCN1CCC(CNCc2ccc(C#N)cc2)CC1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30303,EOS97834,Cc1cn(C[C@H](O)CN2CCOCC2)c(=O)n1-c1ccccc1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30304,EOS97889,Cc1ccc(C(=O)Nc2c(N3CCOCC3)nc(-c3ccccc3)[nH]c2=...,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30305,EOS97969,CC1Cc2cccc3c(O)c(C(=O)NCCCc4ccccc4)c(=O)n1c23,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
