In [1]:
# to do
# 0. combine train and test set - done
# 1. canonicalize smiles from Molecule - done
# 2. standardise molecules - done
# 3. extract MorganFP - done
# 4. calculate rdkit descriptors - done
# 5. export pandas dataframe as .csv, split train, test back - done

In [2]:
# 2 csvs: Train and test - done
# Columns: smiles, (example_...), (ecfp_...), (rdkit_desc_...) - done
# Export csv to Temp

In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize

### Renaming function

In [4]:
def rename(df_name, new_name):
    df = df_name.copy(deep=True)
    for i in range(len(df_name.columns)):
        name = new_name + str(df_name.columns[i])
        df.rename(columns={df_name.columns[i] : name}, inplace=True)
    return df

In [5]:
# Reading data files

In [6]:
df_train = pd.read_csv("Data/train.csv")

In [7]:
#df

In [8]:
df_train["sol_category"].value_counts()

2    65835
1     2835
0     2041
Name: sol_category, dtype: int64

In [9]:
example = pd.read_csv("Data/example_predictors.csv")

In [10]:
#example

In [11]:
# google how to supress the warning below ('A value is trying to be set on a copy of a slice from a DataFrame')

In [12]:
df_example = rename(example.head(5), 'example_')

In [13]:
#df_example

In [14]:
df_test = pd.read_csv("Data/test.csv")

In [15]:
#df_test

In [16]:
df_test["sol_category"] = 5

In [17]:
#df_test

In [18]:
# 0. combine train and test set

In [19]:
df_comb = pd.DataFrame(df_train.append(df_test))
#df_combined = pd.DataFrame(df_train.append(df_test))

  df_comb = pd.DataFrame(df_train.append(df_test))


In [20]:
df_combined = df_comb.head(10)

In [21]:
# Molekülbild hinzufügen

In [22]:
df_combined = df_combined.reset_index()

In [23]:
df_combined

Unnamed: 0,index,Id,smiles,sol_category
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0
5,5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0
6,6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0
7,7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0
8,8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0
9,9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0


In [24]:
df_combined['molecule'] = df_combined['smiles'].apply(Chem.MolFromSmiles)

In [25]:
#df_mB

In [26]:
# 1. canonicalize smiles from Molecule

In [27]:
df_combined['smiles'] = df_combined['molecule'].apply(Chem.MolToSmiles)

In [28]:
#df_canon

In [29]:
# 2. standardise molecules

In [30]:
# Standardazing Molecules
def transform(smiles, neutralize=True):
        mol = Chem.MolFromSmiles(smiles)
        block = BlockLogs() # Block all RDkit logging
        # Normalizing functional groups
        # https://molvs.readthedocs.io/en/latest/guide/standardize.html
        clean_mol = rdMolStandardize.Cleanup(mol) 
        # Get parents fragments
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
        # Neutralise
        if neutralize:
            uncharger = rdMolStandardize.Uncharger()
            uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
        else:
            uncharged_parent_clean_mol = parent_clean_mol
        del block # Release logging block to previous state
        return(uncharged_parent_clean_mol)

In [31]:
df_combined['molecule'] = df_combined['smiles'].apply(transform)

In [32]:
#df_standard['molecule'][1]

In [33]:
# 3. extract MorganFP

In [34]:
MFP_df = pd.DataFrame()
for mol in df_combined['molecule']:
    MFP_vect = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=4096)]
    MFP_list = [list(l) for l in MFP_vect]
    MFP_df = pd.concat([MFP_df,pd.DataFrame(MFP_list)])

In [35]:
#MFP_df

In [36]:
MFP_df = MFP_df.reset_index(drop=True)
#MFP_df

In [37]:
# Renaming

In [38]:
MFP_df = rename(MFP_df, 'ecfp_')
#MFP_df

In [39]:
df_combined = pd.concat([df_combined, MFP_df], axis=1)
#df_combined

In [40]:
# 4. calculate rdkit descriptors

In [41]:
def desc_generator(mol):
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][1](mol))
    return desc_list

In [42]:
desc_df = pd.DataFrame()
for mol in df_combined['molecule']:
    desc_vect = [desc_generator(mol)]
    desc_list = [list(l) for l in desc_vect]
    desc_df = pd.concat([desc_df,pd.DataFrame(desc_list)])

In [43]:
#desc_df

In [44]:
desc_df = desc_df.reset_index(drop=True)
#desc_df

In [45]:
desc_df = rename(desc_df, 'rdkit_desc_')
#desc_df

In [46]:
# delete index and molecule from df_AllData

In [47]:
df_AllData = pd.concat([df_combined, desc_df], axis=1)
df_AllData.drop(columns=['index','molecule'], inplace=True)
df_AllData

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Separating in train and test sets

In [48]:
df_train_set = pd.DataFrame()
df_test_set = pd.DataFrame()

df_train_set = df_AllData[df_AllData['sol_category'] <= 5]
df_test_set = df_AllData[df_AllData['sol_category'] == 5]

In [49]:
print('df_train_set')
df_train_set

df_train_set


Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [50]:
print('df_test_set')
df_test_set.drop(columns='sol_category', inplace=True)
df_test_set.reset_index(drop=True)
df_test_set

df_test_set


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_set.drop(columns='sol_category', inplace=True)


Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207


In [51]:
df_train_set.to_csv('df_train_set10.csv', index=False)
df_test_set.to_csv('df_test_set10.csv', index=False)

In [52]:
df_train_read = pd.read_csv("df_train_set10.csv")
df_train_read

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [53]:
df_test_read = pd.read_csv("df_test_set10.csv")
df_test_read

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207


In [54]:
# Remove columns Index, Id and molecule
# reducing copy DataFrames