In [28]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize

## Parameters

#### specifying data set features

In [29]:
# True: complete data set with MorganFP and rdkit descriptors
# False: complete data set only with MorganFP

AllData = True
#AllData = False

In [30]:
# True: add example data to the data set

use_example = True
#use_example = False

#### specifying radius of morgan finger prints

In [31]:
mfp_r = 1

#### test run

In [32]:
# True: first 10 rows of the data
# False: everything

test_run = False
#test_run = True

#### removing columns with unique values

In [33]:
# True: deletes all columns with unique values
# False: keeps the data set without removing any columns

delun = False
#delun = True

#### save as csv

In [34]:
# True: save data sets as csv files

save = True
#save = False

## Function collection

#### Renaming function

In [35]:
def rename(df_name, new_name):
    df = df_name.copy(deep=True)
    for i in range(len(df_name.columns)):
        name = new_name + str(df_name.columns[i])
        df.rename(columns={df_name.columns[i] : name}, inplace=True)
    return df

#### Molecule standardizer

In [36]:
def transform(smiles, neutralize=True):
        mol = Chem.MolFromSmiles(smiles)
        block = BlockLogs() # Block all RDkit logging
        
        # Normalizing functional groups
        # https://molvs.readthedocs.io/en/latest/guide/standardize.html
        clean_mol = rdMolStandardize.Cleanup(mol) 
        # Get parents fragments
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # Neutralise
        if neutralize:
            uncharger = rdMolStandardize.Uncharger()
            uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
        else:
            uncharged_parent_clean_mol = parent_clean_mol
        del block # Release logging block to previous state
        return(uncharged_parent_clean_mol)

#### MorganFP generator

In [37]:
def MFP_generator(mol):
    return(AllChem.GetMorganFingerprintAsBitVect(mol, radius=mfp_r, nBits=4096))

#### Descriptor calculator

In [38]:
def desc_generator(mol):
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][1](mol))
    return desc_list

#### Descriptor name generator

In [39]:
def desc_name_generator():
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][0])
    return desc_list

## 0. Reading data files from csv

#### reading training data set

In [40]:
df_train = pd.read_csv("Data/train.csv")

#### reading example predictor data set

In [41]:
example = pd.read_csv("Data/example_predictors.csv")
example.drop(columns='smiles', inplace=True)

df_example = rename(example, 'example_')

df_example.rename(columns={df_example.columns[0] : 'Id'}, inplace=True)

#### reading test data set

In [42]:
df_test = pd.read_csv("Data/test.csv")

# dummy value for the solubility category
df_test["sol_category"] = 5

## 1. Combining train and test data sets

In [43]:
# partition (top 10)
if test_run:
    df_comb = pd.DataFrame(df_train.append(df_test))
    df_combined = df_comb.head(10)

# full data set
else:
    df_combined = pd.DataFrame(df_train.append(df_test))

# resetting index
df_combined = df_combined.reset_index()

  df_combined = pd.DataFrame(df_train.append(df_test))


## 2. Canonicalizing smiles from molecules

In [44]:
# creating molecules from smiles
df_combined['molecule'] = df_combined['smiles'].apply(Chem.MolFromSmiles)

# creating canonical smiles from molecules
df_combined['smiles'] = df_combined['molecule'].apply(Chem.MolToSmiles)

KeyboardInterrupt: 

## 3. Standardising molecules

In [None]:
df_combined['molecule'] = df_combined['smiles'].apply(transform)

## 4. Extraction of MorganFP and rdkit descriptors

In [None]:
df_combined_copy = df_combined.copy(deep=True)

#### generating MorganFP

In [None]:
df_combined_copy["FP"] = df_combined_copy["molecule"].apply(MFP_generator)
#df_combined_copy

#### reformating morgan finger print bits into seperate columns

In [None]:
# generate a Numpy array of sample size and fps
fp_arr = np.stack(df_combined_copy["FP"])

#convert to df
df_fp = pd.DataFrame(fp_arr)

#### Descriptors

In [None]:
if AllData:
    df_combined_copy["desc"] = df_combined_copy["molecule"].apply(desc_generator)
    
    # generate a Numpy array of sample size and fps
    desc_arr = np.stack(df_combined_copy["desc"])

    #convert to df
    df_desc = pd.DataFrame(desc_arr)

#### resetting index and labeling, optional : caculating descriptors

In [None]:
# adding MorganFP
df_fp = df_fp.reset_index(drop=True)
df_fp = rename(df_fp, 'ecfp_')

# adding rdkit descriptors
if AllData:
    df_desc = df_desc.reset_index(drop=True)
    df_desc.columns=desc_name_generator()
    df_desc = rename(df_desc, 'rdkit_desc_')
#df_desc

## 5. Combining data sets

In [None]:
# Combined data with only MorganFP
df_combined = pd.concat([df_combined, df_fp], axis=1)

# Adding example descriptors
if use_example:
    #df_combined = pd.concat([df_combined, df_example], axis=1)
    df_combined = pd.merge(df_combined, df_example, on='Id', how='inner')

# complete data set by adding rdkit descriptors
if AllData:
    df_AllData = pd.concat([df_combined, df_desc], axis=1)
    df_AllData.drop(columns=['index','molecule'], inplace=True)

#### delete columns with unique values

In [None]:
if delun:
    if AllData:
        len_un = len(df_AllData.columns)
        df_AllData = df_AllData.loc[:,df_AllData.nunique()!=1]
        num_drop_unique = len_un - len(df_AllData.columns)
        print(f'number of dropped collumns with identical values: {num_drop_unique}')
    else:
        len_un = len(df_combined.columns)
        df_combined = df_combined.loc[:,df_combined.nunique()!=1]
        num_drop_unique = len_un - len(df_combined.columns)
        print(f'number of dropped collumns with identical values: {num_drop_unique}')

## 6. Separating train and test data sets on dummy value

In [None]:
df_train_set = pd.DataFrame()
df_test_set = pd.DataFrame()

# complete data set with MorganFP and rdkit descriptors
if AllData == True:
    df_train_set = df_AllData[df_AllData['sol_category'] <= 4]
    df_test_set = df_AllData[df_AllData['sol_category'] == 5]

# data set with only MorganFP    
elif AllData == False:
    df_train_set = df_combined[df_combined['sol_category'] <= 4]
    df_test_set = df_combined[df_combined['sol_category'] == 5]

# Restructuring test data
df_test_set.drop(columns='sol_category', inplace=True)
df_test_set.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_set.drop(columns='sol_category', inplace=True)


Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_fr_sulfide,rdkit_desc_fr_sulfonamd,rdkit_desc_fr_sulfone,rdkit_desc_fr_term_acetylene,rdkit_desc_fr_tetrazole,rdkit_desc_fr_thiazole,rdkit_desc_fr_thiocyan,rdkit_desc_fr_thiophene,rdkit_desc_fr_unbrch_alkane,rdkit_desc_fr_urea


## 7. Saving data sets as csv files

In [None]:
if save:
    if test_run == False:
        df_train_set.to_csv('df_train_set.csv', index=False)
        df_test_set.to_csv('df_test_set.csv', index=False)
    else:
        print('test run')