# Data curation

In [1]:
# to do
# 0. combine train and test set - done
# 1. canonicalize smiles from Molecule - done
# 2. standardise molecules - done
# 3. extract MorganFP - done
# 4. calculate rdkit descriptors - done
# 5. export pandas dataframe as .csv, split train, test back - done

In [2]:
# 2 csvs: Train and test - done
# Columns: smiles, (example_...), (ecfp_...), (rdkit_desc_...) - done
# Export csv to Temp

In [3]:
# Remove columns Index, Id and molecule
# Reduce the number of intermediate DataFrame copies

In [4]:
import os

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.rdBase import BlockLogs
from tqdm import tqdm

In [5]:
# Silence pandas' SettingWithCopyWarning ('A value is trying to be set on a copy
# of a slice from a DataFrame') for the whole notebook.
# The option defaults to 'warn'; None switches the check off.
pd.set_option("mode.chained_assignment", None)

In [6]:
# Feature-set switch, read when the data sets are combined and split:
# True: complete data set with MorganFP and rdkit descriptors (df_AllData)
# False: complete data set only with MorganFP (df_combined)

AllData = True
#AllData = False

In [7]:
# Quick-run switch:
# True: only the first 10 rows of the stacked train+test table are processed
#       (train rows come first, so the exported test set can end up empty)
# False: everything

test_run = True
#test_run = False

In [8]:
# True: merge the example predictors (example_* columns) into the combined
#       table on 'Id', so they also end up in df_AllData
# False: leave the example predictors out

use_example = True
#use_example = False

## Function collection

#### Renaming function

In [9]:
def rename(df_name, new_name):
    """Return a deep copy of ``df_name`` with a prefix added to every column label.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Source frame; it is not modified.
    new_name : str
        Prefix placed in front of ``str(label)`` for each column label.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the data whose column labels are all strings.
    """
    df = df_name.copy(deep=True)
    # Relabel all columns in a single pass. Renaming one column at a time in
    # place would re-rename a freshly prefixed label whenever it equals a later
    # original label (e.g. 'a' -> 'xa' followed by 'xa' -> 'xxa').
    df.columns = df_name.columns.map(lambda label: new_name + str(label))
    return df

#### Molecule standardizer

In [10]:
def transform(smiles, neutralize=True):
    """Parse a SMILES string and return the standardized RDKit molecule.

    Steps: ``rdMolStandardize.Cleanup`` (normalizing functional groups, see
    https://molvs.readthedocs.io/en/latest/guide/standardize.html), then
    ``rdMolStandardize.FragmentParent`` (parent fragment), then optionally
    ``rdMolStandardize.Uncharger`` (neutralization).

    Parameters
    ----------
    smiles : str
        SMILES string of the input molecule.
    neutralize : bool, default True
        If True, run the Uncharger on the parent fragment.

    Returns
    -------
    rdkit.Chem.rdchem.Mol
        The standardized (and, if requested, uncharged) molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; stop here
        # with the offending input instead of failing inside the standardizer.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

    block = BlockLogs()  # Block all RDkit logging
    try:
        # Normalizing functional groups
        clean_mol = rdMolStandardize.Cleanup(mol)
        # Get parents fragments
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # Neutralise
        if neutralize:
            uncharger = rdMolStandardize.Uncharger()
            return uncharger.uncharge(parent_clean_mol)
        return parent_clean_mol
    finally:
        # Release logging block to previous state, even if standardization raised
        del block

#### Descriptor calculator

In [11]:
def desc_generator(mol):
    """Evaluate every descriptor in ``Chem.Descriptors.descList`` on ``mol``.

    Each ``descList`` entry is indexed at position 1 to obtain the callable,
    which is then applied to the molecule.

    Returns
    -------
    list
        One value per ``descList`` entry, in ``descList`` order.
    """
    return [entry[1](mol) for entry in Chem.Descriptors.descList]

## 0. Reading data files

In [12]:
# Load the labelled training table (path is relative to the notebook's working directory)
df_train = pd.read_csv("Data/train.csv")

In [13]:
# Number of training rows per solubility category (class balance check)
df_train["sol_category"].value_counts()

2    65835
1     2835
0     2041
Name: sol_category, dtype: int64

In [14]:
# Preview the raw training table
df_train

Unnamed: 0,Id,smiles,sol_category
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0
...,...,...,...
70706,EOS37839,O=C(NCCCc1nc(=O)[nH][nH]1)[C@H]1CCC(F)(F)C1,2
70707,EOS2088,Cc1ccc(C(=O)NC2CCCC2)cc1S(=O)(=O)N1CCOCC1,2
70708,EOS10587,COCCN1CCC(CN(C)S(=O)(=O)c2cccc(C(F)(F)F)c2)C1,2
70709,EOS40533,O=C(Nc1ccc(F)cc1)NC1CCN(C(=O)Cc2cnn(-c3ccccc3)...,2


In [15]:
# Load the example predictors without their 'smiles' column
example = pd.read_csv("Data/example_predictors.csv").drop(columns='smiles')

# Prefix every column with 'example_' (full data set, no row subsetting here) ...
df_example = rename(example, 'example_')

# ... then give the first column the plain name 'Id' again so it can serve as the join key
first_column = df_example.columns[0]
df_example = df_example.rename(columns={first_column: 'Id'})

In [16]:
# Preview the prefixed example predictors
df_example

Unnamed: 0,Id,example_SlogP,example_SMR,example_LabuteASA,example_TPSA,example_AMW,example_ExactMW,example_NumLipinskiHBA,example_NumLipinskiHBD,example_NumRotatableBonds,...,example_VABC Volume Descriptor,example_Largest Chain,example_Largest Pi Chain,example_Petitjean Number,example_Lipinski's Rule of Five,example_Topological Polar Surface Area,example_Vertex adjacency information magnitude,example_XLogP,example_Zagreb Index,example_Rotatable Bonds Count (non terminal)
0,EOS2465,3.23142,102.1077,154.878833,44.81,365.543,365.213698,5,1,6,...,350.381172,7.0,5.0,0.461538,0.0,73.05,5.754888,1.287,126.0,6.0
1,EOS2466,1.39690,100.4422,159.722714,92.49,384.465,384.136845,8,1,5,...,318.736716,4.0,12.0,0.466667,0.0,120.73,5.906891,1.928,142.0,5.0
2,EOS2467,3.27310,101.6217,159.819085,63.27,368.481,368.221226,6,1,6,...,336.575043,2.0,7.0,0.500000,0.0,63.27,5.954196,3.070,152.0,6.0
3,EOS2468,1.49100,88.3935,134.646129,81.06,333.413,333.114712,6,1,6,...,288.474069,6.0,9.0,0.500000,0.0,89.44,5.643856,1.617,120.0,6.0
4,EOS2469,2.12884,95.8984,143.557043,111.89,359.407,359.093977,7,3,3,...,309.287363,3.0,22.0,0.500000,0.0,112.75,5.754888,1.615,136.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101013,EOS102460,2.28350,81.4081,119.364483,55.12,268.360,268.157563,3,3,7,...,268.403816,7.0,6.0,0.500000,0.0,55.12,5.392317,2.194,94.0,7.0
101014,EOS102461,-0.24220,50.7516,85.204577,124.01,213.189,213.063723,6,6,3,...,185.365920,4.0,9.0,0.500000,0.0,124.01,4.906891,-2.436,72.0,3.0
101015,EOS102462,1.68840,63.1114,97.015055,82.76,229.243,229.096360,6,2,2,...,176.998189,2.0,15.0,0.428571,0.0,82.76,5.247928,2.405,92.0,2.0
101016,EOS102463,3.26460,94.9408,145.358449,58.36,358.269,357.101082,5,1,9,...,306.589871,7.0,10.0,0.461538,0.0,58.36,5.584963,3.324,112.0,9.0


In [17]:
# Tag every test row with the placeholder category 5 (training labels are 0-2),
# so the rows can be separated again after train and test have been stacked.
df_test = pd.read_csv("Data/test.csv").assign(sol_category=5)

In [18]:
#df_test

## 1. Combining train and test set

In [19]:
# partition (top 10)
# Stack the training rows on top of the test rows.
# pd.concat replaces DataFrame.append, which is deprecated (it triggers the
# FutureWarning shown in this cell's output) and was removed in pandas 2.0.
df_combined = pd.concat([df_train, df_test])

# partition (top 10); otherwise the full data set is kept
if test_run:
    df_combined = df_combined.head(10)

# resetting index; the previous row labels are kept in a new 'index' column
df_combined = df_combined.reset_index()

  df_comb = pd.DataFrame(df_train.append(df_test))


In [20]:
# Preview the stacked train+test table
df_combined

Unnamed: 0,index,Id,smiles,sol_category
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0
5,5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0
6,6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0
7,7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0
8,8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0
9,9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0


In [21]:
#df_mB

## 2. Canonicalizing smiles from molecules

In [22]:
# creating molecules
df_combined['molecule'] = df_combined['smiles'].apply(Chem.MolFromSmiles)

# canonical smiles
df_combined['smiles'] = df_combined['molecule'].apply(Chem.MolToSmiles)

In [23]:
#df_canon

## 3. Standardising molecules

In [24]:
# Overwrite the parsed molecules with the output of transform(), applied to
# each canonical SMILES
df_combined['molecule'] = df_combined['smiles'].map(transform)

In [25]:
#df_standard['molecule'][1]

## 4. Extraction of MorganFP and rdkit descriptors

In [26]:
# Both feature tables are built once, after the loop. Growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rows on every iteration,
# which makes the run time quadratic in the number of molecules.
N_BITS = 4096  # length of the Morgan fingerprint bit vector

molecules = df_combined['molecule']
fp_matrix = np.zeros((len(molecules), N_BITS), dtype=np.int64)  # one row per molecule
desc_rows = []

for row, mol in enumerate(tqdm(molecules)):
    # MorganFP
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=N_BITS)
    fp_matrix[row] = list(fp)

    # rdkit descriptors
    desc_rows.append(desc_generator(mol))

# Columns are labelled 0..N_BITS-1 and 0..(number of descriptors - 1)
MFP_df = pd.DataFrame(fp_matrix)
desc_df = pd.DataFrame(desc_rows)

100%|██████████| 10/10 [00:00<00:00, 16.31it/s]


#### resetting index and labeling

In [27]:
# MorganFP
MFP_df = MFP_df.reset_index(drop=True)
MFP_df = rename(MFP_df, 'ecfp_')

# rdkit descriptors
desc_df = desc_df.reset_index(drop=True)
desc_df = rename(desc_df, 'rdkit_desc_')

## 5. Combining data sets

In [28]:
# Combined data with only MorganFP
df_combined = pd.concat([df_combined, MFP_df], axis=1)


# Adding example descriptors
if use_example:
    #df_combined = pd.concat([df_combined, df_example], axis=1)
    df_combined = pd.merge(df_combined, df_example, on='Id', how='inner')

# complete data set by adding rdkit descriptors
if AllData:
    df_AllData = pd.concat([df_combined, desc_df], axis=1)
    df_AllData.drop(columns=['index','molecule'], inplace=True)


In [29]:
# NOTE(review): an expression nested inside an `if` is not echoed as cell
# output, so this cell displays nothing either way; the next cell shows
# df_combined unconditionally.
if AllData == False:
    df_combined

In [30]:
# Preview the MorganFP table (still contains the helper columns 'index' and 'molecule')
df_combined

Unnamed: 0,index,Id,smiles,sol_category,molecule,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,...,example_VABC Volume Descriptor,example_Largest Chain,example_Largest Pi Chain,example_Petitjean Number,example_Lipinski's Rule of Five,example_Topological Polar Surface Area,example_Vertex adjacency information magnitude,example_XLogP,example_Zagreb Index,example_Rotatable Bonds Count (non terminal)
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,<rdkit.Chem.rdchem.Mol object at 0x18306b4a0>,0,0,0,0,0,...,329.228869,3.0,12.0,0.5,0.0,79.18,6.0,2.599,154.0,3.0
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,<rdkit.Chem.rdchem.Mol object at 0x18306b580>,0,0,0,0,0,...,319.355435,5.0,5.0,0.5,0.0,64.68,5.523562,1.417,110.0,6.0
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,<rdkit.Chem.rdchem.Mol object at 0x18306b5f0>,0,0,0,0,0,...,257.448332,7.0,13.0,0.5,0.0,91.37,5.392317,0.645,100.0,4.0
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x18306b660>,0,0,0,0,0,...,350.252336,3.0,7.0,0.461538,1.0,57.97,5.643856,6.034,126.0,5.0
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,<rdkit.Chem.rdchem.Mol object at 0x18306b6d0>,0,0,0,0,0,...,305.250156,6.0,14.0,0.5,0.0,69.04,5.754888,3.222,126.0,6.0
5,5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,<rdkit.Chem.rdchem.Mol object at 0x18306b740>,0,0,0,0,0,...,280.145319,4.0,11.0,0.5,0.0,98.0,5.70044,3.44,122.0,5.0
6,6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,<rdkit.Chem.rdchem.Mol object at 0x18306b7b0>,0,0,0,0,0,...,365.276245,5.0,8.0,0.5,0.0,113.96,6.0,0.966,158.0,5.0
7,7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,<rdkit.Chem.rdchem.Mol object at 0x18306b820>,0,0,0,0,0,...,347.751275,2.0,8.0,0.466667,0.0,53.01,5.906891,2.399,144.0,5.0
8,8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,<rdkit.Chem.rdchem.Mol object at 0x18306b890>,0,0,0,0,0,...,317.986415,5.0,13.0,0.5,0.0,117.79,5.906891,1.167,144.0,5.0
9,9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,<rdkit.Chem.rdchem.Mol object at 0x18306b510>,0,0,0,0,0,...,340.506394,3.0,26.0,0.461538,0.0,117.13,6.087463,2.174,170.0,4.0


In [31]:
# Preview the complete table; df_AllData is only created when AllData is True
df_AllData

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## 6. Separating train and test data sets

In [32]:
# Empty placeholders; both names are reassigned in the next cell
df_train_set = pd.DataFrame()
df_test_set = pd.DataFrame()

In [33]:
# complete data set with MorganFP and rdkit descriptors
if AllData == True:
    df_train_set = df_AllData[df_AllData['sol_category'] <= 4]
    df_test_set = df_AllData[df_AllData['sol_category'] == 5]

# data set with only MorganFP    
elif AllData == False:
    df_train_set = df_combined[df_combined['sol_category'] <= 4]
    df_test_set = df_combined[df_combined['sol_category'] == 5]

# Restructuring test data
df_test_set.drop(columns='sol_category', inplace=True)
df_test_set.reset_index(drop=True)

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207


#### Printing data files

In [34]:
# Label the output, then display the training split
print('df_train_set')
df_train_set

df_train_set


Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [35]:
# Label the output, then display the test split
print('df_test_set')
df_test_set

df_test_set


Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207


## 7. Saving data sets as csv files

In [36]:
# to_csv does not create missing folders; make sure the target directory exists
# so the run does not fail at the very last step.
os.makedirs('Temp', exist_ok=True)

# index=False: the row index is not written as a column
df_train_set.to_csv('Temp/df_train_set.csv', index=False)
df_test_set.to_csv('Temp/df_test_set.csv', index=False)

#### Reading saved csv files

In [37]:
# Read the exported training file back in as a check of what was written
df_train_read = pd.read_csv("Temp/df_train_set.csv")
df_train_read

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [38]:
# Read the exported test file back in as a check of what was written
df_test_read = pd.read_csv("Temp/df_test_set.csv")
df_test_read

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
