# Data curation

In [1]:
# to do
# 0. combine train and test set - done
# 1. canonicalize smiles from Molecule - done
# 2. standardise molecules - done
# 3. extract MorganFP - done
# 4. calculate rdkit descriptors - done
# 5. export pandas dataframe as .csv, split train, test back - done

In [2]:
# 2 csvs: Train and test - done
# Columns: smiles, (example_...), (ecfp_...), (rdkit_desc_...) - done
# Export csv to Temp

In [3]:
# Remove columns Index, Id and molecule
# reducing copy DataFrames

In [4]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.rdBase import BlockLogs
from rdkit.Chem.MolStandardize import rdMolStandardize

In [5]:
# google how to supress the warning below ('A value is trying to be set on a copy of a slice from a DataFrame')

# removing warning
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
# True: complete data set with MorganFP and rdkit descriptors
# False: complete data set only with MorganFP

AllData = True

## Function collection

#### Renaming function

In [7]:
def rename(df_name, new_name):
    df = df_name.copy(deep=True)
    for i in range(len(df_name.columns)):
        name = new_name + str(df_name.columns[i])
        df.rename(columns={df_name.columns[i] : name}, inplace=True)
    return df

#### Molecule standardizer

In [8]:
def transform(smiles, neutralize=True):
        mol = Chem.MolFromSmiles(smiles)
        block = BlockLogs() # Block all RDkit logging
        
        # Normalizing functional groups
        # https://molvs.readthedocs.io/en/latest/guide/standardize.html
        clean_mol = rdMolStandardize.Cleanup(mol) 
        # Get parents fragments
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # Neutralise
        if neutralize:
            uncharger = rdMolStandardize.Uncharger()
            uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
        else:
            uncharged_parent_clean_mol = parent_clean_mol
        del block # Release logging block to previous state
        return(uncharged_parent_clean_mol)

#### Descriptor calculator

In [9]:
def desc_generator(mol):
    desc_list = []
    for i in range(len(Chem.Descriptors.descList)):
        desc_list.append(Chem.Descriptors.descList[i][1](mol))
    return desc_list

## 0. Reading data files

In [10]:
df_train = pd.read_csv("Data/train.csv")

In [11]:
df_train["sol_category"].value_counts()

2    65835
1     2835
0     2041
Name: sol_category, dtype: int64

In [12]:
example = pd.read_csv("Data/example_predictors.csv")
df_example = rename(example.head(5), 'example_')

In [13]:
#df_example

In [14]:
df_test = pd.read_csv("Data/test.csv")
df_test["sol_category"] = 5

In [15]:
#df_test

## 1. Combining train and test set

In [16]:
# partition
df_comb = pd.DataFrame(df_train.append(df_test))
df_combined = df_comb.head(10)

# full data set
#df_combined = pd.DataFrame(df_train.append(df_test))

# resetting index
df_combined = df_combined.reset_index()

  df_comb = pd.DataFrame(df_train.append(df_test))


In [17]:
#bdf_combined

In [18]:
#df_mB

## 2. Canonicalizing smiles from molecules

In [19]:
# creating molecules
df_combined['molecule'] = df_combined['smiles'].apply(Chem.MolFromSmiles)

# canonical smiles
df_combined['smiles'] = df_combined['molecule'].apply(Chem.MolToSmiles)

In [20]:
#df_canon

## 3. Standardising molecules

In [21]:
df_combined['molecule'] = df_combined['smiles'].apply(transform)

In [22]:
#df_standard['molecule'][1]

## 4. Extraction of MorganFP and rdkit descriptors (try to do without for loop, since it's slow. Numpy and pandas funciton are built on c++, which is much faster)

In [23]:
MFP_df = pd.DataFrame()
desc_df = pd.DataFrame()

"""
for mol in df_combined['molecule']:
    # MorganFP
    MFP_vect = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=4096)]
    MFP_list = [list(l) for l in MFP_vect]
    MFP_df = pd.concat([MFP_df,pd.DataFrame(MFP_list)])

    # rdkit descriptors
    desc_vect = [desc_generator(mol)]
    desc_list = [list(l) for l in desc_vect]
"""



"\nfor mol in df_combined['molecule']:\n    # MorganFP\n    MFP_vect = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=4096)]\n    MFP_list = [list(l) for l in MFP_vect]\n    MFP_df = pd.concat([MFP_df,pd.DataFrame(MFP_list)])\n\n    # rdkit descriptors\n    desc_vect = [desc_generator(mol)]\n    desc_list = [list(l) for l in desc_vect]\n"

In [24]:
df_combined_copy = df_combined.copy(deep=True)
df_combined_copy

Unnamed: 0,index,Id,smiles,sol_category,molecule
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a490>
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a500>
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a570>
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a5e0>
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a650>
5,5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a6c0>
6,6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a730>
7,7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a7a0>
8,8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a810>
9,9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a880>


In [25]:
def dummy_df(mol):
    return(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=4096))
df_combined_copy["FP"] = df_combined_copy["molecule"].apply(dummy_df)



In [26]:
df_combined_copy

Unnamed: 0,index,Id,smiles,sol_category,molecule,FP
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a490>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a500>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a570>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a5e0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a650>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a6c0>,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
6,6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a730>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
7,7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a7a0>,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a810>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a880>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [27]:
fp_arr = np.stack(df_combined_copy["FP"]) #this will generate a Numpy array of sample size and fps

In [28]:
fp_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [29]:
#convert to df
df_fp = pd.DataFrame(fp_arr)
df_fp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
#rename cols more intuitively
df_fp.columns = [f"FP_bit_{i}" for i in range(1, 4097)]
df_fp.head()

Unnamed: 0,FP_bit_1,FP_bit_2,FP_bit_3,FP_bit_4,FP_bit_5,FP_bit_6,FP_bit_7,FP_bit_8,FP_bit_9,FP_bit_10,...,FP_bit_4087,FP_bit_4088,FP_bit_4089,FP_bit_4090,FP_bit_4091,FP_bit_4092,FP_bit_4093,FP_bit_4094,FP_bit_4095,FP_bit_4096
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df_combined_copy

Unnamed: 0,index,Id,smiles,sol_category,molecule,FP
0,0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a490>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a500>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a570>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a5e0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a650>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a6c0>,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
6,6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a730>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
7,7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a7a0>,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a810>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,<rdkit.Chem.rdchem.Mol object at 0x18d45a880>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#### removing index and labeling

In [None]:
# MorganFP
MFP_df = MFP_df.reset_index(drop=True)
MFP_df = rename(MFP_df, 'ecfp_')

# rdkit descriptors
desc_df = desc_df.reset_index(drop=True)
desc_df = rename(desc_df, 'rdkit_desc_')

## 5. Combining data sets

In [None]:
# Combined data with only MorganFP
df_combined = pd.concat([df_combined, MFP_df], axis=1)

# complete data set
df_AllData = pd.concat([df_combined, desc_df], axis=1)
df_AllData.drop(columns=['index','molecule'], inplace=True)


In [None]:
df_AllData

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## 6. Separating train and test data sets

In [None]:
df_train_set = pd.DataFrame()
df_test_set = pd.DataFrame()

In [None]:
# complete data set with MorganFP and rdkit descriptors
if AllData == True:
    df_train_set = df_AllData[df_AllData['sol_category'] <= 4]
    df_test_set = df_AllData[df_AllData['sol_category'] == 5]

# data set with only MorganFP    
elif AllData == False:
    df_train_set = df_combined[df_combined['sol_category'] <= 4]
    df_test_set = df_combined[df_combined['sol_category'] == 5]

# Restructuring test data
df_test_set.drop(columns='sol_category', inplace=True)
df_test_set.reset_index(drop=True)

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207


#### Printing data files

In [None]:
print('df_train_set')
df_train_set

df_train_set


Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
print('df_test_set')
df_test_set

df_test_set


Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207


## 7. Saving data sets as csv files

In [None]:
df_train_set.to_csv('Temp/df_train_set.csv', index=False)
df_test_set.to_csv('Temp/df_test_set.csv', index=False)

#### Reading saved csv files

In [None]:
df_train_read = pd.read_csv("Temp/df_train_set10.csv")
df_train_read

Unnamed: 0,Id,smiles,sol_category,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
0,EOS12286,Cc1nc(N2CCN(C(=O)Nc3ccc(F)cc3F)CC2)cc(-n2ccnc2)n1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,EOS85869,CCN(CC)[C@H]1CCN(C(=O)Cc2nc(C(C)C)c(C)s2)C1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,EOS85435,CNC(=O)CNC(=O)c1c(-n2cccc2)sc(C)c1C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,EOS102302,CC(C)(C)c1ccc(CSc2cnn(C(C)(C)C)c(=O)c2Cl)cc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,EOS64213,CC[C@H](NC(=O)c1ccnc(-n2ccnc2)c1)c1ccccc1OC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,EOS68602,Cn1c(SCC(=O)Nc2ccccc2F)nnc1-c1ccncc1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,EOS90323,CCN(Cc1ccc2c(c1)OCO2)C(=O)C1=NN([C@H]2CCS(=O)(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,EOS4636,COc1ccc(CN2C[C@@H]3CC[C@H](C2)N(Cc2ccccc2)C3=O...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,EOS72860,C[C@@H]1Oc2ccc(NC(=O)COCc3nc4ccccc4s3)cc2NC1=O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,EOS18993,COc1cccc(-c2nc3n(c2C(=O)Nc2ccc4c(c2)OCO4)CCS3(...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
df_test_read = pd.read_csv("Temp/df_test_set10.csv")
df_test_read

Unnamed: 0,Id,smiles,ecfp_0,ecfp_1,ecfp_2,ecfp_3,ecfp_4,ecfp_5,ecfp_6,ecfp_7,...,rdkit_desc_198,rdkit_desc_199,rdkit_desc_200,rdkit_desc_201,rdkit_desc_202,rdkit_desc_203,rdkit_desc_204,rdkit_desc_205,rdkit_desc_206,rdkit_desc_207
