In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the training table and discard the 'Unnamed: 0' column
# (presumably an index saved with the CSV -- confirm against the file).
trainDF = pd.read_csv(
    '/ihome/gidakwo/ml_files/Tox21_Restart/trainDF.csv',
    header=0,
).drop(columns='Unnamed: 0')
trainDF.head()

Unnamed: 0,smiles_parent,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,Br,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BrC(Br)Br,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BrC(Br)C(Br)(Br)Br,0.0,0.0,,,0.0,0.0,0.0,,0.0,,0.0,1.0
3,BrC(Br)C(Br)Br,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,0.0,0.0,0.0,,,,,,0.0,,1.0,


In [5]:
# Read the test table and discard the 'Unnamed: 0' column
# (presumably an index saved with the CSV -- confirm against the file).
testDF = pd.read_csv(
    '/ihome/gidakwo/ml_files/Tox21_Restart/testDF.csv',
    header=0,
).drop(columns='Unnamed: 0')
testDF.head()

Unnamed: 0,smiles_parent,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,BrCC(Br)c1ccccc1,0.0,0.0,,0.0,,,0.0,,0.0,0.0,0.0,0.0
1,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Brc1c(Br)c(Br)c2[nH]nnc2c1Br,0.0,,0.0,0.0,0.0,0.0,,,0.0,0.0,,
3,Brc1cc(Br)cc(Br)c1,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0
4,Brc1ccc(C2CN3CCSC3=N2)cc1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0


In [10]:
# Sanity check: show the value in the first row, first column (the SMILES string).
testDF.iloc[0, 0]

'BrCC(Br)c1ccccc1'

In [13]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

# Name of every entry in RDKit's Descriptors._descList (the full descriptor list).
nms = [entry[0] for entry in Descriptors._descList]

# Single calculator object that evaluates all of the descriptors named above.
calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)

# Outcome columns that are carried through to the featurized table.
keep_cols = [
    'NR-AR',
    'NR-AR-LBD',
    'NR-AhR',
    'NR-Aromatase',
    'NR-ER',
    'NR-ER-LBD',
    'NR-PPAR-gamma',
    'SR-ARE',
    'SR-ATAD5',
    'SR-HSE',
    'SR-MMP',
    'SR-p53',
]

def featurize(data, radius=2, n_bits=1024):
    """Build descriptor, Morgan-fingerprint and MACCS features for each SMILES row.

    Parameters
    ----------
    data : 2-D array
        One row per compound, read as ``data[i, 0]`` for the SMILES string and
        ``data[i, 1:13]`` for the outcome values that get labelled with
        ``keep_cols`` (the callers pass ``DataFrame.values``).
    radius : int, default 2
        Radius passed to the Morgan fingerprint.
    n_bits : int, default 1024
        Length of the Morgan bit vector.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the descriptor block, the Morgan bit
        block, the MACCS block and the outcome columns, with one row per
        SMILES that RDKit could parse.

    Notes
    -----
    Rows whose SMILES cannot be parsed are reported on stdout and skipped.
    The three feature blocks keep pandas' default integer column labels, so
    labels repeat across blocks in the returned frame.
    NOTE(review): naming those columns would make the CSV header unambiguous,
    but would change the header downstream readers currently see -- confirm
    before changing.
    """
    mols = []
    X_mdesc = []
    X_fp = []
    X_maccs = []
    y = []
    for i in range(len(data)):
        smiles = data[i, 0]
        outcome = data[i, 1:13]

        # Keep the SMILES text and the parsed molecule under separate names so
        # that a parse failure can still report which input was rejected
        # (previously the message always printed None).
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print("Failed for ID: %s"%smiles)
            continue

        desc = calc.CalcDescriptors(mol)
        maccs = MACCSkeys.GenMACCSKeys(mol)

        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        # Placeholder array that ConvertToNumpyArray fills with the fingerprint bits.
        fp_vect = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, fp_vect)

        mols.append(mol)
        X_mdesc.append(desc)
        X_fp.append(fp_vect)
        X_maccs.append(maccs)
        y.append(outcome)

    print("Imported smiles: %s"%len(data))
    print("Smiles converted to MolDesc: %s"%len(mols))
    X1 = pd.DataFrame(np.asarray(X_mdesc))  # descriptors
    X2 = pd.DataFrame(np.asarray(X_fp))  # Morgan FP
    X3 = pd.DataFrame(np.asarray(X_maccs))  # MACCS FP

    y = pd.DataFrame(np.asarray(y), columns = keep_cols)
    # All four frames are built from lists appended in lockstep, so they share
    # the same default row index and line up row-for-row.
    X_ = pd.concat([X1, X2, X3, y], axis = 1)
    return X_

In [14]:
# Featurize the test table first, then the training table; each call prints
# how many SMILES were read and how many were converted.
testDF_fp = featurize(data=testDF.to_numpy())
trainDF_fp = featurize(data=trainDF.to_numpy())

Imported smiles: 627
Smiles converted to MolDesc: 627
Imported smiles: 7683
Smiles converted to MolDesc: 7683


In [15]:
# Write both featurized tables to CSV in the working directory, omitting the row index.
testDF_fp.to_csv(path_or_buf='testDF_rdkit.csv', index=False)
trainDF_fp.to_csv(path_or_buf='trainDF_rdkit.csv', index=False)