In [14]:
import pandas as pd
import numpy as np
import time
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

In [15]:
def get_fingerprints(smiles):
        try:
            fps = Chem.MolFromSmiles(smiles)
#             print(".",end="")
        except Exception as e:
            print(e,'\t',smiles,'\n')
            fps = None
        return fps

In [16]:
def get_morganfps(fps):
        try:
            morganfps = list()
            mfps = AllChem.GetMorganFingerprintAsBitVect(fps,2)
            bytes = DataStructs.cDataStructs.BitVectToText(mfps)
            for byte in bytes:
                morganfps.append(byte)
#             print(".",end="")
        except Exception as e:
            print(e,'\t',fps,'\n')
            morganfps = None
        return morganfps

In [17]:
def process(df,label):
    begin = time.time()
    smiles_list = pd.Series(list(df['Canonical_SMILES']))
    mfps_list = list()
    df['Fingerprints'] = smiles_list.map(get_fingerprints)
    fps_list = pd.Series(list(df['Fingerprints']))
    for fps in fps_list:
        temp_list = list()
        for i in range(2048):
            temp_list.append(get_morganfps(fps)[i])
        mfps_list.append(temp_list)
    df_temp = pd.DataFrame([x for x in mfps_list])
    df = pd.concat([df,df_temp],axis=1)
    end = time.time()
    elapsed = end - begin
    print(elapsed)
    return df

In [7]:
df = pd.read_csv('./DILI/final/DILI_Final.csv')
df

Unnamed: 0,ID,Canonical_SMILES,DILI
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0
...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0


In [8]:
df = process(df,'DILI')
df

1069.66073513031


Unnamed: 0,ID,Canonical_SMILES,DILI,Fingerprints,0,1,2,3,4,5,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1,"<img data-content=""rdkit/molecule"" src=""data:i...",0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_a=df[['ID','Canonical_SMILES','DILI']]
df_b=df.loc[:,0:2047]
df = pd.concat([df_a,df_b],axis=1)
df

Unnamed: 0,ID,Canonical_SMILES,DILI,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df.to_csv('./DILI/final/DILI_MFPS.csv')