In [34]:
import pandas as pd
import numpy as np
import time
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

In [35]:
def get_mols(smiles):
        try:
            mol = Chem.MolFromSmiles(smiles)
#             print(".",end="")
        except Exception as e:
            print(e,'\t',smiles,'\n')
            mol = None
        return mol

In [36]:
def get_morganfps(m):
        try:
            morganfps = list()
            mfps = AllChem.GetMorganFingerprintAsBitVect(m,2, nBits=1024)
            bytes = DataStructs.cDataStructs.BitVectToText(mfps)
            for byte in bytes:
                morganfps.append(byte)
#             print(".",end="")
        except Exception as e:
            print(e,'\t',m,'\n')
            morganfps = None
        return morganfps

In [37]:
def process(df,label):
    begin = time.time()
    smiles_list = pd.Series(list(df['Canonical_SMILES']))
    mfps_list = list()
    df['Mol_Objects'] = smiles_list.map(get_mols)
    mol_list = pd.Series(list(df['Mol_Objects']))
    for mol in mol_list:
        temp_list = list()
        for i in range(1024):
            temp_list.append(get_morganfps(mol)[i])
        mfps_list.append(temp_list)
    df_temp = pd.DataFrame([x for x in mfps_list])
    df = pd.concat([df,df_temp],axis=1)
    end = time.time()
    elapsed = end - begin
    print(elapsed)
    return df

In [38]:
df = pd.read_csv('./DILI/final/DILI_Final.csv')
df

Unnamed: 0,ID,Canonical_SMILES,DILI
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0
...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0


In [39]:
df = process(df,'DILI')
df

397.55951976776123


Unnamed: 0,ID,Canonical_SMILES,DILI,Mol_Objects,0,1,2,3,4,5,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1,"<img data-content=""rdkit/molecule"" src=""data:i...",0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,1,1,1,...,0,0,0,0,0,1,0,0,0,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0,"<img data-content=""rdkit/molecule"" src=""data:i...",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
df_a=df[['ID','Canonical_SMILES','DILI']]
df_b=df.loc[:,0:1023]
recolumn = []
for i in range(1024):
    recolumn.append(f'MFPS_{i+1}')
df_b.columns = recolumn
df = pd.concat([df_a,df_b],axis=1)
df

Unnamed: 0,ID,Canonical_SMILES,DILI,MFPS_1,MFPS_2,MFPS_3,MFPS_4,MFPS_5,MFPS_6,MFPS_7,...,MFPS_1015,MFPS_1016,MFPS_1017,MFPS_1018,MFPS_1019,MFPS_1020,MFPS_1021,MFPS_1022,MFPS_1023,MFPS_1024
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,0,0,0,1,1,1,0,...,0,0,0,0,0,1,0,0,0,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
df.to_csv('./DILI/final/DILI_MFPS.csv')