In [1]:
import pandas as pd
import pubchempy as pcp
import time
from rdkit.Chem.MolStandardize.rdMolStandardize import StandardizeSmiles

In [2]:
def setID(df,num):
    id = []
    for i in range(len(df)):
        text=str(i+1)
        id.append(f'ref{num}_{text.zfill(5)}')
    df['ID'] = id
    df.reset_index(drop=True, inplace=True)
    return df

In [4]:
def standardize(smiles):
    if smiles != 'none':
        try:
            s_smiles = StandardizeSmiles(smiles)
            print(".",end="")
        except Exception as e:
            print(e,'\t',smiles,'\n')
            s_smiles = None
        return s_smiles

In [5]:
#DILI-Only Function
def get_smiles(name):
    if name != 'none':
        try:
            smiles = pcp.get_compounds(name, namespace='name', as_dataframe=True)['canonical_smiles'].values[0]
            print(".",end="")
        except Exception as e:
            print(e,'\t',name,'\n')
            smiles = 'none'
        return smiles

-----

## DILI

In [6]:
a = pd.read_excel('./DILI/data/ma.xlsx')
a = setID(a,1)[['ID','smile','dili']]
a.rename(columns={'smile':'SMILES','dili':'DILI'}, inplace = True)
a

Unnamed: 0,ID,SMILES,DILI
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c2ccccc2,1
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c2ccccc2)c3ccc(Cl)cc3,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc3ccc(Cl)cc13,1
3,ref1_00004,CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24,1
4,ref1_00005,NC(=O)C([C@@H]1CCN(CCc2ccc3OCCc3c2)C1)(c4ccccc...,0
...,...,...,...
474,ref1_00475,CC(=O)NC1C(C(C(OC1O)CO)OS(=O)(=O)[O-])OC2C(C(C...,1
475,ref1_00476,CCOC1C(CC(=O)O1)NC(=O)C2CCCN3N2C(=O)C(CCC3=O)N...,1
476,ref1_00477,CCCCN1C(=O)C(NC(=O)C12CCN(CC2)CC3=CC=C(C=C3)OC...,1
477,ref1_00478,CON=C(C1=CC=C(C=C1)C2=CC=C(O2)C3=CC=C(C=C3)C(=...,1


In [7]:
b = pd.read_excel('./DILI/data/li.xlsx')
b = setID(b,2)[['ID','CompoundName','DILIst Classification ']]
b.rename(columns = {'CompoundName':'Name','DILIst Classification ':'DILI'}, inplace = True)
b

Unnamed: 0,ID,Name,DILI
0,ref2_00001,mercaptopurine,1
1,ref2_00002,acetaminophen,1
2,ref2_00003,azathioprine,1
3,ref2_00004,chlorpheniramine,0
4,ref2_00005,clofibrate,1
...,...,...,...
1274,ref2_01275,Vindesine,1
1275,ref2_01276,Voglibose,1
1276,ref2_01277,Xylometazoline,1
1277,ref2_01278,Zaltoprofen,1


In [8]:
c1 = pd.read_csv('./DILI/data/ko_es.csv')
c1.rename(columns = {'mol_min':'SMILES'}, inplace = True)
c1 = c1.replace('positive', '1').replace('negative','0')

c2 = pd.read_csv('./DILI/data/ko_ts.csv')
c2.rename(columns = {'mol_min':'SMILES','Binary_Characterization':'DILI'}, inplace=True)

c = pd.concat([c1,c2])[['SMILES','DILI']]
c = setID(c,3)[['ID','SMILES','DILI']]
c

Unnamed: 0,ID,SMILES,DILI
0,ref3_00001,ClC(Cl)CCl,1
1,ref3_00002,N(N)(C)C,1
2,ref3_00003,ClC\C=C\Cl,1
3,ref3_00004,Oc1cc2c(cc1)cccc2,1
4,ref3_00005,OC(C#N)(C)C,1
...,...,...,...
806,ref3_00807,s1cccc1CC(=O)N[C@@]1(OC)C2SCC(COC(=O)N)=C(N2C1...,0
807,ref3_00808,s1cccc1CCN1CCC(N(C(=O)CC)c2ccccc2)(CC1)COC,0
808,ref3_00809,s1cccc1C\C(=C/c1n(Cc2ccc(cc2)C(O)=O)c(nc1)CCCC...,0
809,ref3_00810,s1cccc1\C=C\C1=NCCCN1C,0


In [9]:
d = pd.read_excel('./DILI/data/will.xlsx')
d = setID(d, 4)[['ID','Drug','DILI.severity.category']]
d.rename(columns = {'Drug':'Name','DILI.severity.category':'DILI'}, inplace=True)
d

Unnamed: 0,ID,Name,DILI
0,ref4_00001,Albuterol,0
1,ref4_00002,Alendronate,0
2,ref4_00003,Ambrisentan,0
3,ref4_00004,Amiodarone,1
4,ref4_00005,Amodiaquine,1
...,...,...,...
60,ref4_00061,Tolmetin,1
61,ref4_00062,Troglitazone,1
62,ref4_00063,Ximelagatran,1
63,ref4_00064,Zileuton,1


In [10]:
e = pd.read_excel('./DILI/data/kim.xlsx', sheet_name='Sheet2')
e = setID(e, 5)[['ID','COMPOUND_NAME','DILI_CONCERN']]
e.rename(columns= {'COMPOUND_NAME':'Name', 'DILI_CONCERN':'DILI'}, inplace=True)
e

Unnamed: 0,ID,Name,DILI
0,ref5_00001,mercaptopurine,1
1,ref5_00002,acetaminophen,1
2,ref5_00003,azathioprine,1
3,ref5_00004,chlorpheniramine,0
4,ref5_00005,dopamine,0
...,...,...,...
472,ref5_00473,orlistat,1
473,ref5_00474,apomorphine,0
474,ref5_00475,dihydroergotamine,0
475,ref5_00476,procyclidine,0


In [11]:
f = pd.read_excel('./DILI/data/zhang.xls')
f = f.iloc[0:-1]
f = setID(f, 6)[['ID','SMILES','Bioactivity*']]
f.rename(columns={'Bioactivity*':'DILI'},inplace=True)
f[['DILI']]=f[['DILI']].replace(1.0,'1').replace(0.0,'0')
f

Unnamed: 0,ID,SMILES,DILI
0,ref6_00001,[O-][N+](=O)c1cccc(c1)[N+]([O-])=O,1
1,ref6_00002,C[C@]12CC[C@H]3[C@@H](CCc4cc(OS(O)(=O)=O)ccc34...,1
2,ref6_00003,CC(C)CCCC(C)C1CCC2C(CCCC12C)=CC=C1CC(O)CC(O)C1=C,1
3,ref6_00004,CC(=O)Nc1ccc-2c(Cc3ccccc-23)c1,1
4,ref6_00005,OCC1OC(O)CC(O)C1O,1
...,...,...,...
1312,ref6_01313,CC1C(OCCN1C)c1ccccc1,0
1313,ref6_01314,CC(C)(N)Cc1ccccc1,0
1314,ref6_01315,OC(CCN1CCCC1)(C1CCCCC1)c1ccccc1,0
1315,ref6_01316,C[N+](C)(C)CCOC(=O)CCC(=O)OCC[N+](C)(C)C,0


In [12]:
dili = pd.concat([a,b,c,d,e,f])[['ID','Name','SMILES','DILI']]
dili = dili.fillna('none')
dili.to_csv('./DILI/temp/DILI_notProcessed.csv')
dili

Unnamed: 0,ID,Name,SMILES,DILI
0,ref1_00001,none,CNCC[C@@H](Oc1ccccc1C)c2ccccc2,1
1,ref1_00002,none,CN1CCC[C@@H]1CCO[C@](C)(c2ccccc2)c3ccc(Cl)cc3,0
2,ref1_00003,none,CN(C)CCCN1c2ccccc2CCc3ccc(Cl)cc13,1
3,ref1_00004,none,CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24,1
4,ref1_00005,none,NC(=O)C([C@@H]1CCN(CCc2ccc3OCCc3c2)C1)(c4ccccc...,0
...,...,...,...,...
1312,ref6_01313,none,CC1C(OCCN1C)c1ccccc1,0
1313,ref6_01314,none,CC(C)(N)Cc1ccccc1,0
1314,ref6_01315,none,OC(CCN1CCCC1)(C1CCCCC1)c1ccccc1,0
1315,ref6_01316,none,C[N+](C)(C)CCOC(=O)CCC(=O)OCC[N+](C)(C)C,0


In [13]:
smiles_dili = pd.concat([a,c,f])
smiles_dili.to_csv('./DILI/temp/SMILES_DILI.csv')
smiles_dili

Unnamed: 0,ID,SMILES,DILI
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c2ccccc2,1
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c2ccccc2)c3ccc(Cl)cc3,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc3ccc(Cl)cc13,1
3,ref1_00004,CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24,1
4,ref1_00005,NC(=O)C([C@@H]1CCN(CCc2ccc3OCCc3c2)C1)(c4ccccc...,0
...,...,...,...
1312,ref6_01313,CC1C(OCCN1C)c1ccccc1,0
1313,ref6_01314,CC(C)(N)Cc1ccccc1,0
1314,ref6_01315,OC(CCN1CCCC1)(C1CCCCC1)c1ccccc1,0
1315,ref6_01316,C[N+](C)(C)CCOC(=O)CCC(=O)OCC[N+](C)(C)C,0


In [14]:
name_dili = pd.concat([b,d,e])
name_dili.to_csv('./DILI/temp/Name_only_DILI.csv')
name_dili

Unnamed: 0,ID,Name,DILI
0,ref2_00001,mercaptopurine,1
1,ref2_00002,acetaminophen,1
2,ref2_00003,azathioprine,1
3,ref2_00004,chlorpheniramine,0
4,ref2_00005,clofibrate,1
...,...,...,...
472,ref5_00473,orlistat,1
473,ref5_00474,apomorphine,0
474,ref5_00475,dihydroergotamine,0
475,ref5_00476,procyclidine,0


In [15]:
df = pd.read_csv('./DILI/temp/Name_only_DILI.csv')

begin = time.time()
name_list = pd.Series(list(df['Name']))
df['SMILES'] = name_list.map(get_smiles)
end = time.time()

elapsed = end - begin

print('\n')
print(elapsed)

..................................................................................................................................................................................................................................'cid' 	 gemtuzumab ozogamicin 

..............................................................................'cid' 	 ferumoxytol 

............'cid' 	 pancrelipase 

..........'cid' 	 dermatan 

...............'cid' 	 protamine sulfate 

..................................................'cid' 	 nitroprusside 

.....................'cid' 	 pegademase bovine 

.......'cid' 	 dalteparin sodium 

'cid' 	 ferumoxsil 

.'cid' 	 porfimer 

.........................................................................................................................................................'cid' 	 urokinase 

...................................'cid' 	 botulinum toxin type a 

...'cid' 	 interferon alfa-2a, recombinant 

..'cid' 	 alteplase 

...'cid' 	 sargramostim 

..

In [16]:
df = df[['ID','SMILES','DILI']]
df

Unnamed: 0,ID,SMILES,DILI
0,ref2_00001,C1=NC2=C(N1)C(=S)N=CN2,1
1,ref2_00002,CC(=O)NC1=CC=C(C=C1)O,1
2,ref2_00003,CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-],1
3,ref2_00004,CN(C)CCC(C1=CC=C(C=C1)Cl)C2=CC=CC=N2,0
4,ref2_00005,CCOC(=O)C(C)(C)OC1=CC=C(C=C1)Cl,1
...,...,...,...
1816,ref5_00473,CCCCCCCCCCCC(CC1C(C(=O)O1)CCCCCC)OC(=O)C(CC(C)...,1
1817,ref5_00474,CN1CCC2=C3C1CC4=C(C3=CC=C2)C(=C(C=C4)O)O,0
1818,ref5_00475,CC1(C(=O)N2C(C(=O)N3CCCC3C2(O1)O)CC4=CC=CC=C4)...,0
1819,ref5_00476,C1CCC(CC1)C(CCN2CCCC2)(C3=CC=CC=C3)O,0


In [17]:
df.to_csv('./DILI/temp/DILI_name_to_smiles.csv')

In [18]:
a = pd.read_csv('./DILI/temp/SMILES_DILI.csv')
b = pd.read_csv('./DILI/temp/DILI_name_to_smiles.csv')
df = pd.concat([a,b],axis=0)
df

Unnamed: 0.1,Unnamed: 0,ID,SMILES,DILI
0,0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c2ccccc2,1
1,1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c2ccccc2)c3ccc(Cl)cc3,0
2,2,ref1_00003,CN(C)CCCN1c2ccccc2CCc3ccc(Cl)cc13,1
3,3,ref1_00004,CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24,1
4,4,ref1_00005,NC(=O)C([C@@H]1CCN(CCc2ccc3OCCc3c2)C1)(c4ccccc...,0
...,...,...,...,...
1816,1816,ref5_00473,CCCCCCCCCCCC(CC1C(C(=O)O1)CCCCCC)OC(=O)C(CC(C)...,1
1817,1817,ref5_00474,CN1CCC2=C3C1CC4=C(C3=CC=C2)C(=C(C=C4)O)O,0
1818,1818,ref5_00475,CC1(C(=O)N2C(C(=O)N3CCCC3C2(O1)O)CC4=CC=CC=C4)...,0
1819,1819,ref5_00476,C1CCC(CC1)C(CCN2CCCC2)(C3=CC=CC=C3)O,0


In [19]:
df.to_csv('./DILI/temp/DILI_SMILES_All.csv')

In [20]:
df = pd.read_csv('./DILI/temp/DILI_SMILES_All.csv')
smiles_list = pd.Series(list(df['SMILES']))
df['Canonical_SMILES'] = smiles_list.map(standardize)
df

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,SMILES,DILI,Canonical_SMILES
0,0,0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c2ccccc2,1,CNCC[C@@H](Oc1ccccc1C)c1ccccc1
1,1,1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c2ccccc2)c3ccc(Cl)cc3,0,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1
2,2,2,ref1_00003,CN(C)CCCN1c2ccccc2CCc3ccc(Cl)cc13,1,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21
3,3,3,ref1_00004,CN1CCN(CC1)C2=Nc3cc(Cl)ccc3Nc4ccccc24,1,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1
4,4,4,ref1_00005,NC(=O)C([C@@H]1CCN(CCc2ccc3OCCc3c2)C1)(c4ccccc...,0,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...
...,...,...,...,...,...,...
4423,1816,1816,ref5_00473,CCCCCCCCCCCC(CC1C(C(=O)O1)CCCCCC)OC(=O)C(CC(C)...,1,CCCCCCCCCCCC(CC1OC(=O)C1CCCCCC)OC(=O)C(CC(C)C)...
4424,1817,1817,ref5_00474,CN1CCC2=C3C1CC4=C(C3=CC=C2)C(=C(C=C4)O)O,0,CN1CCc2cccc3c2C1Cc1ccc(O)c(O)c1-3
4425,1818,1818,ref5_00475,CC1(C(=O)N2C(C(=O)N3CCCC3C2(O1)O)CC4=CC=CC=C4)...,0,CN1CC(C(=O)NC2(C)OC3(O)C4CCCN4C(=O)C(Cc4ccccc4...
4426,1819,1819,ref5_00476,C1CCC(CC1)C(CCN2CCCC2)(C3=CC=CC=C3)O,0,OC(CCN1CCCC1)(c1ccccc1)C1CCCCC1


In [50]:
df = df[['ID','Canonical_SMILES','DILI']]

In [51]:
df.to_csv('./DILI/temp/DILI_Final.csv')

In [52]:
df = pd.read_csv('./DILI/temp/DILI_Final.csv')
grouped = df['DILI'].groupby(df['Canonical_SMILES'])
can_list = grouped.mean()
grouped_df = pd.DataFrame(can_list)
grouped_df

Unnamed: 0_level_0,DILI
Canonical_SMILES,Unnamed: 1_level_1
Br.C=C1N(CCCCCC(=O)O)c2ccccc2C1(C)C,0.0
BrC(Br)Br,1.0
BrCCBr,1.0
Brc1c(NC2=NCCN2)ccc2nccnc12,0.0
Brc1ccccc1,1.0
...,...
c1ccc2c(c1)Nc1ccccc1S2,1.0
c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1.0
c1ccncc1,1.0
c1ccoc1,0.0


In [53]:
grouped_df.rename(columns={'DILI':'DILI_mean'}, inplace = True)
grouped_df.to_csv('./DILI/temp/Grouped.csv')

In [54]:
final = pd.read_csv('./DILI/temp/DILI_Final.csv')
dupl_filter = pd.read_csv('./DILI/temp/Grouped.csv')

In [58]:
final= final[['ID','Canonical_SMILES','DILI']]
final

Unnamed: 0,ID,Canonical_SMILES,DILI
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0
...,...,...,...
4423,ref5_00473,CCCCCCCCCCCC(CC1OC(=O)C1CCCCCC)OC(=O)C(CC(C)C)...,1
4424,ref5_00474,CN1CCc2cccc3c2C1Cc1ccc(O)c(O)c1-3,0
4425,ref5_00475,CN1CC(C(=O)NC2(C)OC3(O)C4CCCN4C(=O)C(Cc4ccccc4...,0
4426,ref5_00476,OC(CCN1CCCC1)(c1ccccc1)C1CCCCC1,0


In [59]:
dupl_filter

Unnamed: 0,Canonical_SMILES,DILI_mean
0,Br.C=C1N(CCCCCC(=O)O)c2ccccc2C1(C)C,0.0
1,BrC(Br)Br,1.0
2,BrCCBr,1.0
3,Brc1c(NC2=NCCN2)ccc2nccnc12,0.0
4,Brc1ccccc1,1.0
...,...,...
2382,c1ccc2c(c1)Nc1ccccc1S2,1.0
2383,c1ccc2c(c1)Sc1ccccc1N2CC1CN2CCC1CC2,1.0
2384,c1ccncc1,1.0
2385,c1ccoc1,0.0


In [86]:
dili_final = pd.merge(final,dupl_filter).drop_duplicates(['Canonical_SMILES']).sort_values(by=['ID'],axis=0)
dili_final

Unnamed: 0,ID,Canonical_SMILES,DILI,DILI_mean
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1,1.0
2,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0,0.0
4,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1,1.0
8,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1,1.0
13,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0,0.0
...,...,...,...,...
3844,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0,0.0
3845,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0,0.0
3846,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0,0.0
3847,ref6_01304,NC(N)=NCCN1CCCCCCC1,0,0.0


In [87]:
dili_final.to_csv('./DILI/temp/DILI_grouped.csv')

In [89]:
df = pd.read_csv('./DILI/final/DILI_Final.csv')
df

Unnamed: 0,ID,Canonical_SMILES,DILI
0,ref1_00001,CNCC[C@@H](Oc1ccccc1C)c1ccccc1,1
1,ref1_00002,CN1CCC[C@@H]1CCO[C@](C)(c1ccccc1)c1ccc(Cl)cc1,0
2,ref1_00003,CN(C)CCCN1c2ccccc2CCc2ccc(Cl)cc21,1
3,ref1_00004,CN1CCN(C2=Nc3cc(Cl)ccc3Nc3ccccc32)CC1,1
4,ref1_00005,NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3...,0
...,...,...,...
2256,ref6_01298,CN1C[C@H](C(=O)N[C@]2(C)O[C@@]3(O)[C@@H]4CCCN4...,0
2257,ref6_01300,CNC[C@H](O)c1ccc(O)c(O)c1,0
2258,ref6_01301,CCCCCCCC/C=C/CCCCCCCC(=O)O,0
2259,ref6_01304,NC(N)=NCCN1CCCCCCC1,0


----

## Acute Oral Toxicity

In [96]:
training = pd.read_csv('./Acute Oral Toxicity/data/trainingset_171130.tsv', sep='\t')
test = pd.read_csv('./Acute Oral Toxicity/data/validationset.tsv', sep ='\t')
df = pd.concat([training,test])
df = setID(df,1)[['ID','Canonical_QSARr','very_toxic','nontoxic','LD50_mgkg','EPA_category','GHS_category']]
df

Unnamed: 0,ID,Canonical_QSARr,very_toxic,nontoxic,LD50_mgkg,EPA_category,GHS_category
0,ref1_00001,[O-][N+](=O)C1C=CC(Cl)=CC=1,False,False,460.0,2.0,4.0
1,ref1_00002,NC1=CC=C(C=C1)[N+]([O-])=O,False,False,750.0,3.0,4.0
2,ref1_00003,[O-][N+](=O)C1C=CC(O)=CC=1,False,False,170.0,2.0,3.0
3,ref1_00004,[O-][N+](=O)C1C=CC(CCl)=CC=1,False,False,1809.0,3.0,4.0
4,ref1_00005,CNC1C=CC(=CC=1)[N+]([O-])=O,False,True,,3.0,5.0
...,...,...,...,...,...,...,...
11884,ref1_11885,C1C=CC=CC=1,False,True,3323.0,3.0,5.0
11885,ref1_11886,COP(=O)(OC=C(Cl)Cl)OC,True,False,30.0,1.0,2.0
11886,ref1_11887,CC1=CC(=NC(=N1)C(C)C)OP(=S)(OCC)OCC,False,False,92.0,2.0,3.0
11887,ref1_11888,CC(CCl)OP(=O)(OC(C)CCl)OC(C)CCl,False,False,980.0,3.0,4.0


In [97]:
df.to_csv('./Acute Oral Toxicity/temp/AOT_tr_val.csv')

In [98]:
df = pd.read_csv('./Acute Oral Toxicity/temp/AOT_tr_val.csv')

begin = time.time()
smiles_list = pd.Series(list(df['Canonical_QSARr']))
df['Canonical_SMILES'] = smiles_list.map(standardize)
end = time.time()

elapsed = end - begin

print('\n',elapsed,'\n')

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [101]:
df = df[['ID','Canonical_SMILES','very_toxic','nontoxic','LD50_mgkg','EPA_category','GHS_category']]
df

Unnamed: 0,ID,Canonical_SMILES,very_toxic,nontoxic,LD50_mgkg,EPA_category,GHS_category
0,ref1_00001,O=[N+]([O-])c1ccc(Cl)cc1,False,False,460.0,2.0,4.0
1,ref1_00002,Nc1ccc([N+](=O)[O-])cc1,False,False,750.0,3.0,4.0
2,ref1_00003,O=[N+]([O-])c1ccc(O)cc1,False,False,170.0,2.0,3.0
3,ref1_00004,O=[N+]([O-])c1ccc(CCl)cc1,False,False,1809.0,3.0,4.0
4,ref1_00005,CNc1ccc([N+](=O)[O-])cc1,False,True,,3.0,5.0
...,...,...,...,...,...,...,...
11884,ref1_11885,c1ccccc1,False,True,3323.0,3.0,5.0
11885,ref1_11886,COP(=O)(OC)OC=C(Cl)Cl,True,False,30.0,1.0,2.0
11886,ref1_11887,CCOP(=S)(OCC)Oc1cc(C)nc(C(C)C)n1,False,False,92.0,2.0,3.0
11887,ref1_11888,CC(CCl)OP(=O)(OC(C)CCl)OC(C)CCl,False,False,980.0,3.0,4.0


In [105]:
df.to_csv('./Acute Oral Toxicity/temp/AOT_Canonicalize.csv')

In [107]:
df = pd.read_csv('./Acute Oral Toxicity/temp/AOT_Final_Add_Class.csv')
grouped = df['Acute Oral Toxicity'].groupby(df['Canonical_SMILES'])
can_list = grouped.mean()
grouped_df = pd.DataFrame(can_list)
grouped_df

Unnamed: 0_level_0,Acute Oral Toxicity
Canonical_SMILES,Unnamed: 1_level_1
B12B3B4B1C234,0.0
BrC(Br)(Br)c1ccc2ccccc2n1,0.0
BrC12CC3CC(CC(C3)C1)C2,0.0
BrC1CC(Br)CC(Br)CC(Br)CC(Br)CC(Br)C1,0.0
BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,0.0
...,...
c1ccccc1,0.0
c1cncc(CSSCc2cccnc2)c1,0.0
c1cnn(-c2cc(-n3cccn3)ncn2)c1,0.0
c1coc(CNc2[nH]cnc3ncnc2-3)c1,0.0


In [108]:
grouped_df.to_csv('./Acute Oral Toxicity/temp/Grouped.csv')

In [109]:
final = pd.read_csv('./Acute Oral Toxicity/temp/AOT_Final_Add_Class.csv')
dupl_filter = pd.read_csv('./Acute Oral Toxicity/temp/Grouped.csv')

In [110]:
final

Unnamed: 0,ID,Canonical_SMILES,Acute Oral Toxicity
0,ref1_00539,Sc1ccccc1,1
1,ref1_11490,Sc1c(Cl)c(Cl)c(Cl)c(Cl)c1Cl,0
2,ref1_06096,S=P(Oc1ccccc1)(Oc1ccccc1)Oc1ccccc1,0
3,ref1_05267,S=P(N1CC1)(N1CC1)N1CC1,1
4,ref1_02880,S=C1NCNCN1,1
...,...,...,...
6062,ref1_07063,[H]/N=C(\N)N(C)CCOP(=O)(O)O,0
6063,ref1_08042,[H]/N=C(\N)c1ccc2cc(OC(=O)c3ccc(N/C(N)=N/[H])c...,0
6064,ref1_04678,[H]/N=C(\C)NP(=S)(Oc1ccc(Cl)cc1)Oc1ccc(Cl)cc1,1
6065,ref1_02579,[H]/N=C(/NC1CC(N)C(OC2C(O)C(O)C(O)C(O)C2O)OC1C...,0


In [111]:
dupl_filter

Unnamed: 0,Canonical_SMILES,Acute Oral Toxicity
0,B12B3B4B1C234,0.0
1,BrC(Br)(Br)c1ccc2ccccc2n1,0.0
2,BrC12CC3CC(CC(C3)C1)C2,0.0
3,BrC1CC(Br)CC(Br)CC(Br)CC(Br)CC(Br)C1,0.0
4,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,0.0
...,...,...
5807,c1ccccc1,0.0
5808,c1cncc(CSSCc2cccnc2)c1,0.0
5809,c1cnn(-c2cc(-n3cccn3)ncn2)c1,0.0
5810,c1coc(CNc2[nH]cnc3ncnc2-3)c1,0.0


In [112]:
aot_final = pd.merge(final,dupl_filter).drop_duplicates(['Canonical_SMILES']).sort_values(by=['ID'],axis=0)
aot_final

Unnamed: 0,ID,Canonical_SMILES,Acute Oral Toxicity
2085,ref1_00005,CNc1ccc([N+](=O)[O-])cc1,0
1775,ref1_00006,COc1ccc([N+](=O)[O-])cc1,0
5076,ref1_00007,CC(C)c1ccc(C(C)C)cc1,0
832,ref1_00008,O=C(Cl)c1ccc(C(=O)Cl)cc1,0
2780,ref1_00011,CCOc1ccc([N+](=O)[O-])cc1,0
...,...,...,...
5058,ref1_11878,CC(C)CC(=O)CC(C)C,0
2143,ref1_11879,CNC(=O)Oc1cccc2c1OC(C)(C)C2,1
5595,ref1_11885,c1ccccc1,0
1542,ref1_11886,COP(=O)(OC)OC=C(Cl)Cl,1


In [113]:
aot_final.to_csv('./Acute Oral Toxicity/temp/AOT_Grouped.csv')

In [114]:
df = pd.read_csv('./Acute Oral Toxicity/final/AOT_Final.csv')

In [115]:
df

Unnamed: 0.1,Unnamed: 0,ID,Canonical_SMILES,Acute Oral Toxicity
0,2085,ref1_00005,CNc1ccc([N+](=O)[O-])cc1,0
1,1775,ref1_00006,COc1ccc([N+](=O)[O-])cc1,0
2,5076,ref1_00007,CC(C)c1ccc(C(C)C)cc1,0
3,832,ref1_00008,O=C(Cl)c1ccc(C(=O)Cl)cc1,0
4,2780,ref1_00011,CCOc1ccc([N+](=O)[O-])cc1,0
...,...,...,...,...
5807,5058,ref1_11878,CC(C)CC(=O)CC(C)C,0
5808,2143,ref1_11879,CNC(=O)Oc1cccc2c1OC(C)(C)C2,1
5809,5595,ref1_11885,c1ccccc1,0
5810,1542,ref1_11886,COP(=O)(OC)OC=C(Cl)Cl,1


-----

## AMES Mutagenesis

In [117]:
a = pd.read_csv('./AMES Mutagenesis/Hansen et al 2009/ci900161g_si_001/smiles_cas_N6512.csv', sep='\t',names=['SMILES', 'CAS','Mutagenicity'])
df = setID(a,1)[['ID','SMILES','Mutagenicity']]
df

Unnamed: 0,ID,SMILES,Mutagenicity
0,ref1_00001,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,0
1,ref1_00002,NNC(=O)CNC(=O)\C=N\#N,1
2,ref1_00003,O=C1NC(=O)\C(=N/#N)\C=N1,1
3,ref1_00004,NC(=O)CNC(=O)\C=N\#N,1
4,ref1_00005,CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O,1
...,...,...,...
6507,ref1_06508,COC1COC(COCC2COC(OC)C(OS(=O)(=O)O)C2OS(=O)(=O)...,0
6508,ref1_06509,OC(Cc1cn(N=O)c2ccccc12)C(=O)O,1
6509,ref1_06510,COC(=O)Nc1nc2ccccc2[nH]1,1
6510,ref1_06511,ClCc1cccc2c3cccc4cccc(c12)c34,1


In [118]:
df.to_csv('./AMES Mutagenesis/data/AMES_ref1.csv')

In [119]:
b = pd.read_excel("./AMES Mutagenesis/MutagenPred-GCNN. 2021/12539_2020_407_MOESM1_ESM.xlsx", sheet_name = 'Sheet1')
df = setID(b,2)[['ID','SMILES','Mutagenicity']]
df

Unnamed: 0,ID,SMILES,Mutagenicity
0,ref2_00001,c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45,1
1,ref2_00002,C1O[C@@H]1[C@H]2CO2,1
2,ref2_00003,Nc1ccc(cc1)c2ccc(N)cc2,1
3,ref2_00004,CCCCC(CC)COC(=O)c1ccccc1C(=O)OCC(CC)CCCC,0
4,ref2_00005,Nc1ccc2cc3ccccc3cc2c1,1
...,...,...,...
1377,ref2_01378,CC1=NC2=C(C3=C(C=C2N=C1)N=C(N3C)C)C,0
1378,ref2_01379,C1=CC2=CC(=C(C(=C2N=C1)F)F)F,0
1379,ref2_01380,C1=CC2=CC(=C(C=C2N=C1)F)F,0
1380,ref2_01381,C1=CC(=CC=C1CC(C(=O)O)N)Cl,0


In [120]:
df.to_csv('./AMES Mutagenesis/data/AMES_ref2.csv')

In [121]:
c = pd.read_excel('./AMES Mutagenesis/AMES, QSAR International Collaborative Study/ClassATotal.xlsx')
df = setID(c,3)[['ID','SMILES','Mutagenicity']]
df

Unnamed: 0,ID,SMILES,Mutagenicity
0,ref3_00001,CC(=O)NC1=C(C=CC(=C1)N(C COC(=O)C)CCOC(=O)C)N=...,1
1,ref3_00002,C1=CC=C(C=C1)CCN(CCC#N) C2=CC=C(C=C2)N=NC3=C(C...,1
2,ref3_00003,[O-\n][N+](=O)c2ccc(/N=N/c1ccc(N( CCOC(=O)C)CC...,1
3,ref3_00004,COC(=O)C(CCl)Cl,1
4,ref3_00005,COC(=O)C(CSC(=N)N)Cl.Cl,1
...,...,...,...
667,ref3_00668,[O-][N+](=O)C1=CC=C(C=C1)S(=N\C(=O)C(F)(F)F)\C...,1
668,ref3_00669,FC(F)(COC(=O)CCCl)C(F)(F)OC(F)(F)C(C(F)(F)OC(F...,1
669,ref3_00670,FC1=CC(=CC=C1CBr)C1=C(C= CC=C1)C#N,1
670,ref3_00671,NC1=C(C#N)C(=CS1)C1=CC=C(N)C=C1,1


In [122]:
df.to_csv('./AMES Mutagenesis/data/AMES_ref3.csv')

In [123]:
d_ts = pd.read_excel('./AMES Mutagenesis/Xu. 2012/2455387/ci300400a_si_001.xls', sheet_name = 'training set')
d_es = pd.read_excel('./AMES Mutagenesis/Xu. 2012/2455387/ci300400a_si_001.xls', sheet_name = 'external validation set')
d = pd.concat([d_ts,d_es])
df = setID(d,4)[['ID','SMILES','Mutagenicity']]
df

Unnamed: 0,ID,SMILES,Mutagenicity
0,ref4_00001,[O-][N+](=O)c1ccc2ccc3ccc(c4c5ccccc5c1c2c34)[N...,1
1,ref4_00002,[O-][N+](=O)c1c2CCCCc2c3ccc4cccc5ccc1c3c45,1
2,ref4_00003,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,0
3,ref4_00004,NNC(=O)CNC(=O)C=N#N,1
4,ref4_00005,O=C1NC(=O)C(C=N1)=N#N,1
...,...,...,...
8343,ref4_08344,O=N(=O)OC1COC2C(COC12)ON(=O)=O,0
8344,ref4_08345,OCC1OC(C(O)C1O)n2cnc3c(NCc4ccc(cc4)N(=O)=O)ncnc23,0
8345,ref4_08346,CNN(=O)=O,0
8346,ref4_08347,NC(Cc1ccc(O)c(c1)N(=O)=O)C(=O)O,0


In [124]:
df.to_csv('./AMES Mutagenesis/data/AMES_ref4.csv')

----

### Data Cocatenation

In [125]:
a = pd.read_csv('./AMES Mutagenesis/data/AMES_ref1.csv')
b = pd.read_csv('./AMES Mutagenesis/data/AMES_ref2.csv')
c = pd.read_csv('./AMES Mutagenesis/data/AMES_ref3.csv')
d = pd.read_csv('./AMES Mutagenesis/data/AMES_ref4.csv')

In [126]:
ames = pd.concat([a,b,c,d])[['ID','SMILES','Mutagenicity']]
ames

Unnamed: 0,ID,SMILES,Mutagenicity
0,ref1_00001,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,0
1,ref1_00002,NNC(=O)CNC(=O)\C=N\#N,1
2,ref1_00003,O=C1NC(=O)\C(=N/#N)\C=N1,1
3,ref1_00004,NC(=O)CNC(=O)\C=N\#N,1
4,ref1_00005,CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O,1
...,...,...,...
8343,ref4_08344,O=N(=O)OC1COC2C(COC12)ON(=O)=O,0
8344,ref4_08345,OCC1OC(C(O)C1O)n2cnc3c(NCc4ccc(cc4)N(=O)=O)ncnc23,0
8345,ref4_08346,CNN(=O)=O,0
8346,ref4_08347,NC(Cc1ccc(O)c(c1)N(=O)=O)C(=O)O,0


In [127]:
ames.to_csv('./AMES Mutagenesis/temp/AMES_total.csv')

In [128]:
df = pd.read_csv('./AMES Mutagenesis/temp/AMES_total.csv')

begin = time.time()
smiles_list = pd.Series(list(df['SMILES']))
df['Canonical_SMILES'] = smiles_list.map(standardize)
end = time.time()

elapsed = end - begin

print(elapsed)

.SMILES Parse Error: syntax error for input: NNC(=O)CNC(=O)\C=N\#N  	 NNC(=O)CNC(=O)\C=N\#N  

SMILES Parse Error: syntax error for input: O=C1NC(=O)\C(=N/#N)\C=N1  	 O=C1NC(=O)\C(=N/#N)\C=N1  

SMILES Parse Error: syntax error for input: NC(=O)CNC(=O)\C=N\#N  	 NC(=O)CNC(=O)\C=N\#N  

SMILES Parse Error: syntax error for input: CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O  	 CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O  

SMILES Parse Error: syntax error for input: NC(COC(=O)\C=N/#N)C(=O)O  	 NC(COC(=O)\C=N/#N)C(=O)O  

SMILES Parse Error: syntax error for input: CCN(CC(O)C1=CC(=O)\C(=N\#N)\C=C1)N=O  	 CCN(CC(O)C1=CC(=O)\C(=N\#N)\C=C1)N=O  



........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

....SMILES Parse Error: syntax error for input: C1=CC(=C(C=C1OC2=CC(=NC
=C2)C(=O)N)F)N 	 C1=CC(=C(C=C1OC2=CC(=NC
=C2)C(=O)N)F)N 

.SMILES Parse Error: syntax error for input: CC1(C=CC2=C(O1)C=CC(=C2) [N+](=O)[O-])C 	 CC1(C=CC2=C(O1)C=CC(=C2) [N+](=O)[O-])C 

.SMILES Parse Error: syntax error for input: C(=O)(C(Cl)(Cl)Cl)OC(=O)C(Cl
)(Cl)Cl 	 C(=O)(C(Cl)(Cl)Cl)OC(=O)C(Cl
)(Cl)Cl 

SMILES Parse Error: syntax error for input: C1=C[N+](=CC=C1[N+](=O)[O
-])[O-] 	 C1=C[N+](=CC=C1[N+](=O)[O
-])[O-] 

..SMILES Parse Error: syntax error for input: C1=C(N=CN1[N+](=O)[O-
])[N+](=O)[O-] 	 C1=C(N=CN1[N+](=O)[O-
])[N+](=O)[O-] 

.SMILES Parse Error: syntax error for input: CCOC(=O)[C@H](CC1=CC=C( C=C1)[N+](=O)[O-])N.Cl 	 CCOC(=O)[C@H](CC1=CC=C( C=C1)[N+](=O)[O-])N.Cl 

.SMILES Parse Error: syntax error for input: C[C@H](C1=CC=CC=C1)NCC(
=O)C2=CC=C(C=C2)Br 	 C[C@H](C1=CC=CC=C1)NCC(
=O)C2=CC=C(C=C2)Br 

..SMILES Parse Error: syntax error for input: [O-
][N+](=O)\C=C\C1=CC=C(Cl)C
=C1 	 [O-
][N+](=O)\C=C

...............SMILES Parse Error: syntax error for input: CN1C(C)=C(C)C2=C1C=CC1=C 2C=CC=C1 	 CN1C(C)=C(C)C2=C1C=CC1=C 2C=CC=C1 

SMILES Parse Error: syntax error for input: [Cl-].[Cl-].OC1=C(C=C(C=C1)[N+]1=CC=C(C=C1)C1=CC=[N+](C=C1)C 1=CC(=C(O)C=C1)C1=CC=CC= C1)C1=CC=CC=C1 	 [Cl-].[Cl-].OC1=C(C=C(C=C1)[N+]1=CC=C(C=C1)C1=CC=[N+](C=C1)C 1=CC(=C(O)C=C1)C1=CC=CC= C1)C1=CC=CC=C1 

..SMILES Parse Error: syntax error for input: CCC1=CC=C(C=C1)C#CC1=CC (F)=C(F)C=C1 	 CCC1=CC=C(C=C1)C#CC1=CC (F)=C(F)C=C1 

.SMILES Parse Error: syntax error for input: CC(C)[Si](OC(=O)N1C2=C(SC3=C1C=CC(=C3)N(C)C)C=C(C= C2)N(C)C)(C(C)C)C(C)C 	 CC(C)[Si](OC(=O)N1C2=C(SC3=C1C=CC(=C3)N(C)C)C=C(C= C2)N(C)C)(C(C)C)C(C)C 

........SMILES Parse Error: syntax error for input: CC(=C)C(=O)OC1C2OC3OC(C) (C)OC3C2OC1=O 	 CC(=C)C(=O)OC1C2OC3OC(C) (C)OC3C2OC1=O 

.SMILES Parse Error: syntax error for input: CC(C)C1=CC=C(\C=C\C(Cl)=O) C=C1 	 CC(C)C1=CC=C(\C=C\C(Cl)=O) C=C1 

SMILES Parse Error: syntax error for input: CCN1C2=C(

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Explicit valence for atom # 25 N, 4, is greater than permitted 	 CCN(Cc1cccc(c1)S(=O)(=O)O)c2ccc(cc2)C(=C3C=CC(=N(CC)Cc4cccc(c4)S(=O)(=O)O)C=C3)c5ccc(cc5)S(=O)(=O)O 

.....Explicit valence for atom # 8 N, 4, is greater than permitted 	 Nc1ccc(N)c(c1)N(=O)O 

Explicit valence for atom # 14 N, 4, is greater than permitted 	 CC(=O)Nc1nc(cs1)c2ccc(o2)N(=O)O 

Explicit valence for atom # 1 N, 4, is greater than permitted 	 ON(=O)c1ccc(o1)c2cscn2 

...Explicit valence for atom # 14 N, 4, is greater than permitted 	 CN(C)c1ccc(cc1)C(=C2C=CC(=N(C)C)C=C2)c3ccc(cc3)N(C)C 

Explicit valence for atom # 5 N, 4, is greater than permitted 	 Cc1ncc(N(=O)O)n1C 

..Explicit valence for atom # 16 N, 4, is greater than permitted 	 CN(C)CN=c1[nH]nc(C=Cc2ccc(o2)N(=O)O)o1 

Explicit valence for atom # 15 N, 4, is greater than permitted 	 CCOP(=S)(OCC)Oc1ccc(cc1)N(=O)O 

Explicit valence for atom # 17 N, 4, is greater than permitted 	 OCC(NC(=O)C(Cl)Cl)C(O)c1ccc(cc1)N(=O)O 

....Explicit valence for atom # 1 


Explicit valence for atom # 11 N, 4, is greater than permitted 	 Cc1nc(cs1)c1ccc(o1)[N](=O)O 

Explicit valence for atom # 13 N, 4, is greater than permitted 	 O=CNc1nc(/C=C/c2ccc(o2)[N](=O)O)cs1 

Explicit valence for atom # 16 N, 4, is greater than permitted 	 C(=C(\C(=O)N)/c1ccccc1)/c1ccc(o1)[N](=O)O 

.Explicit valence for atom # 1 N, 4, is greater than permitted 	 O[N](=O)c1ccc2[nH]ncc2c1 

Explicit valence for atom # 4 N, 4, is greater than permitted 	 ClCCC[N](Cl)(C)C 

..Explicit valence for atom # 11 N, 4, is greater than permitted 	 O=C1OC(=O)c2ccc(cc12)[N](=O)O 

.Explicit valence for atom # 0 N, 4, is greater than permitted 	 [N](=O)(O)c1c(O)ccc2ccccc12 

Explicit valence for atom # 7 N, 4, is greater than permitted 	 O=C1N(N=C(C)C1[N](=O)O)c1ccc(cc1)[N](=O)O 

Explicit valence for atom # 0 N, 4, is greater than permitted 	 [N](=O)(O)c1cnc(C)n1C 

Explicit valence for atom # 9 N, 4, is greater than permitted 	 C(=O)(O)c1ccccc1[N](=O)O 

Explicit valence for atom # 8 N, 4, 

.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................Explicit valence for atom # 8 N, 5, is greater than permitted 	 CN1C[N]C2=C1N=C=N(=C2N)O 

.....................................................................................................Explicit valence for atom # 4 N, 5, is greater than permitted 	 CC(O)CN(=[NH+]C(=O)C(=C)C)(C)C 

...................................................................................................................19.784518003463745


In [130]:
df = df[['ID','Canonical_SMILES','Mutagenicity']].dropna()
df

Unnamed: 0,ID,Canonical_SMILES,Mutagenicity
0,ref1_00001,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
7,ref1_00008,CC(=O)OC1(C(C)=O)CCC2C3C=C(Cl)C4=CC(=O)OCC4(C)...,0
8,ref1_00009,Nc1nc(N)nc(N)n1,0
9,ref1_00010,Cc1ccc(N=Nc2c(O)ccc3ccccc23)c([N+](=O)[O-])c1,1
10,ref1_00011,CC(C)CC(=O)Nc1snc2ccccc12,0
...,...,...,...
16909,ref4_08344,O=[N+]([O-])OC1COC2C(O[N+](=O)[O-])COC12,0
16910,ref4_08345,O=[N+]([O-])c1ccc(CNc2ncnc3c2ncn3C2OC(CO)C(O)C...,0
16911,ref4_08346,CN[N+](=O)[O-],0
16912,ref4_08347,NC(Cc1ccc(O)c([N+](=O)[O-])c1)C(=O)O,0


In [131]:
df.to_csv('./AMES Mutagenesis/temp/AMES_Canonicalized.csv')

In [139]:
df = pd.read_csv('./AMES Mutagenesis/temp/AMES_Canonicalized.csv')
grouped = df['Mutagenicity'].groupby(df['Canonical_SMILES'])
can_list = grouped.mean()
grouped_ames = pd.DataFrame(can_list)
grouped_ames

Unnamed: 0_level_0,Mutagenicity
Canonical_SMILES,Unnamed: 1_level_1
Br.BrCCNCCBr,1.0
Br/C=C/Br,1.0
BrC(Br)Br,1.0
BrC(Br)C(Br)(Br)Br,0.0
BrC(Br)C(Br)Br,0.0
...,...
c1cnccn1,0.0
c1csc(-c2nc(N3CCOCC3)c3ccccc3n2)c1,1.0
c1cscn1,1.0
c1scc2c1-c1cscc1C1NC21,1.0


In [140]:
grouped_ames.rename(columns={'Mutagenicity':'Mutagenicity_mean'}, inplace = True)
grouped_ames.to_csv('./AMES Mutagenesis/temp/Grouped.csv')

In [141]:
final = pd.read_csv('./AMES Mutagenesis/temp/AMES_Canonicalized.csv')
dupl_filter = pd.read_csv('./AMES Mutagenesis/temp/Grouped.csv')

In [142]:
final

Unnamed: 0.1,Unnamed: 0,ID,Canonical_SMILES,Mutagenicity
0,0,ref1_00001,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
1,7,ref1_00008,CC(=O)OC1(C(C)=O)CCC2C3C=C(Cl)C4=CC(=O)OCC4(C)...,0
2,8,ref1_00009,Nc1nc(N)nc(N)n1,0
3,9,ref1_00010,Cc1ccc(N=Nc2c(O)ccc3ccccc23)c([N+](=O)[O-])c1,1
4,10,ref1_00011,CC(C)CC(=O)Nc1snc2ccccc12,0
...,...,...,...,...
16249,16909,ref4_08344,O=[N+]([O-])OC1COC2C(O[N+](=O)[O-])COC12,0
16250,16910,ref4_08345,O=[N+]([O-])c1ccc(CNc2ncnc3c2ncn3C2OC(CO)C(O)C...,0
16251,16911,ref4_08346,CN[N+](=O)[O-],0
16252,16912,ref4_08347,NC(Cc1ccc(O)c([N+](=O)[O-])c1)C(=O)O,0


In [143]:
dupl_filter

Unnamed: 0,Canonical_SMILES,Mutagenicity_mean
0,Br.BrCCNCCBr,1.0
1,Br/C=C/Br,1.0
2,BrC(Br)Br,1.0
3,BrC(Br)C(Br)(Br)Br,0.0
4,BrC(Br)C(Br)Br,0.0
...,...,...
8659,c1cnccn1,0.0
8660,c1csc(-c2nc(N3CCOCC3)c3ccccc3n2)c1,1.0
8661,c1cscn1,1.0
8662,c1scc2c1-c1cscc1C1NC21,1.0


In [144]:
ames_final = pd.merge(final,dupl_filter).drop_duplicates(['Canonical_SMILES']).sort_values(by=['ID'],axis=0)
ames_final

Unnamed: 0.1,Unnamed: 0,ID,Canonical_SMILES,Mutagenicity,Mutagenicity_mean
0,0,ref1_00001,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0,0.0
2,7,ref1_00008,CC(=O)OC1(C(C)=O)CCC2C3C=C(Cl)C4=CC(=O)OCC4(C)...,0,0.0
4,8,ref1_00009,Nc1nc(N)nc(N)n1,0,0.0
6,9,ref1_00010,Cc1ccc(N=Nc2c(O)ccc3ccccc23)c([N+](=O)[O-])c1,1,1.0
9,10,ref1_00011,CC(C)CC(=O)Nc1snc2ccccc12,0,0.0
...,...,...,...,...,...
16249,16871,ref4_08306,O=C(O)CCC(=O)OCC(N=C(O)C(Cl)Cl)C(O)c1ccc([N+](...,0,0.0
16250,16874,ref4_08309,O=[N+]([O-])c1ccc(CNc2nc[nH]c3ncnc2-3)cc1,0,0.0
16251,16893,ref4_08328,O=[N+]([O-])c1ccc(-[n+]2nc(-c3ccccc3)nn2-c2ccc...,0,0.0
16252,16895,ref4_08330,O=C(C=Cc1cccc([N+](=O)[O-])c1)c1ccccc1,0,0.0


In [145]:
ames_final.to_csv('./AMES Mutagenesis/temp/AMES_filtered.csv')

In [146]:
df = pd.read_csv('./AMES Mutagenesis/final/AMES_Final.csv')
df

Unnamed: 0,ID,Canonical_SMILES,Mutagenicity
0,ref1_00001,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
1,ref1_00008,CC(=O)OC1(C(C)=O)CCC2C3C=C(Cl)C4=CC(=O)OCC4(C)...,0
2,ref1_00009,Nc1nc(N)nc(N)n1,0
3,ref1_00010,Cc1ccc(N=Nc2c(O)ccc3ccccc23)c([N+](=O)[O-])c1,1
4,ref1_00011,CC(C)CC(=O)Nc1snc2ccccc12,0
...,...,...,...
8629,ref4_08306,O=C(O)CCC(=O)OCC(N=C(O)C(Cl)Cl)C(O)c1ccc([N+](...,0
8630,ref4_08309,O=[N+]([O-])c1ccc(CNc2nc[nH]c3ncnc2-3)cc1,0
8631,ref4_08328,O=[N+]([O-])c1ccc(-[n+]2nc(-c3ccccc3)nn2-c2ccc...,0
8632,ref4_08330,O=C(C=Cc1cccc([N+](=O)[O-])c1)c1ccccc1,0


-----

## hERG

In [147]:
bdb = pd.read_csv('./hERG/hERG_BindingDB.csv')
bdb = setID(bdb, 1)[['ID','st_smiles','label']]
chembl = pd.read_csv('./hERG/hERG_ChEMBL.csv')
chembl = setID(chembl,2)[['ID','st_smiles','label']]
pub = pd.read_csv('./hERG/pubchem.csv')
pub = setID(pub,3)[['ID','st_smiles','label']]
cai = pd.read_csv('./hERG/cai.csv')
cai = setID(cai,4)[['ID','st_smiles','label']]
herg = pd.concat([bdb, chembl, pub, cai])
herg.rename(columns={'st_smiles':'SMILES','label':'hERG'}, inplace = True)
herg

Unnamed: 0,ID,SMILES,hERG
0,ref1_00001,Brc1ccc(-n2ccc3ccncc32)cc1,1.0
1,ref1_00002,Brc1cnc(NCC2CC2)nc1,1.0
2,ref1_00003,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4ccnc4)ccc2-3)cs1...,1.0
3,ref1_00004,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4cncn4)ccc2-3)cs1...,1.0
4,ref1_00005,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2ccc(...,1.0
...,...,...,...
10417,ref4_10418,NC1=N[C@]2(CO1)c1cc(OCC3(C)COC3)ccc1Oc1c2cc(cc...,0.0
10418,ref4_10419,O=C1C=CC2(C(N1C)CCC1C2CCC2(C1CC(=Cc1cnc(nc1)C)...,0.0
10419,ref4_10420,O=C1CCCN1C1CCN(CC1)Cc1ccc(cc1)Oc1nc2c(s1)cccc2,0.0
10420,ref4_10421,OC(C(c1ccccc1)N1CCC1)(c1cccnc1)c1cccnc1,0.0


In [148]:
herg.to_csv('./hERG/data/hERG_Origin.csv')

In [149]:
df = pd.read_csv('./hERG/data/hERG_Origin.csv')

begin = time.time()
smiles_list = pd.Series(list(df['SMILES']))
df['Canonical_SMILES'] = smiles_list.map(standardize)
end = time.time()

elapsed = end - begin

print('\n',elapsed,'\n')

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [150]:
df = df[['ID','Canonical_SMILES','hERG']]
df

Unnamed: 0,ID,Canonical_SMILES,hERG
0,ref1_00001,Brc1ccc(-n2ccc3ccncc32)cc1,1.0
1,ref1_00002,Brc1cnc(NCC2CC2)nc1,1.0
2,ref1_00003,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4ccnc4)ccc2-3)cs1...,1.0
3,ref1_00004,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4cncn4)ccc2-3)cs1...,1.0
4,ref1_00005,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2ccc(...,1.0
...,...,...,...
21162,ref4_10418,CC1(COc2ccc3c(c2)[C@]2(COC(N)=N2)c2cc(-c4cncnc...,0.0
21163,ref4_10419,Cc1ncc(C=C2CC3C4CCC5N(C)C(=O)C=CC5(C)C4CCC3(C)...,0.0
21164,ref4_10420,O=C1CCCN1C1CCN(Cc2ccc(Oc3nc4ccccc4s3)cc2)CC1,0.0
21165,ref4_10421,OC(c1cccnc1)(c1cccnc1)C(c1ccccc1)N1CCC1,0.0


In [151]:
df = df.dropna(subset=['Canonical_SMILES'])
df

Unnamed: 0,ID,Canonical_SMILES,hERG
0,ref1_00001,Brc1ccc(-n2ccc3ccncc32)cc1,1.0
1,ref1_00002,Brc1cnc(NCC2CC2)nc1,1.0
2,ref1_00003,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4ccnc4)ccc2-3)cs1...,1.0
3,ref1_00004,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4cncn4)ccc2-3)cs1...,1.0
4,ref1_00005,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2ccc(...,1.0
...,...,...,...
21162,ref4_10418,CC1(COc2ccc3c(c2)[C@]2(COC(N)=N2)c2cc(-c4cncnc...,0.0
21163,ref4_10419,Cc1ncc(C=C2CC3C4CCC5N(C)C(=O)C=CC5(C)C4CCC3(C)...,0.0
21164,ref4_10420,O=C1CCCN1C1CCN(Cc2ccc(Oc3nc4ccccc4s3)cc2)CC1,0.0
21165,ref4_10421,OC(c1cccnc1)(c1cccnc1)C(c1ccccc1)N1CCC1,0.0


In [152]:
df.to_csv('./hERG/temp/hERG_Canonicalized.csv')

In [153]:
grouped = df['hERG'].groupby(df['Canonical_SMILES'])
can_list = grouped.mean()
grouped_herg = pd.DataFrame(can_list)
grouped_herg

Unnamed: 0_level_0,hERG
Canonical_SMILES,Unnamed: 1_level_1
Brc1ccc(-n2ccc3ccncc32)cc1,1.0
Brc1ccc(Nc2ccc(CN3CCC4(CC3)OCCc3sccc34)cc2)cc1,1.0
Brc1ccc2[nH]c3c(c2c1)CCCC3Nc1ccccc1,0.0
Brc1ccc2c(NC3=NCC4(CN5CCC4CC5)O3)ncnn12,1.0
Brc1ccc2c(NC3=NC[C@@]4(CN5CCC4CC5)O3)ncnn12,1.0
...,...
c1coc(-c2ccc(OCCCN3CCCCC3)cc2)c1,1.0
c1csc(-c2ccc(OCCCN3CCCCC3)cc2)c1,1.0
c1ncc(-c2cc3sc(N4CCC(N5CCCCC5)CC4)nc3cn2)cn1,1.0
c1ncc(-c2ccc(OCCCN3CCCCC3)cc2)o1,1.0


In [154]:
grouped_herg.rename(columns={'hERG':'hERG_mean'}, inplace = True)
grouped_herg.to_csv('./hERG/temp/Grouped.csv')

In [155]:
final = pd.read_csv('./hERG/temp/hERG_Canonicalized.csv')
dupl_filter = pd.read_csv('./hERG/temp/Grouped.csv')

In [156]:
final

Unnamed: 0.1,Unnamed: 0,ID,Canonical_SMILES,hERG
0,0,ref1_00001,Brc1ccc(-n2ccc3ccncc32)cc1,1.0
1,1,ref1_00002,Brc1cnc(NCC2CC2)nc1,1.0
2,2,ref1_00003,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4ccnc4)ccc2-3)cs1...,1.0
3,3,ref1_00004,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4cncn4)ccc2-3)cs1...,1.0
4,4,ref1_00005,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2ccc(...,1.0
...,...,...,...,...
21124,21162,ref4_10418,CC1(COc2ccc3c(c2)[C@]2(COC(N)=N2)c2cc(-c4cncnc...,0.0
21125,21163,ref4_10419,Cc1ncc(C=C2CC3C4CCC5N(C)C(=O)C=CC5(C)C4CCC3(C)...,0.0
21126,21164,ref4_10420,O=C1CCCN1C1CCN(Cc2ccc(Oc3nc4ccccc4s3)cc2)CC1,0.0
21127,21165,ref4_10421,OC(c1cccnc1)(c1cccnc1)C(c1ccccc1)N1CCC1,0.0


In [157]:
dupl_filter

Unnamed: 0,Canonical_SMILES,hERG_mean
0,Brc1ccc(-n2ccc3ccncc32)cc1,1.0
1,Brc1ccc(Nc2ccc(CN3CCC4(CC3)OCCc3sccc34)cc2)cc1,1.0
2,Brc1ccc2[nH]c3c(c2c1)CCCC3Nc1ccccc1,0.0
3,Brc1ccc2c(NC3=NCC4(CN5CCC4CC5)O3)ncnn12,1.0
4,Brc1ccc2c(NC3=NC[C@@]4(CN5CCC4CC5)O3)ncnn12,1.0
...,...,...
14071,c1coc(-c2ccc(OCCCN3CCCCC3)cc2)c1,1.0
14072,c1csc(-c2ccc(OCCCN3CCCCC3)cc2)c1,1.0
14073,c1ncc(-c2cc3sc(N4CCC(N5CCCCC5)CC4)nc3cn2)cn1,1.0
14074,c1ncc(-c2ccc(OCCCN3CCCCC3)cc2)o1,1.0


In [158]:
herg_final = pd.merge(final,dupl_filter).drop_duplicates(['Canonical_SMILES']).sort_values(by=['ID'],axis=0)
herg_final

Unnamed: 0.1,Unnamed: 0,ID,Canonical_SMILES,hERG,hERG_mean
0,0,ref1_00001,Brc1ccc(-n2ccc3ccncc32)cc1,1.0,1.0
1,1,ref1_00002,Brc1cnc(NCC2CC2)nc1,1.0,1.0
2,2,ref1_00003,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4ccnc4)ccc2-3)cs1...,1.0,1.0
3,3,ref1_00004,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4cncn4)ccc2-3)cs1...,1.0,1.0
4,4,ref1_00005,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2ccc(...,1.0,1.0
...,...,...,...,...,...
21124,21159,ref4_10415,CC1(C)CN(c2ccc3c(c2)[C@]2(COC(N)=N2)c2cc(-c4cn...,0.0,0.0
21125,21160,ref4_10416,NC1=N[C@@]2(CO1)c1cc(-c3cncnc3)ccc1Oc1ccc(N3CC...,0.0,0.0
21126,21161,ref4_10417,CC(C)(O)COc1ccc2c(c1)[C@]1(COC(N)=N1)c1cc(-c3c...,0.0,0.0
21127,21162,ref4_10418,CC1(COc2ccc3c(c2)[C@]2(COC(N)=N2)c2cc(-c4cncnc...,0.0,0.0


In [159]:
herg_final.to_csv('./hERG/temp/hERG_Grouped.csv')

In [160]:
df = pd.read_csv('./hERG/final/hERG_Final.csv')
df

Unnamed: 0,ID,Canonical_SMILES,hERG
0,ref1_00001,Brc1ccc(-n2ccc3ccncc32)cc1,1
1,ref1_00002,Brc1cnc(NCC2CC2)nc1,1
2,ref1_00003,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4ccnc4)ccc2-3)cs1...,1
3,ref1_00004,C(#Cc1cc(-c2[nH]nc3c2Cc2cc(Cn4cncn4)ccc2-3)cs1...,1
4,ref1_00005,C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2ccc(...,1
...,...,...,...
13896,ref4_10415,CC1(C)CN(c2ccc3c(c2)[C@]2(COC(N)=N2)c2cc(-c4cn...,0
13897,ref4_10416,NC1=N[C@@]2(CO1)c1cc(-c3cncnc3)ccc1Oc1ccc(N3CC...,0
13898,ref4_10417,CC(C)(O)COc1ccc2c(c1)[C@]1(COC(N)=N1)c1cc(-c3c...,0
13899,ref4_10418,CC1(COc2ccc3c(c2)[C@]2(COC(N)=N2)c2cc(-c4cncnc...,0
