In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from rdkit import Chem
%matplotlib inline

In [2]:
# Data directory under the user's home; the trailing "/" is kept because file
# paths in this notebook are built by plain string concatenation onto homedir.
homedir = os.path.expanduser("~/") + "AIChem/chemnet/chemnet/data/"
# The training set is read as a tab-separated file.
training_path = homedir + "trainingset_171130.txt"
df = pd.read_csv(training_path, sep='\t')

In [3]:
# Tag every row with a unique identifier "molid<N>", N counting up from 1
df['id'] = ['molid' + str(row_number) for row_number in range(1, len(df) + 1)]
print(df.shape)

(8994, 14)


In [4]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,CASRN,DTXSID,Chemical_Name,Structure_Source,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI_Key_QSARr,very_toxic,nontoxic,LD50_mgkg,EPA_category,GHS_category,id
0,100-00-5,DTXSID5020281,1-Chloro-4-nitrobenzene,EPA_DSSTox,[O-][N+](=O)C1C=CC(Cl)=CC=1,?,InChI=1S/C6H4ClNO2/c7-5-1-3-6(4-2-5)8(9)10/h1-4H,CZGCEKJOLUNIFY-UHFFFAOYSA-N,False,False,460.0,2.0,4.0,molid1
1,100-01-6,DTXSID8020961,4-Nitroaniline,EPA_DSSTox,NC1=CC=C(C=C1)[N+]([O-])=O,?,InChI=1S/C6H6N2O2/c7-5-1-3-6(4-2-5)8(9)10/h1-4...,TYMLOMAKGOJONV-UHFFFAOYSA-N,False,False,750.0,3.0,4.0,molid2
2,100-02-7,DTXSID0021834,4-Nitrophenol,EPA_DSSTox,[O-][N+](=O)C1C=CC(O)=CC=1,?,"InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8H",BTJIUGUIPKRLHP-UHFFFAOYSA-N,False,False,170.0,2.0,3.0,molid3
3,100-14-1,DTXSID4025745,4-Nitrobenzyl chloride,EPA_DSSTox,[O-][N+](=O)C1C=CC(CCl)=CC=1,?,InChI=1S/C7H6ClNO2/c8-5-6-1-3-7(4-2-6)9(10)11/...,KGCNHWXDPDPSBV-UHFFFAOYSA-N,False,False,1809.0,3.0,4.0,molid4
4,100-15-2,DTXSID7025635,N-Methyl-4-nitroaniline,EPA_DSSTox,CNC1C=CC(=CC=1)[N+]([O-])=O,?,InChI=1S/C7H8N2O2/c1-8-6-2-4-7(5-3-6)9(10)11/h...,XIFJZJPMHNUGRA-UHFFFAOYSA-N,False,True,,3.0,5.0,molid5


# Ensure all SMILES are valid, and canonicalize them

In [5]:
# Remove extraneous SMILES entry.
# Canonical_QSARr is split on the first space only: the leading token becomes
# 'pre_smiles' and anything after that space is kept in 'Extraneous_SMILES'.
# `n` and `expand` are passed by keyword because pandas >= 2.0 made every
# argument of Series.str.split except `pat` keyword-only; the positional form
# `split(' ', 1, expand=True)` raises TypeError there.
smiles_parts = df['Canonical_QSARr'].str.split(' ', n=1, expand=True)
# Guarantee both output columns exist even when no entry contains a space
# (expand=True would otherwise yield a single column).
smiles_parts = smiles_parts.reindex(columns=[0, 1])
df = df.join(smiles_parts.rename(columns={0: 'pre_smiles', 1: 'Extraneous_SMILES'}))
df.head(5)

Unnamed: 0,CASRN,DTXSID,Chemical_Name,Structure_Source,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI_Key_QSARr,very_toxic,nontoxic,LD50_mgkg,EPA_category,GHS_category,id,pre_smiles,Extraneous_SMILES
0,100-00-5,DTXSID5020281,1-Chloro-4-nitrobenzene,EPA_DSSTox,[O-][N+](=O)C1C=CC(Cl)=CC=1,?,InChI=1S/C6H4ClNO2/c7-5-1-3-6(4-2-5)8(9)10/h1-4H,CZGCEKJOLUNIFY-UHFFFAOYSA-N,False,False,460.0,2.0,4.0,molid1,[O-][N+](=O)C1C=CC(Cl)=CC=1,
1,100-01-6,DTXSID8020961,4-Nitroaniline,EPA_DSSTox,NC1=CC=C(C=C1)[N+]([O-])=O,?,InChI=1S/C6H6N2O2/c7-5-1-3-6(4-2-5)8(9)10/h1-4...,TYMLOMAKGOJONV-UHFFFAOYSA-N,False,False,750.0,3.0,4.0,molid2,NC1=CC=C(C=C1)[N+]([O-])=O,
2,100-02-7,DTXSID0021834,4-Nitrophenol,EPA_DSSTox,[O-][N+](=O)C1C=CC(O)=CC=1,?,"InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8H",BTJIUGUIPKRLHP-UHFFFAOYSA-N,False,False,170.0,2.0,3.0,molid3,[O-][N+](=O)C1C=CC(O)=CC=1,
3,100-14-1,DTXSID4025745,4-Nitrobenzyl chloride,EPA_DSSTox,[O-][N+](=O)C1C=CC(CCl)=CC=1,?,InChI=1S/C7H6ClNO2/c8-5-6-1-3-7(4-2-6)9(10)11/...,KGCNHWXDPDPSBV-UHFFFAOYSA-N,False,False,1809.0,3.0,4.0,molid4,[O-][N+](=O)C1C=CC(CCl)=CC=1,
4,100-15-2,DTXSID7025635,N-Methyl-4-nitroaniline,EPA_DSSTox,CNC1C=CC(=CC=1)[N+]([O-])=O,?,InChI=1S/C7H8N2O2/c1-8-6-2-4-7(5-3-6)9(10)11/h...,XIFJZJPMHNUGRA-UHFFFAOYSA-N,False,True,,3.0,5.0,molid5,CNC1C=CC(=CC=1)[N+]([O-])=O,


In [6]:
# Parse every SMILES with RDKit and count the entries that come back as None,
# which this notebook treats as invalid
mol_list = list(map(Chem.MolFromSmiles, df['pre_smiles']))
invalid = sum(1 for parsed in mol_list if parsed is None)
print("No. of invalid entries: "+str(invalid))

No. of invalid entries: 0


In [7]:
# Canonicalize SMILES by round-tripping each string through an RDKit molecule.
# The column is iterated directly, in row order, instead of using
# df.iterrows() with a per-row `df['pre_smiles'][index]` lookup: that built a
# full row Series it never used, and the label lookup would return a Series
# (not a string) if the index ever held duplicate labels.
newdf = [
    Chem.MolToSmiles(Chem.MolFromSmiles(raw_smiles))
    for raw_smiles in df['pre_smiles']
]

In [8]:
# Append the canonical SMILES as a new 'smiles' column, then retire 'pre_smiles'.
# The shape is printed before the append, after it, and after the drop.
add_df = pd.DataFrame(np.asarray(newdf), columns=["smiles"])
print(df.shape)
# Column-wise concat aligns on the row index of the two frames
df = pd.concat([df, add_df], axis="columns")
print(df.shape)
df = df.drop(columns=['pre_smiles'])
print(df.shape)

(8994, 16)
(8994, 17)
(8994, 16)


# Standardize labels

In [9]:
# Give the label columns short names ('nontoxic' maps to itself and is unchanged)
label_renames = {
    'very_toxic': 'verytoxic',
    'nontoxic': 'nontoxic',
    'EPA_category': 'epa',
    'GHS_category': 'ghs',
    'LD50_mgkg': 'ld50',
}
df = df.rename(columns=label_renames)
df.head(5)

Unnamed: 0,CASRN,DTXSID,Chemical_Name,Structure_Source,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI_Key_QSARr,verytoxic,nontoxic,ld50,epa,ghs,id,Extraneous_SMILES,smiles
0,100-00-5,DTXSID5020281,1-Chloro-4-nitrobenzene,EPA_DSSTox,[O-][N+](=O)C1C=CC(Cl)=CC=1,?,InChI=1S/C6H4ClNO2/c7-5-1-3-6(4-2-5)8(9)10/h1-4H,CZGCEKJOLUNIFY-UHFFFAOYSA-N,False,False,460.0,2.0,4.0,molid1,,O=[N+]([O-])c1ccc(Cl)cc1
1,100-01-6,DTXSID8020961,4-Nitroaniline,EPA_DSSTox,NC1=CC=C(C=C1)[N+]([O-])=O,?,InChI=1S/C6H6N2O2/c7-5-1-3-6(4-2-5)8(9)10/h1-4...,TYMLOMAKGOJONV-UHFFFAOYSA-N,False,False,750.0,3.0,4.0,molid2,,Nc1ccc([N+](=O)[O-])cc1
2,100-02-7,DTXSID0021834,4-Nitrophenol,EPA_DSSTox,[O-][N+](=O)C1C=CC(O)=CC=1,?,"InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8H",BTJIUGUIPKRLHP-UHFFFAOYSA-N,False,False,170.0,2.0,3.0,molid3,,O=[N+]([O-])c1ccc(O)cc1
3,100-14-1,DTXSID4025745,4-Nitrobenzyl chloride,EPA_DSSTox,[O-][N+](=O)C1C=CC(CCl)=CC=1,?,InChI=1S/C7H6ClNO2/c8-5-6-1-3-7(4-2-6)9(10)11/...,KGCNHWXDPDPSBV-UHFFFAOYSA-N,False,False,1809.0,3.0,4.0,molid4,,O=[N+]([O-])c1ccc(CCl)cc1
4,100-15-2,DTXSID7025635,N-Methyl-4-nitroaniline,EPA_DSSTox,CNC1C=CC(=CC=1)[N+]([O-])=O,?,InChI=1S/C7H8N2O2/c1-8-6-2-4-7(5-3-6)9(10)11/h...,XIFJZJPMHNUGRA-UHFFFAOYSA-N,False,True,,3.0,5.0,molid5,,CNc1ccc([N+](=O)[O-])cc1


In [10]:
# Replace T/F with integers.
# The result is assigned back to the frame rather than calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series,
# which pandas reports as chained assignment and which leaves `df` itself
# untouched once Copy-on-Write is active.
for label in ('verytoxic', 'nontoxic'):
    as_numbers = df[label].replace({False: 0, True: 1})
    # to_numeric converts a leftover object column (e.g. 0/1 mixed with NaN)
    # into a proper numeric dtype so it can be averaged afterwards
    df[label] = pd.to_numeric(as_numbers)

In [11]:
# Shift the EPA and GHS category labels down by one so they start from zero
for category in ('epa', 'ghs'):
    df[category] = df[category].sub(1)

In [12]:
# Store the natural log of ld50 in a new 'logld50' column
df = df.assign(logld50=np.log(df['ld50']))

In [13]:
# Write the standardized table (no row index) and preview it
raw_output_path = homedir + "tox_niehs_all_raw.csv"
df.to_csv(raw_output_path, index=False)
df.head()

Unnamed: 0,CASRN,DTXSID,Chemical_Name,Structure_Source,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI_Key_QSARr,verytoxic,nontoxic,ld50,epa,ghs,id,Extraneous_SMILES,smiles,logld50
0,100-00-5,DTXSID5020281,1-Chloro-4-nitrobenzene,EPA_DSSTox,[O-][N+](=O)C1C=CC(Cl)=CC=1,?,InChI=1S/C6H4ClNO2/c7-5-1-3-6(4-2-5)8(9)10/h1-4H,CZGCEKJOLUNIFY-UHFFFAOYSA-N,0.0,0.0,460.0,1.0,3.0,molid1,,O=[N+]([O-])c1ccc(Cl)cc1,6.131226
1,100-01-6,DTXSID8020961,4-Nitroaniline,EPA_DSSTox,NC1=CC=C(C=C1)[N+]([O-])=O,?,InChI=1S/C6H6N2O2/c7-5-1-3-6(4-2-5)8(9)10/h1-4...,TYMLOMAKGOJONV-UHFFFAOYSA-N,0.0,0.0,750.0,2.0,3.0,molid2,,Nc1ccc([N+](=O)[O-])cc1,6.620073
2,100-02-7,DTXSID0021834,4-Nitrophenol,EPA_DSSTox,[O-][N+](=O)C1C=CC(O)=CC=1,?,"InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,8H",BTJIUGUIPKRLHP-UHFFFAOYSA-N,0.0,0.0,170.0,1.0,2.0,molid3,,O=[N+]([O-])c1ccc(O)cc1,5.135798
3,100-14-1,DTXSID4025745,4-Nitrobenzyl chloride,EPA_DSSTox,[O-][N+](=O)C1C=CC(CCl)=CC=1,?,InChI=1S/C7H6ClNO2/c8-5-6-1-3-7(4-2-6)9(10)11/...,KGCNHWXDPDPSBV-UHFFFAOYSA-N,0.0,0.0,1809.0,2.0,3.0,molid4,,O=[N+]([O-])c1ccc(CCl)cc1,7.500529
4,100-15-2,DTXSID7025635,N-Methyl-4-nitroaniline,EPA_DSSTox,CNC1C=CC(=CC=1)[N+]([O-])=O,?,InChI=1S/C7H8N2O2/c1-8-6-2-4-7(5-3-6)9(10)11/h...,XIFJZJPMHNUGRA-UHFFFAOYSA-N,0.0,1.0,,2.0,4.0,molid5,,CNc1ccc([N+](=O)[O-])cc1,


# Deal with duplicate entries

In [14]:
# Flag every row whose canonical SMILES occurs more than once; keep=False marks
# all copies, not just the repeats
mask = df.duplicated(subset=['smiles'], keep=False)

In [15]:
# Partition the table: rows sharing a SMILES with another row vs. all the rest
df_dup = df.loc[mask]
df_uni = df.loc[~mask]
print(df.shape, df_uni.shape, df_dup.shape)

(8994, 17) (8112, 17) (882, 17)


In [16]:
# Compute mean of duplicate entries, one row per canonical SMILES.
# The label columns are selected explicitly: the frame also holds text columns
# (names, identifiers, ...), and pandas >= 2.0 raises TypeError when asked for
# the mean of those instead of silently dropping them as older versions did.
# NOTE: 'logld50' is averaged directly, so it is the mean of the logs and not
# the log of the averaged 'ld50'.
label_columns = ['verytoxic', 'nontoxic', 'ld50', 'epa', 'ghs', 'logld50']
avg_df = df_dup.groupby('smiles', as_index=False)[label_columns].mean()
avg_df.head(25)

Unnamed: 0,smiles,verytoxic,nontoxic,ld50,epa,ghs,logld50
0,BrCC=CCBr,0.0,0.0,68.5,1.0,2.0,4.222311
1,C,0.0,1.0,,2.0,4.0,
2,C#N,0.666667,0.0,44.813333,0.333333,1.333333,2.761384
3,C1CC2OC2CC1C1CO1,0.0,1.0,2480.0,2.0,4.0,7.805955
4,C1CCC(NC2CCCCC2)CC1,0.0,0.5,1476.5,1.5,3.5,6.888562
5,C1CCNCC1,0.0,0.0,266.5,1.0,2.5,5.440907
6,C1OC1C1CO1,0.0,0.0,144.0,1.0,2.0,4.851908
7,C=C(C)C1CC=C(C)C(=O)C1,0.0,0.5,3270.0,2.0,3.5,7.949721
8,C=C(C)C1CC=C(C)CC1,0.0,1.0,4700.5,2.5,4.0,8.453376
9,C=CC(=O)NCC(OC)OC,0.0,0.0,1199.0,2.0,3.0,7.089243


In [17]:
# Drop unreliable labels: entries whose averaged label sits exactly halfway
# between two classes. The shape is printed at the start and again after each
# label column has been filtered.
halfway_values = [
    ("verytoxic", [0.5]),
    ("nontoxic", [0.5]),
    ("epa", [0.5, 1.5, 2.5, 3.5]),
    ("ghs", [0.5, 1.5, 2.5, 3.5, 4.5]),
]
print(avg_df.shape)
for label, ties in halfway_values:
    avg_df = avg_df[~avg_df[label].isin(ties)]
    print(avg_df.shape)
avg_df.head(25)

(401, 7)
(396, 7)
(347, 7)
(291, 7)
(268, 7)


Unnamed: 0,smiles,verytoxic,nontoxic,ld50,epa,ghs,logld50
0,BrCC=CCBr,0.0,0.0,68.5,1.0,2.0,4.222311
1,C,0.0,1.0,,2.0,4.0,
2,C#N,0.666667,0.0,44.813333,0.333333,1.333333,2.761384
3,C1CC2OC2CC1C1CO1,0.0,1.0,2480.0,2.0,4.0,7.805955
6,C1OC1C1CO1,0.0,0.0,144.0,1.0,2.0,4.851908
9,C=CC(=O)NCC(OC)OC,0.0,0.0,1199.0,2.0,3.0,7.089243
10,C=CC1CN2CCC1CC2C(O)c1ccnc2ccc(OC)cc12,0.0,0.0,382.0,1.333333,2.333333,5.858009
11,C=CCC1=C(C)C(OC(=O)C2C(C=C(C)C)C2(C)C)CC1=O,0.0,0.0,811.666667,1.666667,3.0,6.592864
14,C=CC[N+](C)(C)CC=C,0.0,1.0,2760.0,2.0,4.0,7.919191
17,CC(=O)OC1CC(C)CCC1C(C)C,0.0,1.0,7620.0,3.0,4.0,8.938532


In [18]:
# Snap the averaged class labels to whole numbers (nearest category);
# 'ld50' and 'logld50' are not rounded
class_columns = ['verytoxic', 'nontoxic', 'epa', 'ghs']
avg_df = avg_df.round(dict.fromkeys(class_columns, 0))
avg_df.head(25)

Unnamed: 0,smiles,verytoxic,nontoxic,ld50,epa,ghs,logld50
0,BrCC=CCBr,0.0,0.0,68.5,1.0,2.0,4.222311
1,C,0.0,1.0,,2.0,4.0,
2,C#N,1.0,0.0,44.813333,0.0,1.0,2.761384
3,C1CC2OC2CC1C1CO1,0.0,1.0,2480.0,2.0,4.0,7.805955
6,C1OC1C1CO1,0.0,0.0,144.0,1.0,2.0,4.851908
9,C=CC(=O)NCC(OC)OC,0.0,0.0,1199.0,2.0,3.0,7.089243
10,C=CC1CN2CCC1CC2C(O)c1ccnc2ccc(OC)cc12,0.0,0.0,382.0,1.0,2.0,5.858009
11,C=CCC1=C(C)C(OC(=O)C2C(C=C(C)C)C2(C)C)CC1=O,0.0,0.0,811.666667,2.0,3.0,6.592864
14,C=CC[N+](C)(C)CC=C,0.0,1.0,2760.0,2.0,4.0,7.919191
17,CC(=O)OC1CC(C)CCC1C(C)C,0.0,1.0,7620.0,3.0,4.0,8.938532


In [19]:
# Swap the per-row labels of the duplicated entries for their group averages,
# then keep a single row per SMILES
print(df_dup.shape)
per_row_labels = ['verytoxic', 'nontoxic', 'epa', 'ghs', 'ld50', 'logld50']
df_dup = df_dup.drop(columns=per_row_labels)
# Right join on 'smiles': the row set is driven by the SMILES present in avg_df
df_dup = df_dup.merge(avg_df, how="right", on=["smiles"])
print(df_dup.shape)
# The first row of each SMILES group supplies the remaining (non-label) columns
df_dup = df_dup.drop_duplicates(subset=['smiles'], keep="first")
print(df_dup.shape)

(882, 17)
(615, 17)
(268, 17)


In [20]:
# Preview the first five de-duplicated rows (head() defaults to n=5)
df_dup.head()

Unnamed: 0,CASRN,DTXSID,Chemical_Name,Structure_Source,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI_Key_QSARr,id,Extraneous_SMILES,smiles,verytoxic,nontoxic,ld50,epa,ghs,logld50
0,100-35-6,DTXSID8043868,"Ethanamine, 2-chloro-N,N-diethyl-",EPA_DSSTox,CCN(CCCl)CC,?,"InChI=1S/C6H14ClN/c1-3-8(4-2)6-5-7/h3-6H2,1-2H3",YMDNODNLFSHHCV-UHFFFAOYSA-N,molid12,,CCN(CC)CCCl,1.0,0.0,33.5,0.0,1.0,3.372618
2,1003-40-3,DTXSID70143115,"Pyridine, 4-amino-, hydrochloride",EPA_DSSTox,NC1C=CN=CC=1,Cl,"InChI=1S/C5H6N2/c6-5-1-3-7-4-2-5/h1-4H,(H2,6,7)",NUKYPUAOHBNCPY-UHFFFAOYSA-N,molid52,,Nc1ccncc1,1.0,0.0,23.0,0.0,1.0,3.052397
4,10049-60-2,,,Public_CrossChecked,CCC(C)N,[Cl-],"InChI=1S/C4H11N/c1-3-4(2)5/h4H,3,5H2,1-2H3",BHRZNVHARXXAHW-UHFFFAOYSA-N,molid63,,CCC(C)N,0.0,0.0,1745.333333,2.0,3.0,6.971154
10,10061-01-5,DTXSID1032305,(Z)-Dichloropropene,EPA_DSSTox,ClC=CCCl,?,"InChI=1S/C3H4Cl2/c4-2-1-3-5/h1-2H,3H2",UOORRWUZONOOLO-UHFFFAOYSA-N,molid70,,ClC=CCCl,0.0,0.0,96.0,1.0,2.0,4.552809
12,10326-41-7,DTXSID0047030,D-Lactic acid,EPA_DSSTox,CC(O)C(O)=O,?,"InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)",JVTAAEKCZFNVCJ-UHFFFAOYSA-N,molid256,,CC(O)C(=O)O,0.0,1.0,3182.25,2.0,4.0,8.04197


In [21]:
# Stack the averaged, de-duplicated entries on top of the originally unique ones
df2 = pd.concat([df_dup, df_uni], axis="index")
print(df2.shape)
# Number of distinct SMILES in the combined table, then in the full table
print(df2['smiles'].unique().shape)
print(df['smiles'].unique().shape)

(8380, 17)
(8380,)
(8513,)


In [22]:
# Renumber the rows 0..n-1, discarding the index labels carried over from the concat
df2 = df2.reset_index(drop=True)
df2.head()

Unnamed: 0,CASRN,Canonical_QSARr,Chemical_Name,DTXSID,Extraneous_SMILES,InChI_Code_QSARr,InChI_Key_QSARr,Salt_Solvent,Structure_Source,epa,ghs,id,ld50,logld50,nontoxic,smiles,verytoxic
0,100-35-6,CCN(CCCl)CC,"Ethanamine, 2-chloro-N,N-diethyl-",DTXSID8043868,,"InChI=1S/C6H14ClN/c1-3-8(4-2)6-5-7/h3-6H2,1-2H3",YMDNODNLFSHHCV-UHFFFAOYSA-N,?,EPA_DSSTox,0.0,1.0,molid12,33.5,3.372618,0.0,CCN(CC)CCCl,1.0
1,1003-40-3,NC1C=CN=CC=1,"Pyridine, 4-amino-, hydrochloride",DTXSID70143115,,"InChI=1S/C5H6N2/c6-5-1-3-7-4-2-5/h1-4H,(H2,6,7)",NUKYPUAOHBNCPY-UHFFFAOYSA-N,Cl,EPA_DSSTox,0.0,1.0,molid52,23.0,3.052397,0.0,Nc1ccncc1,1.0
2,10049-60-2,CCC(C)N,,,,"InChI=1S/C4H11N/c1-3-4(2)5/h4H,3,5H2,1-2H3",BHRZNVHARXXAHW-UHFFFAOYSA-N,[Cl-],Public_CrossChecked,2.0,3.0,molid63,1745.333333,6.971154,0.0,CCC(C)N,0.0
3,10061-01-5,ClC=CCCl,(Z)-Dichloropropene,DTXSID1032305,,"InChI=1S/C3H4Cl2/c4-2-1-3-5/h1-2H,3H2",UOORRWUZONOOLO-UHFFFAOYSA-N,?,EPA_DSSTox,1.0,2.0,molid70,96.0,4.552809,0.0,ClC=CCCl,0.0
4,10326-41-7,CC(O)C(O)=O,D-Lactic acid,DTXSID0047030,,"InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)",JVTAAEKCZFNVCJ-UHFFFAOYSA-N,?,EPA_DSSTox,2.0,4.0,molid256,3182.25,8.04197,1.0,CC(O)C(=O)O,0.0


In [23]:
# Write the de-duplicated table to disk (no row index)
final_output_path = homedir + "tox_niehs_all.csv"
df2.to_csv(final_output_path, index=False)