In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from rdkit import Chem
%matplotlib inline

In [2]:
# Data directory under the user's home. The trailing slash is kept on purpose:
# file names are appended to `homedir` by plain string concatenation.
homedir = os.path.expanduser("~/") + "AIChem/chemnet/chemnet/data/"
validation_path = homedir + "validationset.txt"
# The validation set is a tab-separated text file.
df = pd.read_csv(validation_path, sep='\t')

In [3]:
# Add unique alphanumeric identifier: "testid1", "testid2", ... (1-based, in row order)
row_count = len(df.index)
df['id'] = ['testid' + str(row_number) for row_number in range(1, row_count + 1)]
print(df.shape)

(2895, 16)


In [4]:
df.head(5)

Unnamed: 0,RowID,CASRN,DTXSID,Name,Structure_Source,SMILES_Original,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI Key_QSARr,LD50_mgkg,EPA_category,GHS_category,very_toxic,nontoxic,id
0,1001,130209-82-4,DTXSID1041057,Latanoprost,EPA_DSSTox,CC(C)OC(=O)CCC\C=C/C[C@H]1[C@@H](O)C[C@@H](O)[...,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCC1=CC=C...,?,InChI=1S/C26H40O5/c1-19(2)31-26(30)13-9-4-3-8-...,GGXICVAJURFBLW-UHFFFAOYSA-N,,1.0,1.0,True,False,testid1
1,1002,123253-00-9,DTXSID10153915,"2H-Thieno(2,3-e)-1,2-thiazine-3-carboxamide, 6...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(NC2=CC=CC=N2)C2=C(C=...,CN1C(=C(NC2C=CC=CN=2)C2SC(Cl)=CC=2S1(=O)=O)C(=...,?,InChI=1S/C18H14ClN5O3S2/c1-24-16(18(25)23-14-7...,WMANCVNHWLMOGE-UHFFFAOYSA-N,,1.0,2.0,True,False,testid2
2,1003,123253-04-3,DTXSID90153918,"Propanoic acid, 2,2-dimethyl-, 2-methyl-3-((2-...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(OC(=O)C(C)(C)C)C2=C(...,CN1C(=C(OC(=O)C(C)(C)C)C2SC=CC=2S1(=O)=O)C(=O)...,?,"InChI=1S/C18H19N3O5S2/c1-18(2,3)17(23)26-14-13...",TWGBQNRSOGHBMD-UHFFFAOYSA-N,,1.0,2.0,True,False,testid3
3,1004,14255-87-9,DTXSID0045410,Parbendazole,EPA_DSSTox,CCCCC1=CC2=C(NC(NC(=O)OC)=N2)C=C1,CCCCC1=CC2N=C(NC=2C=C1)NC(=O)OC,?,InChI=1S/C13H17N3O2/c1-3-4-5-9-6-7-10-11(8-9)1...,YRWLZFXJFBZBEY-UHFFFAOYSA-N,,1.0,2.0,True,False,testid4
4,1005,50-56-6,DTXSID8048361,Oxytocin,EPA_DSSTox,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...,CC(C)CC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC...,?,InChI=1S/C43H66N12O12S2/c1-5-22(4)35-42(66)49-...,XNOPRXBHLZRZKH-UHFFFAOYSA-N,,1.0,2.0,True,False,testid5


# Ensure all SMILES are valid, and canonicalize them

In [5]:
# Remove extraneous SMILES entry.
# Split Canonical_QSARr at the first space only: the leading token becomes
# `pre_smiles`, anything after it is kept in `Extraneous_SMILES`.
smiles_parts = (
    df['Canonical_QSARr']
    # `n` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
    .str.split(' ', n=1, expand=True)
    .rename(columns={0: 'pre_smiles', 1: 'Extraneous_SMILES'})
)
df = df.join(smiles_parts)
df.head(5)

Unnamed: 0,RowID,CASRN,DTXSID,Name,Structure_Source,SMILES_Original,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI Key_QSARr,LD50_mgkg,EPA_category,GHS_category,very_toxic,nontoxic,id,pre_smiles,Extraneous_SMILES
0,1001,130209-82-4,DTXSID1041057,Latanoprost,EPA_DSSTox,CC(C)OC(=O)CCC\C=C/C[C@H]1[C@@H](O)C[C@@H](O)[...,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCC1=CC=C...,?,InChI=1S/C26H40O5/c1-19(2)31-26(30)13-9-4-3-8-...,GGXICVAJURFBLW-UHFFFAOYSA-N,,1.0,1.0,True,False,testid1,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCC1=CC=C...,
1,1002,123253-00-9,DTXSID10153915,"2H-Thieno(2,3-e)-1,2-thiazine-3-carboxamide, 6...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(NC2=CC=CC=N2)C2=C(C=...,CN1C(=C(NC2C=CC=CN=2)C2SC(Cl)=CC=2S1(=O)=O)C(=...,?,InChI=1S/C18H14ClN5O3S2/c1-24-16(18(25)23-14-7...,WMANCVNHWLMOGE-UHFFFAOYSA-N,,1.0,2.0,True,False,testid2,CN1C(=C(NC2C=CC=CN=2)C2SC(Cl)=CC=2S1(=O)=O)C(=...,
2,1003,123253-04-3,DTXSID90153918,"Propanoic acid, 2,2-dimethyl-, 2-methyl-3-((2-...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(OC(=O)C(C)(C)C)C2=C(...,CN1C(=C(OC(=O)C(C)(C)C)C2SC=CC=2S1(=O)=O)C(=O)...,?,"InChI=1S/C18H19N3O5S2/c1-18(2,3)17(23)26-14-13...",TWGBQNRSOGHBMD-UHFFFAOYSA-N,,1.0,2.0,True,False,testid3,CN1C(=C(OC(=O)C(C)(C)C)C2SC=CC=2S1(=O)=O)C(=O)...,
3,1004,14255-87-9,DTXSID0045410,Parbendazole,EPA_DSSTox,CCCCC1=CC2=C(NC(NC(=O)OC)=N2)C=C1,CCCCC1=CC2N=C(NC=2C=C1)NC(=O)OC,?,InChI=1S/C13H17N3O2/c1-3-4-5-9-6-7-10-11(8-9)1...,YRWLZFXJFBZBEY-UHFFFAOYSA-N,,1.0,2.0,True,False,testid4,CCCCC1=CC2N=C(NC=2C=C1)NC(=O)OC,
4,1005,50-56-6,DTXSID8048361,Oxytocin,EPA_DSSTox,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...,CC(C)CC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC...,?,InChI=1S/C43H66N12O12S2/c1-5-22(4)35-42(66)49-...,XNOPRXBHLZRZKH-UHFFFAOYSA-N,,1.0,2.0,True,False,testid5,CC(C)CC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC...,


In [6]:
# Check for invalid SMILES: parse every entry and count the ones for which
# MolFromSmiles returned None.
mol_list = list(map(Chem.MolFromSmiles, df['pre_smiles']))
invalid = sum(1 for parsed in mol_list if parsed is None)
print(f"No. of invalid entries: {invalid}")

No. of invalid entries: 0


In [7]:
# Canonicalize SMILES: round-trip every string through an RDKit Mol object.
# Iterate over the column values directly instead of iterrows() + label lookup;
# this avoids building a row Series per record and does not depend on the
# index labels being unique. Result order follows the row order of `df`.
newdf = [
    Chem.MolToSmiles(Chem.MolFromSmiles(smiles_string))
    for smiles_string in df['pre_smiles']
]

In [8]:
# Replace SMILES with canonicalized versions.
# The shape is printed before the new column is attached, after it is attached,
# and after the raw `pre_smiles` column is dropped.
canonical_values = np.asarray(newdf)
add_df = pd.DataFrame(canonical_values, columns=["smiles"])
print(df.shape)
# Column-wise concat aligns on the index of both frames.
df = pd.concat([df, add_df], axis="columns")
print(df.shape)
df = df.drop(columns=['pre_smiles'])
print(df.shape)

(2895, 18)
(2895, 19)
(2895, 18)


# Standardize labels

In [9]:
# Rename columns to short label names
label_names = {
    'very_toxic': 'verytoxic',
    'nontoxic': 'nontoxic',  # identity entry: the name is unchanged
    'EPA_category': 'epa',
    'GHS_category': 'ghs',
    'LD50_mgkg': 'ld50',
}
df = df.rename(columns=label_names)
df.head(5)

Unnamed: 0,RowID,CASRN,DTXSID,Name,Structure_Source,SMILES_Original,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI Key_QSARr,ld50,epa,ghs,verytoxic,nontoxic,id,Extraneous_SMILES,smiles
0,1001,130209-82-4,DTXSID1041057,Latanoprost,EPA_DSSTox,CC(C)OC(=O)CCC\C=C/C[C@H]1[C@@H](O)C[C@@H](O)[...,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCC1=CC=C...,?,InChI=1S/C26H40O5/c1-19(2)31-26(30)13-9-4-3-8-...,GGXICVAJURFBLW-UHFFFAOYSA-N,,1.0,1.0,True,False,testid1,,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1
1,1002,123253-00-9,DTXSID10153915,"2H-Thieno(2,3-e)-1,2-thiazine-3-carboxamide, 6...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(NC2=CC=CC=N2)C2=C(C=...,CN1C(=C(NC2C=CC=CN=2)C2SC(Cl)=CC=2S1(=O)=O)C(=...,?,InChI=1S/C18H14ClN5O3S2/c1-24-16(18(25)23-14-7...,WMANCVNHWLMOGE-UHFFFAOYSA-N,,1.0,2.0,True,False,testid2,,CN1C(C(=O)Nc2ccccn2)=C(Nc2ccccn2)c2sc(Cl)cc2S1...
2,1003,123253-04-3,DTXSID90153918,"Propanoic acid, 2,2-dimethyl-, 2-methyl-3-((2-...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(OC(=O)C(C)(C)C)C2=C(...,CN1C(=C(OC(=O)C(C)(C)C)C2SC=CC=2S1(=O)=O)C(=O)...,?,"InChI=1S/C18H19N3O5S2/c1-18(2,3)17(23)26-14-13...",TWGBQNRSOGHBMD-UHFFFAOYSA-N,,1.0,2.0,True,False,testid3,,CN1C(C(=O)Nc2ccccn2)=C(OC(=O)C(C)(C)C)c2sccc2S...
3,1004,14255-87-9,DTXSID0045410,Parbendazole,EPA_DSSTox,CCCCC1=CC2=C(NC(NC(=O)OC)=N2)C=C1,CCCCC1=CC2N=C(NC=2C=C1)NC(=O)OC,?,InChI=1S/C13H17N3O2/c1-3-4-5-9-6-7-10-11(8-9)1...,YRWLZFXJFBZBEY-UHFFFAOYSA-N,,1.0,2.0,True,False,testid4,,CCCCc1ccc2[nH]c(NC(=O)OC)nc2c1
4,1005,50-56-6,DTXSID8048361,Oxytocin,EPA_DSSTox,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...,CC(C)CC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC...,?,InChI=1S/C43H66N12O12S2/c1-5-22(4)35-42(66)49-...,XNOPRXBHLZRZKH-UHFFFAOYSA-N,,1.0,2.0,True,False,testid5,,CCC(C)C1NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(N)CSSCC(C...


In [10]:
# Replace T/F with integers (False -> 0, True -> 1).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `df[col]`: an in-place method on a column
# selection acts on a temporary object, which raises a FutureWarning on
# pandas 2.x and leaves `df` unchanged under Copy-on-Write.
for flag_col in ('verytoxic', 'nontoxic'):
    df[flag_col] = (
        df[flag_col]
        .replace(False, 0)
        .replace(True, 1)
        # Keep the column numeric if replace() leaves it as object dtype.
        .infer_objects()
    )

In [11]:
# Rename EPA/GHS category to start from zero (shift every value down by one).
# Note: this is not idempotent; re-running the cell shifts the values again.
for category_col in ('epa', 'ghs'):
    df[category_col] = df[category_col] - 1

In [12]:
# Apply log transformation to ld50 (np.log is the natural logarithm);
# the result is stored in a new `logld50` column.
ld50_values = df['ld50']
df['logld50'] = np.log(ld50_values)

In [13]:
# Persist the processed table next to the input file (row index is not written),
# then preview the first rows.
output_path = homedir + "tox_niehs_ext_raw.csv"
df.to_csv(output_path, index=False)
df.head(5)

Unnamed: 0,RowID,CASRN,DTXSID,Name,Structure_Source,SMILES_Original,Canonical_QSARr,Salt_Solvent,InChI_Code_QSARr,InChI Key_QSARr,ld50,epa,ghs,verytoxic,nontoxic,id,Extraneous_SMILES,smiles,logld50
0,1001,130209-82-4,DTXSID1041057,Latanoprost,EPA_DSSTox,CC(C)OC(=O)CCC\C=C/C[C@H]1[C@@H](O)C[C@@H](O)[...,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCC1=CC=C...,?,InChI=1S/C26H40O5/c1-19(2)31-26(30)13-9-4-3-8-...,GGXICVAJURFBLW-UHFFFAOYSA-N,,0.0,0.0,1.0,0.0,testid1,,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,
1,1002,123253-00-9,DTXSID10153915,"2H-Thieno(2,3-e)-1,2-thiazine-3-carboxamide, 6...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(NC2=CC=CC=N2)C2=C(C=...,CN1C(=C(NC2C=CC=CN=2)C2SC(Cl)=CC=2S1(=O)=O)C(=...,?,InChI=1S/C18H14ClN5O3S2/c1-24-16(18(25)23-14-7...,WMANCVNHWLMOGE-UHFFFAOYSA-N,,0.0,1.0,1.0,0.0,testid2,,CN1C(C(=O)Nc2ccccn2)=C(Nc2ccccn2)c2sc(Cl)cc2S1...,
2,1003,123253-04-3,DTXSID90153918,"Propanoic acid, 2,2-dimethyl-, 2-methyl-3-((2-...",EPA_DSSTox,CN1C(C(=O)NC2=CC=CC=N2)=C(OC(=O)C(C)(C)C)C2=C(...,CN1C(=C(OC(=O)C(C)(C)C)C2SC=CC=2S1(=O)=O)C(=O)...,?,"InChI=1S/C18H19N3O5S2/c1-18(2,3)17(23)26-14-13...",TWGBQNRSOGHBMD-UHFFFAOYSA-N,,0.0,1.0,1.0,0.0,testid3,,CN1C(C(=O)Nc2ccccn2)=C(OC(=O)C(C)(C)C)c2sccc2S...,
3,1004,14255-87-9,DTXSID0045410,Parbendazole,EPA_DSSTox,CCCCC1=CC2=C(NC(NC(=O)OC)=N2)C=C1,CCCCC1=CC2N=C(NC=2C=C1)NC(=O)OC,?,InChI=1S/C13H17N3O2/c1-3-4-5-9-6-7-10-11(8-9)1...,YRWLZFXJFBZBEY-UHFFFAOYSA-N,,0.0,1.0,1.0,0.0,testid4,,CCCCc1ccc2[nH]c(NC(=O)OC)nc2c1,
4,1005,50-56-6,DTXSID8048361,Oxytocin,EPA_DSSTox,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...,CC(C)CC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC...,?,InChI=1S/C43H66N12O12S2/c1-5-22(4)35-42(66)49-...,XNOPRXBHLZRZKH-UHFFFAOYSA-N,,0.0,1.0,1.0,0.0,testid5,,CCC(C)C1NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(N)CSSCC(C...,
