In [1]:
import pandas as pd
import numpy as np
from molvs import standardize_smiles

### Generate train data and labels (without standardizing)

In [2]:
# read tox21 train dataset with labels
# selected nr_ahr assay for final project
nr_ahr = pd.read_csv('../raw_data/nr-ahr.smiles', sep='\t', header=None)
nr_ahr = nr_ahr.rename(columns={0: 'compounds', 1: 'id', 2: 'label'})
nr_ahr.head()

Unnamed: 0,compounds,id,label
0,CC(O)=O.[H][C@@]12CCC3=CC(=CC=C3[C@@]1(C)CCC[C...,NCGC00255644-01,0
1,Cl.C[C@@H](NCCCC1=CC=CC(=C1)C(F)(F)F)C2=CC=CC3...,NCGC00181002-01,0
2,CC(C)OC(=O)C1=C(C)NC(N)=C(C1C2=CC(=CC=C2)[N+](...,NCGC00167436-01,0
3,Cl.CN(C)C(=O)C1(CCN(CCC2(CN(CCO2)C(=O)C3=CC=CC...,NCGC00254013-01,0
4,Cl.CCOC(=O)O[C@H](C)OC(=O)C1=CC=C2N(CC3=NOC(=C...,NCGC00254071-01,0


In [3]:
# check train dataset for duplicates
print('Number of compounds in raw dataset:', len(nr_ahr['compounds']))
print('Number of unique compounds:', len(nr_ahr['compounds'].unique()))

Number of compounds in raw dataset: 8169
Number of unique compounds: 6716


In [4]:
# remove duplicates
nr_ahr = nr_ahr.drop_duplicates(subset='compounds').reset_index(drop=True)
len(nr_ahr)

6716

In [5]:
# check label column for inconclusive or tests not run (x)
nr_ahr.label.unique()

array([0, 1])

In [6]:
# generate duplicate-free training dataset csv file
#nr_ahr.to_csv('../processed_data/nr_ahr_train.csv')

In [7]:
# split into train_data and train_labels
train_data = nr_ahr['compounds']
train_labels = nr_ahr['label']

print(len(train_data))
print(len(train_labels))

6716
6716


### Standardize train data using MolVS

In [8]:
# identify rows with SMILES errors
train_error_rows = []

for i, row in enumerate(nr_ahr.compounds):
    try:
        standardize_smiles(row)
    except Exception as e: 
        print('Error at index {}: {!r}'.format(i, row))
        print(e)
        train_error_rows.append(i)

print()
print('Number of rows with errors:', len(train_error_rows))

RDKit ERROR: [10:32:27] Explicit valence for atom # 3 Si, 8, is greater than permitted


Error at index 1390: '[NH4+].[NH4+].F[Si--](F)(F)(F)(F)F'
Explicit valence for atom # 3 Si, 8, is greater than permitted


RDKit ERROR: [10:32:29] Explicit valence for atom # 0 Cl, 2, is greater than permitted


Error at index 2245: '[Cl-][Pt]1([Cl-])NCCN1'
Explicit valence for atom # 0 Cl, 2, is greater than permitted


RDKit ERROR: [10:32:29] Can't kekulize mol.  Unkekulized atoms: 3 10
RDKit ERROR: 
RDKit ERROR: [10:32:30] Explicit valence for atom # 2 Cl, 2, is greater than permitted


Error at index 3758: '[NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-]'
Explicit valence for atom # 2 Cl, 2, is greater than permitted


RDKit ERROR: [10:32:32] Explicit valence for atom # 3 Si, 8, is greater than permitted


Error at index 5524: '[Na+].[Na+].F[Si--](F)(F)(F)(F)F'
Explicit valence for atom # 3 Si, 8, is greater than permitted


RDKit ERROR: [10:32:33] Explicit valence for atom # 7 Mg, 4, is greater than permitted


Error at index 6369: 'O.O.O.O.O=C1O[Mg]2(OC(=O)C3=CC=CC=C3O2)OC4=CC=CC=C14'
Explicit valence for atom # 7 Mg, 4, is greater than permitted

Number of rows with errors: 5


In [9]:
# add error rows that were missed by for loop
train_error_rows.append(2888)
train_error_rows.append(4194)

In [10]:
# remove rows with SMILES errors
train_drop_rows = [nr_ahr.index[i] for i in train_error_rows]
print(len(train_drop_rows))

nr_ahr_std = nr_ahr.drop(train_drop_rows).reset_index(drop=True)
print(len(nr_ahr_std))

7
6709


In [11]:
# standardize compound SMILES using MolVS
nr_ahr_std['std_compounds'] = nr_ahr_std.compounds.map(lambda x: standardize_smiles(x))
print(len(nr_ahr_std))
nr_ahr_std.head()

6709


Unnamed: 0,compounds,id,label,std_compounds
0,CC(O)=O.[H][C@@]12CCC3=CC(=CC=C3[C@@]1(C)CCC[C...,NCGC00255644-01,0,CC(=O)O.CC(C)c1ccc2c(c1)CC[C@@H]1[C@]2(C)CCC[C...
1,Cl.C[C@@H](NCCCC1=CC=CC(=C1)C(F)(F)F)C2=CC=CC3...,NCGC00181002-01,0,C[C@@H](NCCCc1cccc(C(F)(F)F)c1)c1cccc2ccccc12.Cl
2,CC(C)OC(=O)C1=C(C)NC(N)=C(C1C2=CC(=CC=C2)[N+](...,NCGC00167436-01,0,CC1=C(C(=O)OC(C)C)C(c2cccc([N+](=O)[O-])c2)C(C...
3,Cl.CN(C)C(=O)C1(CCN(CCC2(CN(CCO2)C(=O)C3=CC=CC...,NCGC00254013-01,0,CN(C)C(=O)C1(N2CCCCC2)CCN(CCC2(c3ccc(Cl)c(Cl)c...
4,Cl.CCOC(=O)O[C@H](C)OC(=O)C1=CC=C2N(CC3=NOC(=C...,NCGC00254071-01,0,CCOC(=O)O[C@H](C)OC(=O)c1ccc2c(c1)cc(C(=O)NC1C...


In [12]:
# generate standardized training dataset csv file
#nr_ahr_std.to_csv('../processed_data/nr_ahr_std_train.csv')

In [12]:
# split into train_data and train_labels
train_data_std = nr_ahr_std['compounds']
train_labels_std = nr_ahr_std['label']

print(len(train_data_std))
print(len(train_labels_std))

6709
6709


### Generate test data and labels (without standardizing)

In [15]:
# read tox21 score dataset
score = pd.read_csv('../raw_data/tox21_10k_challenge_score.smiles', sep='\t')
score = score.rename(columns={'#SMILES': 'compounds', 'Sample ID': 'id'})
score.head()

Unnamed: 0,compounds,id
0,OC(=O)\C=C/C(O)=O.C[C@]12CC=C3[C@@H](CCC4=CC(=...,NCGC00261900-01
1,[Na+].NC1=NC=NC2=C1N=C(Br)N2C1OC2CO[P@]([O-])(...,NCGC00260869-01
2,O=C1N2CCC3=C(NC4=C3C=CC=C4)C2=NC2=C1C=CC=C2,NCGC00261776-01
3,Cl.FC1=CC=C(C=C1)C(OCCCC1=CNC=N1)C1=CC=C(F)C=C1,NCGC00261380-01
4,CC1=CC=C(C=C1)S(=O)(=O)N[C@@H](CC1=CC=CC=C1)C(...,NCGC00261842-01


In [17]:
# read tox21 score labels
score_results = pd.read_csv('../raw_data/tox21_10k_challenge_score.txt', sep='\t')
score_results.head()

Unnamed: 0,Sample ID,NR-AhR,NR-AR,NR-AR-LBD,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,NCGC00261900-01,0,1,x,0,0,0,0,x,0,0,x,0
1,NCGC00260869-01,0,1,x,x,0,0,0,0,0,0,0,0
2,NCGC00261776-01,1,1,0,x,1,0,0,1,1,0,1,0
3,NCGC00261380-01,x,0,x,1,0,x,x,1,0,x,0,x
4,NCGC00261842-01,0,0,0,x,0,0,0,0,0,0,x,1


In [18]:
# pick out NR-AhR labels
score_results = score_results[['Sample ID', 'NR-AhR']]
score_results = score_results.rename(columns={'Sample ID': 'id', 'NR-AhR': 'label'})
score_results.head()

Unnamed: 0,id,label
0,NCGC00261900-01,0
1,NCGC00260869-01,0
2,NCGC00261776-01,1
3,NCGC00261380-01,x
4,NCGC00261842-01,0


In [19]:
# merge score dataset with NR-AhR labels using compound ID
nr_ahr_test = score.merge(score_results, on='id')
nr_ahr_test.head()

Unnamed: 0,compounds,id,label
0,OC(=O)\C=C/C(O)=O.C[C@]12CC=C3[C@@H](CCC4=CC(=...,NCGC00261900-01,0
1,[Na+].NC1=NC=NC2=C1N=C(Br)N2C1OC2CO[P@]([O-])(...,NCGC00260869-01,0
2,O=C1N2CCC3=C(NC4=C3C=CC=C4)C2=NC2=C1C=CC=C2,NCGC00261776-01,1
3,Cl.FC1=CC=C(C=C1)C(OCCCC1=CNC=N1)C1=CC=C(F)C=C1,NCGC00261380-01,x
4,CC1=CC=C(C=C1)S(=O)(=O)N[C@@H](CC1=CC=CC=C1)C(...,NCGC00261842-01,0


In [20]:
# check test dataset for duplicates
print('Number of compounds in raw dataset:', len(nr_ahr_test['compounds']))
print('Number of unique compounds:', len(nr_ahr_test['compounds'].unique()))

Number of compounds in raw dataset: 647
Number of unique compounds: 646


In [21]:
# remove duplicates
nr_ahr_test = nr_ahr_test.drop_duplicates(subset='compounds').reset_index(drop=True)
len(nr_ahr_test)

646

In [22]:
# check label column for inconclusive/tests not run (x)
nr_ahr_test.label.unique()

array(['0', '1', 'x'], dtype=object)

In [23]:
# remove rows with inconclusive/tests not run (x)
nr_ahr_test = nr_ahr_test[nr_ahr_test.label != 'x']
print(nr_ahr_test.label.unique())
print(len(nr_ahr_test))

['0' '1']
609


In [24]:
# generate duplicate-free training dataset csv file
#nr_ahr_test.to_csv('../processed_data/nr_ahr_test.csv')

In [25]:
# split into test_data and test_labels
test_data = nr_ahr_test['compounds']
test_labels = nr_ahr_test['label']

print(len(test_data))
print(len(test_labels))

609
609


### Standardize test data using MolVS

In [26]:
# identify rows with SMILES errors
test_error_rows = []

for i, row in enumerate(nr_ahr_test.compounds):
    try:
        standardize_smiles(row)
    except Exception as e: 
        print('Error at index {}: {!r}'.format(i, row))
        print(e)
        train_error_rows.append(i)

print()
print('Number of rows with errors:', len(test_error_rows))

RDKit ERROR: [10:35:47] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 10
RDKit ERROR: 
RDKit ERROR: [10:35:47] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10 11 12 13 14
RDKit ERROR: 


Error at index 303: 'FC(F)(F)c1nc(c(C#N)c1Br)C1=CC=C(Cl)C=C1'
Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 10

Error at index 344: 'C1=CC=C(C=C1)c1nc2ccccc2n1'
Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10 11 12 13 14


Number of rows with errors: 0


In [27]:
# add error rows that were missed by the for loop
test_error_rows = [303, 344]

In [28]:
# remove rows with SMILES errors
test_drop_rows = [nr_ahr_test.index[i] for i in test_error_rows]
print(len(test_drop_rows))

nr_ahr_test_std = nr_ahr_test.drop(test_drop_rows).reset_index(drop=True)

print(len(nr_ahr_test_std))

2
607


In [29]:
# standardize compound SMILES using MolVS
nr_ahr_test_std['std_compounds'] = nr_ahr_test_std.compounds.map(lambda x: standardize_smiles(x))
print(len(nr_ahr_test_std))
nr_ahr_test_std.head()

607


Unnamed: 0,compounds,id,label,std_compounds
0,OC(=O)\C=C/C(O)=O.C[C@]12CC=C3[C@@H](CCC4=CC(=...,NCGC00261900-01,0,C[C@]12C=CC(=O)C=C1CC[C@@H]1C2=CC[C@]2(C)[C@@H...
1,[Na+].NC1=NC=NC2=C1N=C(Br)N2C1OC2CO[P@]([O-])(...,NCGC00260869-01,0,Nc1ncnc2c1nc(Br)n2C1OC2CO[P@@](=O)([O-])O[C@@H...
2,O=C1N2CCC3=C(NC4=C3C=CC=C4)C2=NC2=C1C=CC=C2,NCGC00261776-01,1,O=c1c2ccccc2nc2n1CCc1c-2[nH]c2ccccc12
3,CC1=CC=C(C=C1)S(=O)(=O)N[C@@H](CC1=CC=CC=C1)C(...,NCGC00261842-01,0,Cc1ccc(S(=O)(=O)N[C@@H](Cc2ccccc2)C(=O)CCl)cc1
4,Cl.NC1=CC=C(C=C1)C1=NC2=CC=CC=C2C=C1,NCGC00261662-01,1,Cl.Nc1ccc(-c2ccc3ccccc3n2)cc1


In [30]:
# generate duplicate-free training dataset csv file
#nr_ahr_test_std.to_csv('../processed_data/nr_ahr_test_std.csv')

In [31]:
# split into test_data and test_labels
test_data_std = nr_ahr_test['compounds']
test_labels_std = nr_ahr_test['label']

print(len(test_data))
print(len(test_labels))

609
609
