In [1]:
import pandas as pd
import numpy as np
from rdkit.Chem import PandasTools
from rdkit import Chem
import molvs as mv

In [2]:
import time
from datetime import timedelta
# Wall-clock timestamp taken before the data is loaded; a later cell subtracts
# it from time.time() to report the elapsed run time.
start = time.time()

In [3]:
#All datasets are from: https://tripod.nih.gov/tox21/challenge/data.jsp
#Training and Leaderboard Datasets
df_train = PandasTools.LoadSDF('/ihome/gidakwo/ml_files/SMILES/Tox21/Tox21_sdf/tox21_10k_data_all.sdf')
df_leaderbd = PandasTools.LoadSDF('/ihome/gidakwo/ml_files/SMILES/Tox21/Tox21_sdf/tox21_10k_challenge_test.sdf')

#The twelve assay label columns plus the RDKit molecule column ('ROMol'),
#which is only needed long enough to derive a SMILES string from it.
keep_cols = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
            'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53', 'ROMol']

#Build the frame in a single chain: .assign() returns a new DataFrame, so the
#'smiles' column is never written into a column-subset of another frame
#(the pattern that triggers pandas' SettingWithCopyWarning).
df_train_lead = (
    pd.concat([df_train, df_leaderbd])[keep_cols]
    .assign(smiles=lambda frame: frame['ROMol'].apply(Chem.MolToSmiles))
    .drop(columns='ROMol')
)

In [4]:
#THE FINAL TEST SET
df_score_smiles = pd.read_csv('/ihome/gidakwo/ml_files/SMILES/Tox21/Tox21_sdf/tox21_10k_challenge_score.smiles', sep='\t')
df_score_results = pd.read_csv('/ihome/gidakwo/ml_files/SMILES/Tox21/Tox21_sdf/tox21_10k_challenge_score.txt', sep='\t')

#Stack the two tables and collapse them per 'Sample ID': GroupBy.first() keeps
#the first non-null value of every column, which joins the SMILES from one
#file with the assay results from the other.
#Values that are missing in both files are left as real NaN (not the string
#'NA') so they are recognised by isnull()/notna() like every other gap.
#NOTE(review): 'x' presumably marks "not tested" in the score file - confirm.
df_test = (
    pd.concat([df_score_smiles, df_score_results])
    .groupby('Sample ID', as_index=False, sort=False)
    .first()
    .drop('Sample ID', axis=1)
    .replace('x', np.nan)
    .rename(columns={'#SMILES': 'smiles'})
)

In [5]:
# Stack the training/leaderboard rows on top of the final test rows and give
# the combined frame a fresh 0..n-1 row index.
df_tox21 = pd.concat([df_train_lead, df_test], ignore_index=True)

In [6]:
#Function to get parent of a smiles
#Modified from: https://www.wildcardconsulting.dk/useful-information/a-deep-tox21-neural-network-with-rdkit-and-keras/
def get_parent_smile(smile):
    """Return the SMILES of the MolVS charge parent of ``smile``.

    Parameters
    ----------
    smile : str
        SMILES string of the compound to standardize.

    Returns
    -------
    str
        * the SMILES of ``Standardizer().charge_parent(mol)`` for an accepted
          molecule;
        * ``'problematic'`` when the molecule contains an atom other than
          H, B, C, N, O, F, Si, P, S, Cl, Se, Br or I that is bonded to at
          least one other atom;
        * the string ``'NaN'`` when the SMILES cannot be parsed or any step
          of the standardization raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # Unparsable SMILES: report it with the same sentinel as any other failure.
            return 'NaN'

        # Matches any bonded atom whose element is outside the allowed list above.
        unsupported_atom = Chem.MolFromSmarts("[!#1&!#5&!#6&!#7&!#8&!#9&!#14&!#15&!#16&!#17&!#34&!#35&!#53]~*")
        if mol.HasSubstructMatch(unsupported_atom):
            return 'problematic'

        parent = mv.Standardizer().charge_parent(mol)
        return Chem.MolToSmiles(parent)
    except Exception:
        # Best-effort by design: a single bad record must not abort the whole
        # DataFrame.apply(). Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit stop a long run.
        return 'NaN'
    
#Clean and standardize the data
def clean_data(data):
    """Drop unusable rows and add a standardized ``smiles_parent`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'smiles'`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``data`` whose ``'smiles'`` is not
        null and for which ``get_parent_smile`` did not return ``'NaN'``,
        with the extra column ``'smiles_parent'``. Rows flagged
        ``'problematic'`` are kept. The original row index is preserved.
    """
    #remove missing smiles; work on an explicit copy so the new column is not
    #assigned onto a slice of the caller's frame (SettingWithCopyWarning)
    cleaned = data.loc[data['smiles'].notna()].copy()

    #Standardize and get parent with molvs
    cleaned['smiles_parent'] = cleaned['smiles'].apply(get_parent_smile)

    #drop the rows that could not be parsed/standardized; copy again so the
    #caller receives an independent frame it can safely add columns to
    return cleaned.loc[cleaned['smiles_parent'] != 'NaN'].copy()

#Generate InchiKey
def inchikey_gen(smile):
    """Return the InChIKey for ``smile``, or ``'Failed'`` if it cannot be built.

    Parameters
    ----------
    smile : str
        SMILES string of the compound.

    Returns
    -------
    str
        The InChIKey derived via SMILES -> mol -> InChI -> InChIKey, or the
        string ``'Failed'`` when the SMILES cannot be parsed, a conversion
        step raises, or no key is produced.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # Unparsable SMILES: nothing to convert.
            return 'Failed'
        inchi_key = Chem.InchiToInchiKey(Chem.MolToInchi(mol))
    except Exception:
        # Best-effort by design so one bad record does not abort the whole
        # DataFrame.apply(); KeyboardInterrupt / SystemExit still propagate.
        return 'Failed'
    # NOTE(review): an empty or None key presumably means InChI generation
    # failed silently - map it to the same sentinel instead of leaking it.
    return inchi_key if inchi_key else 'Failed'

In [7]:
# Drop rows with missing or unparsable SMILES and add the 'smiles_parent'
# column (rows marked 'problematic' are still present after this step).
df_tox21 = clean_data(df_tox21)

In [8]:
# Flag the rows whose parent SMILES was marked 'problematic', keep their
# original SMILES for reference, report how many there are, then drop them.
is_problematic = df_tox21['smiles_parent'] == 'problematic'
pCompounds = df_tox21.loc[is_problematic, 'smiles']
print(len(pCompounds))

df_tox21 = df_tox21[~is_problematic]

142


In [9]:
# Add the InChIKey of every parent SMILES. .assign() builds a new frame, so
# the column is not written into the boolean-filtered slice left by the
# previous filtering step (which would raise SettingWithCopyWarning).
df_tox21 = df_tox21.assign(InchiKey=df_tox21['smiles_parent'].apply(inchikey_gen))

In [10]:
# Wall-clock seconds since `start` was recorded, shown as H:MM:SS.ffffff.
elapsed = time.time() - start
print(timedelta(seconds=elapsed))

0:00:57.345032


In [11]:
# Preview the first five rows of the cleaned data set (head() defaults to 5).
df_tox21.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,smiles,smiles_parent,InchiKey
0,,,,,,,,,,0.0,,,C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.Nc1ccc2cc3ccc(N)cc3nc2c1.[Cl-],C[n+]1c2cc(N)ccc2cc2ccc(N)cc21,XSIOKTWDEOJMGG-UHFFFAOYSA-O
1,,,,,,,,,,0.0,,,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(Br)c([O-])c(Br)cc12.[Na+].[Na+],O=C(O)c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(Br)c(O)c(Br)cc12,AZXGXVQWEUFULR-UHFFFAOYSA-N
2,,,,,,,,,,0.0,,,COC1CC(OC2CC(C3OC(C)(O)C(C)CC3C)OC2C2(C)CCC(C3(C)CCC4(CC(O)C(C)C(C(C)C5OC(O)(CC(=O)[O-])C(C)C(OC)C5OC)O4)O3)O2)OC(C)C1OC.[NH4+],COC1CC(OC2CC(C3OC(C)(O)C(C)CC3C)OC2C2(C)CCC(C3(C)CCC4(CC(O)C(C)C(C(C)C5OC(O)(CC(=O)O)C(C)C(OC)C5OC)O4)O3)O2)OC(C)C1OC,RWVUEZAROXKXRT-UHFFFAOYSA-N
3,,,,,,,,,,1.0,,,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)cc1.CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)cc1.O=C(O)C(=O)O.O=C([O-])C(=O)O.O=C([O-])C(=O)O,CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)cc1,VFCNQNZNPKRXIT-UHFFFAOYSA-N
4,0.0,,,,,,,,,,,,CC(=O)O.CCNC(=O)C1CCCN1C(=O)C(CCCNC(=N)N)NC(=O)C(CC(C)C)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1ccc(O)cc1)NC(=O)C(CO)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1c[nH]cn1)NC(=O)C1CCC(=O)N1,CCNC(=O)C1CCCN1C(=O)C(CCCNC(=N)N)NC(=O)C(CC(C)C)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1ccc(O)cc1)NC(=O)C(CO)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1c[nH]cn1)NC(=O)C1CCC(=O)N1,GJKXGJCSJWBJEZ-UHFFFAOYSA-N
