In [50]:
import pandas as pd
import numpy as np
from rdkit import Chem
import molvs as mv
import pubchempy as pcp

In [51]:
import time
from datetime import timedelta
start = time.time()

In [52]:
#Toxcast Data set is available at: https://www.epa.gov/chemical-research/toxicity-forecaster-toxcasttm-data
#  Download ToxCast Summary Information -> INVITRODB_V2_SUMMARY.zip -> oldstyle_ac50_Matrix_151020.csv
toxcast_df = pd.read_csv('/ihome/gidakwo/ml_files/SMILES/ToxCast/oldstyle_ac50_Matrix_151020.csv')

#Here, CAS already converted to SMILES
#toxcast_deepchem = pd.read_csv('http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/toxcast_data.csv.gz')

In [53]:
def pubchem_cid_to_SMILES(cid):
    try:
        cpd = pcp.Compound.from_cid(cid)
        return cpd.isomeric_smiles
    except:
        return 'Failed'

In [54]:
toxcast_df['cid'] = toxcast_df['Unnamed: 0'].map(lambda x: x.lstrip('C'))
toxcast_df['smiles'] = toxcast_df['cid'].apply(lambda x: pubchem_cid_to_SMILES(x))

print(toxcast_df[toxcast_df['smiles'] == 'Failed'].shape[0]) #number of failed smile conversion

839


In [55]:
toxcast_df.head()

Unnamed: 0.1,Unnamed: 0,ACEA_T47D_80hr_Negative,ACEA_T47D_80hr_Positive,APR_HepG2_CellCycleArrest_1h_dn,APR_HepG2_CellCycleArrest_1h_up,APR_HepG2_CellCycleArrest_24h_dn,APR_HepG2_CellCycleArrest_24h_up,APR_HepG2_CellCycleArrest_72h_dn,APR_HepG2_CellCycleArrest_72h_up,APR_HepG2_CellLoss_1h_dn,...,Tanguay_ZF_120hpf_PFIN_up,Tanguay_ZF_120hpf_PIG_up,Tanguay_ZF_120hpf_SNOU_up,Tanguay_ZF_120hpf_SOMI_up,Tanguay_ZF_120hpf_SWIM_up,Tanguay_ZF_120hpf_TRUN_up,Tanguay_ZF_120hpf_TR_up,Tanguay_ZF_120hpf_YSE_up,cid,smiles
0,C100005,1000000.0,1000000.0,,,,,,,,...,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,100005,C1=CC=C(C=C1)CC(CO)NC(=O)C2=CC=CC=C2
1,C1000051,,,,,,,,,,...,,,,,,,,,1000051,C1=CC(=CC(=C1)CN2C=C(C=N2)[N+](=O)[O-])C(=O)NN...
2,C10001135,,,,,,,,,,...,,,,,,,,,10001135,C1CCC(C1)N.N.[OH-].[OH-].[Cl-].[Cl-].[Pt+2]
3,C100016,12.758208,1000000.0,,,1000000.0,1000000.0,1000000.0,1000000.0,,...,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,100016,C1=CN(C(=O)N=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)C...
4,C100027,1000000.0,1000000.0,,,1000000.0,1000000.0,1000000.0,1000000.0,,...,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,100027,CC1(C2CC(C3(C(C24COC(=O)CC4O1)CCC5(C36C(O6)C(=...


In [None]:
#Function to get parent of a smiles
#Modified from: https://www.wildcardconsulting.dk/useful-information/a-deep-tox21-neural-network-with-rdkit-and-keras/
def get_parent_smile(smile):
    try:
        st = mv.Standardizer()
        mol = Chem.MolFromSmiles(smile)
        smts = Chem.MolFromSmarts("[!#1&!#5&!#6&!#7&!#8&!#9&!#14&!#15&!#16&!#17&!#34&!#35&!#53]~*")
        if mol.HasSubstructMatch(smts) == False:
            mols = st.charge_parent(mol)
            return Chem.MolToSmiles(mols)
        else:
            #print(smile)
            return 'problematic'
    except:
        return 'NaN'
    
#Clean and standardize the data
def clean_data(data):
    #remove missing smiles
    data = data[~(data['smiles'].isnull())]
    
    #Standardize and get parent with molvs
    data["smiles_parent"] = data.smiles.apply(get_parent_smile)
    data = data[~(data['smiles_parent'] == "NaN")]
    return data

#Generate InchiKey
def inchikey_gen(smile):
    try:
        m = Chem.MolFromSmiles(smile)
        m_ = Chem.MolToInchi(m)
        m__ = Chem.InchiToInchiKey(m_)
        return m__
    except:
        return 'Failed'

In [None]:
toxcast_df = clean_data(toxcast_df)

In [None]:
pCompounds = toxcast_df[toxcast_df['smiles_parent']=='problematic']['smiles'] #Problematic compounds
print(len(pCompounds))

In [None]:
toxcast_df = toxcast_df[~(toxcast_df['smiles_parent'] == 'problematic')] #Remove problematic compounds
toxcast_df['InchiKey'] = toxcast_df['smiles_parent'].apply(inchikey_gen)

In [None]:
elapsed = (time.time() - start)
print(str(timedelta(seconds=elapsed)))