In [19]:
import os
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
from rdkit import Chem
import molvs as mv

In [70]:
#INITIAL TRAINING DATASET
combinedDF = pd.DataFrame(columns=['SMILES','ID'])

for filename in os.listdir(os.getcwd()):
    if filename.endswith('.smiles'):
        df = pd.read_csv(filename, header=None, sep='\t')
        df.columns=['SMILES','ID',filename[:-7]]
        combinedDF = combinedDF.merge(df, on=['SMILES', 'ID'], how='outer')

In [71]:
combinedDF.shape

(11764, 14)

In [72]:
combinedDF.head()

Unnamed: 0,SMILES,ID,nr-ar-lbd,sr-are,nr-er,nr-ar,nr-er-lbd,sr-mmp,sr-hse,nr-ppar-gamma,sr-p53,sr-atad5,nr-ahr,nr-aromatase
0,Cl.CC(NCCC(C1=CC=CC=C1)C2=CC=CC=C2)C3=CC=CC=C3,NCGC00094891-01,0.0,,,0.0,,1.0,,,,,,
1,OC1=C([Hg]Cl)C=CC=C1,NCGC00181158-01,1.0,,,0.0,1.0,,,,,,,
2,FC(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F.CCCCCCCC[N+](C)(CCCCCCCC)CCCCCCCC,NCGC00258157-01,0.0,0.0,,0.0,,,,,,,,
3,COC1=CC=C(CCN2CCC(CC2)NC3=NC4=CC=CC=C4N3CC5=CC=C(F)C=C5)C=C1,NCGC00016913-08,0.0,,,0.0,0.0,,,,,,,
4,Cl.CCCOC1=C(Br)C(C)=C(S1)C(=O)N2CCC(CC2)C3=CC(CN)=CC=C3F,NCGC00254244-01,1.0,,,0.0,,,,,,0.0,,


In [28]:
#df_lead = pd.read_csv('.../.../tox21_10k_challenge_test.smiles', sep='\t')
#df_lead.head()

In [61]:
#THE FINAL TEST SET
df_score_smiles = pd.read_csv('/.../.../tox21_10k_challenge_score.smiles', sep='\t')
df_score_results = pd.read_csv('/.../.../Tox21_Restart/tox21_10k_challenge_score.txt', sep='\t')

df_test = pd.concat([df_score_smiles, df_score_results]).groupby('Sample ID', as_index=False, sort=False).first().fillna('NA')

df_test = df_test.drop('Sample ID', axis=1).replace('x', np.nan)
df_test.rename(columns={'#SMILES': 'smiles'}, inplace=True)
df_test.shape

(647, 13)

In [25]:
df_test.head()

Unnamed: 0,smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,OC(=O)\C=C/C(O)=O.C[C@]12CC=C3[C@@H](CCC4=CC(=O)C=C[C@]34C)[C@@H]1CC[C@@H]2C(=O)CN1CCN(CC1)C1=NC(=NC(=C1)N1CCCC1)N1CCCC1,1,,0.0,0.0,0,0.0,0.0,,0,0.0,,0.0
1,[Na+].NC1=NC=NC2=C1N=C(Br)N2C1OC2CO[P@]([O-])(=O)O[C@@H]2C1O,1,,0.0,,0,0.0,0.0,0.0,0,0.0,0.0,0.0
2,O=C1N2CCC3=C(NC4=C3C=CC=C4)C2=NC2=C1C=CC=C2,1,0.0,1.0,,1,0.0,0.0,1.0,1,0.0,1.0,0.0
3,Cl.FC1=CC=C(C=C1)C(OCCCC1=CNC=N1)C1=CC=C(F)C=C1,0,,,1.0,0,,,1.0,0,,0.0,
4,CC1=CC=C(C=C1)S(=O)(=O)N[C@@H](CC1=CC=CC=C1)C(=O)CCl,0,0.0,0.0,,0,0.0,0.0,0.0,0,0.0,,1.0


In [65]:
#LEADERBOARD DATA SET
df_leaderbd = PandasTools.LoadSDF('/.../.../tox21_10k_challenge_test.sdf')

#Drop duplicates by ID
#df_train_lead = pd.concat([df_train, df_leaderbd])
#df_train_lead = df_train_lead.drop_duplicates(['ID'], keep = 'first')
#tox21_df.info()

#Select only useful columns: assays and the ligands(ROMol) total = 13
keep_cols = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
            'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53', 'ROMol']
df_leaderbd = df_leaderbd[keep_cols]

#Convert Mol to smiles
df_leaderbd['smiles'] = df_leaderbd['ROMol'].apply(Chem.MolToSmiles)
df_leaderbd = df_leaderbd.drop('ROMol', axis=1)

cols = df_leaderbd.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_leaderbd = df_leaderbd[cols]

In [22]:
df_leaderbd.head()

Unnamed: 0,smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,CNc1ncnc2c1ncn2C1OC(CO)C(O)C1O,0,0.0,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,Oc1cc(O)cc(C=Cc2ccc(O)c(O)c2)c1,0,0.0,,,1.0,,0.0,,1.0,,,1
2,COc1ccc2c(c[n+](C)c3c4cc5c(cc4ccc23)OCO5)c1OC.[Cl-],0,,1.0,,0.0,,,1.0,,1.0,,0
3,Br.Cc1onc(O)c1CC(N)C(=O)O,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,Nc1nc(N)c(N=O)c(OCC2CCCCC2)n1,1,0.0,0.0,0.0,,0.0,0.0,1.0,1.0,0.0,,0


In [27]:
df_leaderbd.shape

(295, 13)

In [29]:
#STANDARDIZATION STEP
#Function to get parent of a smiles
def get_parent_smile(smile):
    try:
        st = mv.Standardizer()
        mol = Chem.MolFromSmiles(smile)
        smts = Chem.MolFromSmarts("[!#1&!#5&!#6&!#7&!#8&!#9&!#14&!#15&!#16&!#17&!#34&!#35&!#53]~*")
        if mol.HasSubstructMatch(smts) == False:
            mols = st.charge_parent(mol)
            return Chem.MolToSmiles(mols)
        else:
            #print(smile)
            return 'problematic'
    except:
        return 'NaN'
    
#Clean and standardize the data
def clean_data(data):
    #remove missing smiles
    data = data[~(data['smiles'].isnull())]
    
    #Standardize and get parent with molvs
    data["smiles_parent"] = data.smiles.apply(get_parent_smile)
    data = data[~(data['smiles_parent'] == "NaN")]
    
    #Identifiy compounds that fail SMILE conversion
    pCompounds = data[data['smiles_parent']=='problematic']['smiles']
    data = data[~(data['smiles_parent'] == 'problematic')]
    print(len(pCompounds))
    
    return data

#Generate Unique InchiKey for Identification purpose
def inchikey_gen(smile):
    try:
        m = Chem.MolFromSmiles(smile)
        m_ = Chem.MolToInchi(m)
        m__ = Chem.InchiToInchiKey(m_)
        return m__
    except:
        return 'Failed'

def remove_pc(data):
    #Remove compunds that failed SMILE Conversion
    pCompounds = data[data['smiles_parent']=='problematic']['smiles']
    data = data[~(data['smiles_parent'] == 'problematic')]
    print(len(pCompounds))
    return data

In [73]:
#df_train = combinedDF.rename(columns={'SMILES': 'smiles'})
df_train.columns = df_train.columns.str.lower()
df_train = clean_data(df_train)

0


In [74]:
df_train.shape

(11628, 15)

In [66]:
df_leaderbd = clean_data(df_leaderbd)

3


In [62]:
df_test = clean_data(df_test)

14


In [63]:
#Drop duplicates by majority rule
#If a compound is duplicated, record the most occuring activity, if the activities for the dupplicates occur same number
#of times e.g. 4 duplicates with 2 active and 2 inactives, such compound (all its duplicates) should be considered 
#ambiguous and removed. If no duplicate, leave the compound.

def moder(x):
    m = pd.Series.mode(x)
    if len(m) == 1: 
        return m


#assays = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']
assays = ['nr-ar', 'nr-ar-lbd', 'nr-ahr', 'nr-aromatase', 'nr-er', 'nr-er-lbd', 'nr-ppar-gamma', 'sr-are', 'sr-atad5', 'sr-hse', 'sr-mmp', 'sr-p53']
def dedup_majority_rule(data):
    dataDF = pd.DataFrame(columns=['smiles_parent'])
    for i in assays:
        a = data[['smiles_parent', i]]
        res = a.groupby('smiles_parent')[i].apply(moder).reset_index(level=1, drop=True).reset_index()
        dataDF = dataDF.merge(res, on=['smiles_parent'], how='outer')
    return dataDF

    #res.head()
#combinedDF

In [None]:
#testDF = df_test.rename(columns = {'smiles_parent': 's_p'})
df_test.columns = df_test.columns.str.lower()
testDF = dedup_majority_rule(df_test)
testDF.shape

In [None]:
df_leaderbd.columns = df_leaderbd.columns.str.lower()
leaderbdDF = dedup_majority_rule(df_leaderbd)
leaderbdDF.shape

In [75]:
trainDF = dedup_majority_rule(df_train)
trainDF.shape

(7396, 13)

In [None]:
#Concat trainDF and leaderbdDF the training data
#Retain testDF as test data set

#trainDF.to_csv('trainDF.csv')