In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from rdkit import Chem
%matplotlib inline

In [2]:
# Data directory under the user's home. The trailing "/" is kept because
# paths are built from `homedir` by plain string concatenation.
homedir = os.path.expanduser("~/") + "AIChem/chemnet/chemnet/data/"

# df1 <- tox_niehs_all.csv, df2 <- tox_niehs_ext_raw.csv
df1, df2 = (
    pd.read_csv(homedir + csv_name)
    for csv_name in ("tox_niehs_all.csv", "tox_niehs_ext_raw.csv")
)

# Functions for descriptor calculation

In [3]:
from rdkit.Chem import Descriptors

def compute_descriptors(mol, id_string):
    """Build one feature row for a single molecule.

    Every name in the table below is looked up on ``Descriptors`` and
    called with ``mol``; the results are collected in table order.

    Parameters
    ----------
    mol
        Molecule object handed unchanged to each ``Descriptors.<name>``
        function.
    id_string
        Identifier stored in the first slot of the returned list.

    Returns
    -------
    list
        ``[id_string, value_1, ..., value_191]`` with the values in the
        same order as the descriptor names listed in this function.
    """

    def numbered(prefix, count):
        # e.g. numbered("PEOE_VSA", 3) -> ["PEOE_VSA1", "PEOE_VSA2", "PEOE_VSA3"]
        return ["%s%d" % (prefix, k) for k in range(1, count + 1)]

    names = [
        # Property descriptors
        "MolWt", "HeavyAtomMolWt", "MolLogP", "MolMR", "TPSA",
        # Constitutional descriptor
        "FractionCSP3",
        # Atom counts
        # (NumBridgeheadAtoms and NumSpiroAtoms are not included)
        "HeavyAtomCount", "NHOHCount", "NOCount",
        "NumHAcceptors", "NumHDonors", "NumHeteroatoms",
        # Bond
        "NumRotatableBonds",
        # Electronic
        "NumRadicalElectrons", "NumValenceElectrons",
        "MaxPartialCharge", "MinPartialCharge",
        "MaxAbsPartialCharge", "MinAbsPartialCharge",
        # Rings
        # (NumRings, NumCarbocycles and NumHeterocycles are not included)
        "NumAromaticRings", "NumSaturatedRings", "NumAliphaticRings",
        "NumAromaticCarbocycles", "NumSaturatedCarbocycles",
        "NumAliphaticCarbocycles",
        "NumAromaticHeterocycles", "NumSaturatedHeterocycles",
        "NumAliphaticHeterocycles",
        # Functional-group counts
        "fr_Al_COO", "fr_Al_OH", "fr_Al_OH_noTert", "fr_ArN", "fr_Ar_COO",
        "fr_Ar_N", "fr_Ar_NH", "fr_Ar_OH", "fr_COO", "fr_COO2",
        "fr_C_O", "fr_C_O_noCOO", "fr_C_S", "fr_HOCCN", "fr_Imine",
        "fr_NH0", "fr_NH1", "fr_NH2", "fr_N_O",
        "fr_Ndealkylation1", "fr_Ndealkylation2", "fr_Nhpyrrole", "fr_SH",
        "fr_aldehyde", "fr_alkyl_carbamate", "fr_alkyl_halide",
        "fr_allylic_oxid", "fr_amide", "fr_amidine", "fr_aniline",
        "fr_aryl_methyl", "fr_azide", "fr_azo", "fr_barbitur", "fr_benzene",
        "fr_benzodiazepine", "fr_bicyclic", "fr_diazo", "fr_dihydropyridine",
        "fr_epoxide", "fr_ester", "fr_ether", "fr_furan", "fr_guanido",
        "fr_halogen", "fr_hdrzine", "fr_hdrzone", "fr_imidazole", "fr_imide",
        "fr_isocyan", "fr_isothiocyan", "fr_ketone", "fr_ketone_Topliss",
        "fr_lactam", "fr_lactone", "fr_methoxy", "fr_morpholine",
        "fr_nitrile", "fr_nitro", "fr_nitro_arom", "fr_nitro_arom_nonortho",
        "fr_nitroso", "fr_oxazole", "fr_oxime", "fr_para_hydroxylation",
        "fr_phenol", "fr_phenol_noOrthoHbond", "fr_phos_acid",
        "fr_phos_ester", "fr_piperdine", "fr_piperzine", "fr_priamide",
        "fr_prisulfonamd", "fr_pyridine", "fr_quatN", "fr_sulfide",
        "fr_sulfonamd", "fr_sulfone", "fr_term_acetylene", "fr_tetrazole",
        "fr_thiazole", "fr_thiocyan", "fr_thiophene", "fr_unbrch_alkane",
        "fr_urea",
    ]

    # MOE-type descriptors: one surface-area value plus five numbered families
    names.append("LabuteASA")
    names.extend(numbered("PEOE_VSA", 14))
    names.extend(numbered("SMR_VSA", 10))
    names.extend(numbered("SlogP_VSA", 12))
    names.extend(numbered("EState_VSA", 11))
    names.extend(numbered("VSA_EState", 10))

    names.extend([
        # Topological descriptors
        "BalabanJ", "BertzCT", "HallKierAlpha", "Ipc",
        "Kappa1", "Kappa2", "Kappa3",
        # Connectivity descriptors
        "Chi0", "Chi1",
        "Chi0n", "Chi1n", "Chi2n", "Chi3n", "Chi4n",
        "Chi0v", "Chi1v", "Chi2v", "Chi3v", "Chi4v",
        # Other properties
        "qed",
    ])

    # The id goes first, followed by one value per descriptor name.
    return [id_string] + [getattr(Descriptors, name)(mol) for name in names]

# Compute descriptors on all data first

In [4]:
# Stack both datasets into a single frame (we need this for input normalization)

print(df1.shape)
print(df2.shape)
# VERY IMPORTANT: ignore_index=True gives the combined frame a fresh 0..n-1
# index. Without it the row labels of df1 and df2 would repeat, and any
# label-based row lookup on the combined frame would misbehave.
df = pd.concat([df1, df2], ignore_index=True)
print(df.shape)
df.tail(5)

(8380, 17)
(2895, 19)
(11275, 21)


Unnamed: 0,CASRN,Canonical_QSARr,Chemical_Name,DTXSID,Extraneous_SMILES,InChI Key_QSARr,InChI_Code_QSARr,InChI_Key_QSARr,Name,RowID,...,Salt_Solvent,Structure_Source,epa,ghs,id,ld50,logld50,nontoxic,smiles,verytoxic
11270,71-43-2,C1C=CC=CC=1,,DTXSID3039242,,UHOVQNZJYSORNB-UHFFFAOYSA-N,InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H,,Benzene,3891.0,...,?,EPA_DSSTox,2.0,4.0,testid2891,3323.0,8.108623,1.0,c1ccccc1,0.0
11271,62-73-7,COP(=O)(OC=C(Cl)Cl)OC,,DTXSID5020449,,OEBRKCOSUFCWJD-UHFFFAOYSA-N,"InChI=1S/C4H7Cl2O4P/c1-8-11(7,9-2)10-3-4(5)6/h...",,Dichlorvos,3892.0,...,?,EPA_DSSTox,0.0,1.0,testid2892,30.0,3.401197,0.0,COP(=O)(OC)OC=C(Cl)Cl,1.0
11272,333-41-5,CC1=CC(=NC(=N1)C(C)C)OP(=S)(OCC)OCC,,DTXSID9020407,,FHIVAFMUCKRCQO-UHFFFAOYSA-N,"InChI=1S/C12H21N2O3PS/c1-6-15-18(19,16-7-2)17-...",,Diazinon,3893.0,...,?,EPA_DSSTox,1.0,2.0,testid2893,92.0,4.521789,0.0,CCOP(=S)(OCC)Oc1cc(C)nc(C(C)C)n1,0.0
11273,13674-84-5,CC(CCl)OP(=O)(OC(C)CCl)OC(C)CCl,,DTXSID5026259,,KVMPUXDNESXNOH-UHFFFAOYSA-N,"InChI=1S/C9H18Cl3O4P/c1-7(4-10)14-17(13,15-8(2...",,Tris(2-chloroisopropyl)phosphate,3894.0,...,?,EPA_DSSTox,2.0,3.0,testid2894,980.0,6.887553,0.0,CC(CCl)OP(=O)(OC(C)CCl)OC(C)CCl,0.0
11274,56-38-2,CCOP(=S)(OC1C=CC(=CC=1)[N+]([O-])=O)OCC,,DTXSID7021100,,LCCNCVORNKJIRZ-UHFFFAOYSA-N,"InChI=1S/C10H14NO5PS/c1-3-14-17(18,15-4-2)16-1...",,Parathion,3895.0,...,?,EPA_DSSTox,0.0,0.0,testid2895,2.1,0.741937,0.0,CCOP(=S)(OCC)Oc1ccc([N+](=O)[O-])cc1,1.0


In [5]:
# Compute descriptors for every sample.
# Rows are visited by position (zip over the two columns) instead of being
# looked up by index label, so the result does not depend on df having
# unique index labels.

newdf = []

for id_string, smiles_string in zip(df['id'], df['smiles']):

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # RDKit returns None for a SMILES it cannot parse. Stop here and name
        # the offending record rather than failing inside a descriptor call.
        # The sample is not skipped, so ids and descriptor rows stay aligned.
        raise ValueError(
            "Could not parse SMILES %r for id %r" % (smiles_string, id_string)
        )

    # Append results: one [id, descriptor values...] row per sample
    newdf.append(compute_descriptors(mol, id_string))

In [6]:
# Turn the list of per-sample rows into one 2-D numpy array.
# NOTE(review): each row mixes the id string with numeric values, so the array
# presumably ends up string-typed -- confirm before relying on its dtype.
all_new = np.array(newdf)

In [7]:
# Column 0 carries the sample ids (kept 2-D); everything after it is the
# descriptor block, cast to float.
all_name = all_new[:, :1]
all_desc = all_new[:, 1:].astype(float)
all_desc.shape

(11275, 191)

In [8]:
# Is removing missing rows (samples) or columns (descriptors) better?

In [9]:
# Shape that would remain if every sample (row) containing a NaN were dropped
all_desc[~np.any(np.isnan(all_desc), axis=1)].shape

(11264, 191)

In [10]:
# Shape that would remain if every descriptor (column) containing a NaN were dropped
all_desc[:, ~np.isnan(all_desc).any(axis=0)].shape

(11275, 187)

In [11]:
# Keep only the descriptors (columns) that have no NaN in any sample
all_desc = all_desc[:, ~np.isnan(all_desc).any(axis=0)]

In [12]:
# Min-max rescale every descriptor column to the range [0, 1]
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
scaler = MinMaxScaler(feature_range=(0, 1)).fit(all_desc)
all_desc_minmax = scaler.transform(all_desc)

In [13]:
# Standardize descriptors with StandardScaler (default settings)
all_desc_std = StandardScaler().fit(all_desc).transform(all_desc)

  x = um.multiply(x, x, out=x)


In [14]:
# Robust-scale descriptors using the 25th-75th percentile range
scaler = RobustScaler(quantile_range=(25, 75)).fit(all_desc)
all_desc_robust = scaler.transform(all_desc)

In [15]:
# Column labels for the merged frame: 'id' first, then 0 .. n_descriptors-1
namelist = ['id'] + list(range(all_desc.shape[1]))

In [16]:
# Put the id column in front of the min-max scaled descriptors, then wrap the
# combined array in a DataFrame labelled with `namelist`.
all_combined = np.hstack((all_name, all_desc_minmax))
final_df = pd.DataFrame(all_combined, columns=namelist)

In [17]:
# Write the min-max scaled table into the data directory (no index column),
# then preview the first five rows.
final_df.to_csv(os.path.join(homedir, "tox_niehs_desc_minmax.csv"), index=False)
final_df.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,177,178,179,180,181,182,183,184,185,186
0,molid12,0.0198342247222056,0.0193841704879521,0.4094591967444551,0.0198356886266641,0.001198043196113,1.0,0.0167064439140811,0.0,0.0064102564102564,...,0.0237686932236384,0.01741181644934,0.020799987456729,0.0165015052279534,0.0268768568171896,0.0267727371966454,0.0205837405700672,0.0225306664894868,0.0214523782301508,0.5558341738761953
1,molid52,0.0129481772729753,0.0134622767563591,0.3874553991891376,0.013693590508226,0.0143876113459127,0.0,0.0143198090692124,0.0202020202020202,0.0128205128205128,...,0.0155596575120842,0.0129944072554591,0.0113397850515272,0.009095805676733,0.0160768200206668,0.0149698569234307,0.0125886308412582,0.0108653050666969,0.0086784605622745,0.538065134310418
2,molid63,0.0094690822755052,0.0088570870768125,0.3893994902486087,0.0107597926018575,0.0096213222107594,1.0,0.0095465393794749,0.0202020202020202,0.0064102564102564,...,0.0153846196042669,0.014059749734774,0.0103416226634848,0.0,0.0161935704106108,0.0148014539599229,0.0136207058661479,0.0099089078507971,0.0,0.5102648118068442
3,molid70,0.0157433277681302,0.0168022694250132,0.4194646879490265,0.0118641566215706,0.0,0.3333333333333333,0.0095465393794749,0.0,0.0,...,0.0093176222162934,0.0052360698480104,0.0028614152776628,0.0008426910236399,0.0173164708192133,0.0160578757680562,0.0105834681978174,0.0082250632899237,0.0072362305960369,0.475104095315631
4,molid256,0.0122783295899368,0.0127473731851511,0.357928602646794,0.0078790759402591,0.021272662059377,0.6666666666666666,0.0119331742243436,0.0202020202020202,0.0192307692307692,...,0.0117846916058263,0.0102917528933235,0.0057395632224013,0.0,0.0141733339997495,0.0113379839555571,0.0099703722791258,0.0054994080644047,0.0,0.475551198304915
