## EnviroTox

### Set up

Upload the EnviroTox data (Excel export).

Upload the ECOSAR category dictionary.

Upload the descriptor XML file.

In [None]:
# Input files read by the notebook
file_ecotox = 'envirotox_20220725142944.xlsx'
file_dict = 'DictionaryEcosarCategories.txt'
xml_file = 'descriptors.xml'

# Output files written by the filtering cells
dest_env_all = 'envirotoxall.csv'
dest_env_single = 'envirotoxRT.csv'

In [None]:
# Install PubChemPy (imported as `pcp` in the import cell) via a shell escape.
!pip install pubchempy

In [None]:
# Install padelpy (source of `padeldescriptor` in the import cell) via a shell escape.
!pip install padelpy

In [None]:
import pandas as pd
import json
import numpy as np
import pubchempy as pcp
from padelpy import padeldescriptor
from scipy.stats import gmean
from sklearn.model_selection import train_test_split

In [None]:
# Parse the workbook once and take both sheets from the returned dict,
# instead of re-reading the whole file for each sheet.
sheets = pd.read_excel(file_ecotox, sheet_name=[0, 1])
df_test = sheets[0]  # experiments
df_chem = sheets[1]  # chemicals

In [None]:
# Load the JSON dictionary used to translate ECOSAR categories.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale (JSON text is UTF-8).
with open(file_dict, encoding='utf-8') as f:
    dict_text = f.read()
dict_collapsed = json.loads(dict_text)

In [None]:
# Remove every row whose CAS entry contains "Metalgrp" from both tables
df_test = df_test.loc[lambda frame: ~frame['CAS'].astype(str).str.contains("Metalgrp")]
df_chem = df_chem.loc[lambda frame: ~frame['CAS'].astype(str).str.contains("Metalgrp")]

In [None]:
# function removing the compounds with ecosar class inorganic compound and translate Ecosar category to collapsed ecosar category
def split_and_translate_and_filter(class_eco):
  try:
    # split ecosar classes if multiple categories given and return 'Inorganic Compound' if one of the categories
    class_eco=[x.strip() for x in class_eco.replace('|', '; ').split(';')]
    if 'Inorganic Compound' in class_eco:
      return 'Inorganic Compound'
    else:
      # use first entry (follwoing: EnviroTox database guide) 
      # disregard case sensitivity
      class_eco=class_eco[0].upper()
      # translate to collapsed category using dictionary
      collapsed_class=dict_collapsed.get(class_eco)
      return collapsed_class
  except:
    # if already NaN
    return np.NaN

In [None]:
# Map each ECOSAR classification to its collapsed category and store it in a
# new column. `assign` builds a new frame, so nothing is written onto the
# boolean-filtered slice created above (avoids SettingWithCopyWarning).
df_chem = df_chem.assign(**{
    'ECOSAR classification – collapsed':
        df_chem['ECOSAR classification'].map(split_and_translate_and_filter),
})

In [None]:
# Attach the chemical information to every experiment row (left join on CAS)
df_combined = pd.merge(df_test, df_chem, on='CAS', how='left')

In [None]:
# Keep rows that are neither heavy metals nor classified as inorganic
df_combined = df_combined.loc[
    lambda frame: (frame['Heavy Metals'] == 0.0)
    & (frame['ECOSAR classification – collapsed'] != 'Inorganic Compound')
]

In [None]:
# Restrict to experiments with a duration of exactly 96 hours
df_combined = df_combined.loc[df_combined['Duration (hours)'].eq(96.0)]

In [None]:
# Keep only the columns that the later cells use
df_combined = df_combined.loc[:, [
    'CAS',
    'Chemical name_x',
    'Latin name',
    'Effect value',
    'Test statistic',
    'Duration (hours)',
    'Effect is 5X above water solubility',
    'Canonical SMILES',
]]

In [None]:
# Give the name and effect columns their final labels
df_combined = df_combined.rename(
    columns={
        "Chemical name_x": "Chemical_name",
        "Effect value": "Effect_value_(mgL-1)",
    }
)

In [None]:
# Write the filtered table for all species (row index included)
df_combined.to_csv(dest_env_all, index=True)

In [None]:
# Single-species subset: rows whose Latin name is Oncorhynchus mykiss
df_single = df_combined.loc[df_combined['Latin name'].eq('Oncorhynchus mykiss')]

In [None]:
# Write the single-species table (row index included)
df_single.to_csv(dest_env_single, index=True)

### PubChem

In [None]:
def pubchem_get_compounds(df):
  """Look up every unique SMILES of ``df`` with ``pcp.get_compounds``.

  Args:
    df: DataFrame with a 'Canonical SMILES' column.

  Returns:
    A tuple ``(dicts_pc, error_cas)``. ``dicts_pc`` maps each SMILES that
    resolved to exactly one compound to ``[cid, iupac_name,
    canonical_smiles]``. ``error_cas`` lists the SMILES strings (not CAS
    numbers, despite the name) that resolved to zero or several compounds,
    or whose lookup raised an exception.
  """
  dicts_pc = {}
  error_cas = []
  for smiles in df['Canonical SMILES'].unique():
    try:
      compounds = pcp.get_compounds(smiles, 'smiles')
      if len(compounds) != 1:
        error_cas.append(smiles)
        continue
      compound = compounds[0]
      record = [compound.cid, compound.iupac_name, compound.canonical_smiles]
    except Exception:
      # Best effort: a failed lookup is recorded and the loop carries on.
      # Unlike a bare ``except:``, this still lets KeyboardInterrupt stop
      # a long-running batch of requests.
      error_cas.append(smiles)
      continue
    dicts_pc[smiles] = record
  return dicts_pc, error_cas

In [None]:
# Query PubChem for the SMILES of the all-species and single-species tables
dict_combined, error_cas_combined = pubchem_get_compounds(df=df_combined)
dict_single, error_cas_single = pubchem_get_compounds(df=df_single)

In [None]:
# One row per resolved SMILES (the SMILES becomes the index)
df_combined_pub = pd.DataFrame.from_dict(
    dict_combined,
    orient='index',
    columns=['cid', 'iupac_name', 'canonical_smiles'],
)

In [None]:
# One row per resolved SMILES (the SMILES becomes the index)
df_single_pub = pd.DataFrame.from_dict(
    dict_single,
    orient='index',
    columns=['cid', 'iupac_name', 'canonical_smiles'],
)

In [None]:
def pubchem_get_sdf(df,dest_name):
  """Download the 3D SDF record of every unique CID in ``df`` from PubChem.

  Uses the standard library instead of ``!mkdir`` / ``!wget`` shell escapes,
  so the function also runs outside IPython, can be re-run when the
  directory already exists, and never passes the URL through a shell.

  Args:
    df: DataFrame with a ``cid`` column. Missing CIDs are skipped.
    dest_name: Directory that receives one ``<cid>.sdf`` file per compound;
      created if it does not exist.

  Returns:
    None. A failed download is reported with ``print`` and skipped; unlike
    ``wget -O``, it does not leave an empty file behind.
  """
  # Function-scope imports keep this cell independent of the import cell.
  import os
  import urllib.request

  os.makedirs(dest_name, exist_ok=True)
  for cid in df.cid.unique():
    if pd.isna(cid):
      continue
    # The cid may be stored as a float; keep only the integer part.
    c = str(cid).split('.')[0]
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + c + '/SDF?record_type=3d'
    dest = os.path.join(dest_name, c + '.sdf')
    try:
      with urllib.request.urlopen(url, timeout=60) as response:
        sdf_bytes = response.read()
    except OSError as err:
      # URLError, HTTPError and timeouts are all OSError subclasses.
      print(f'Skipping CID {c}: download failed ({err})')
      continue
    with open(dest, 'wb') as sdf_file:
      sdf_file.write(sdf_bytes)
  



In [None]:
# Fetch the SDF files for all compounds of the all-species table
pubchem_get_sdf(df=df_combined_pub, dest_name='combined_envirotox_sdf')

In [None]:
# Bundle the downloaded SDF directory into a single zip archive.
# NOTE(review): the absolute /content paths presumably assume the notebook runs
# with /content as working directory (e.g. Google Colab) — confirm.
!zip -r /content/combined_envirotox_sdf_dir.zip /content/combined_envirotox_sdf

In [None]:
# Fetch the SDF files for all compounds of the single-species table
pubchem_get_sdf(df=df_single_pub, dest_name='single_envirotox_sdf')

In [None]:
# Bundle the downloaded SDF directory into a single zip archive.
# NOTE(review): the absolute /content paths presumably assume the notebook runs
# with /content as working directory (e.g. Google Colab) — confirm.
!zip -r /content/single_envirotox_sdf_dir.zip /content/single_envirotox_sdf

In [None]:
# Run padeldescriptor with identical settings on both SDF directories:
# first the all-species compounds, then the single-species compounds.
for sdf_dir, descriptor_csv in (
    ('combined_envirotox_sdf', 'envirotox_combined_input.csv'),
    ('single_envirotox_sdf', 'envirotox_single_input.csv'),
):
    padeldescriptor(
        mol_dir=sdf_dir,
        d_file=descriptor_csv,
        d_2d=True,
        d_3d=True,
        fingerprints=True,
        descriptortypes=xml_file,
        retainorder=True,
        log=True,
        maxruntime=100000,
    )

In [None]:
# Expose the index (the looked-up SMILES) as a regular column named 'cs'
df_combined_pub = df_combined_pub.assign(cs=df_combined_pub.index)

In [None]:
# Join the PubChem identifiers onto the experiments via the SMILES and keep
# only the metadata columns
df_combined_meta = df_combined_pub.merge(
    df_combined,
    left_on='cs',
    right_on='Canonical SMILES',
)[[
    'cid',
    'iupac_name',
    'cs',
    'CAS',
    'Chemical_name',
    'Latin name',
    'Effect_value_(mgL-1)',
    'Test statistic',
    'Duration (hours)',
    'Effect is 5X above water solubility',
]]

In [None]:
# Expose the index (the looked-up SMILES) as a regular column named 'cs'
df_single_pub = df_single_pub.assign(cs=df_single_pub.index)

In [None]:
# Join the PubChem identifiers onto the experiments via the SMILES and keep
# only the metadata columns
df_single_meta = df_single_pub.merge(
    df_single,
    left_on='cs',
    right_on='Canonical SMILES',
)[[
    'cid',
    'iupac_name',
    'cs',
    'CAS',
    'Chemical_name',
    'Latin name',
    'Effect_value_(mgL-1)',
    'Test statistic',
    'Duration (hours)',
    'Effect is 5X above water solubility',
]]

In [None]:
# Persist both metadata tables
for meta_frame, meta_path in (
    (df_combined_meta, 'df_combined_meta_envtox.csv'),
    (df_single_meta, 'df_single_meta_envtox.csv'),
):
    meta_frame.to_csv(meta_path)

In [None]:
# Reload the metadata tables; the first CSV column holds the saved row index
df_combined_meta = pd.read_csv(
    'df_combined_meta_envtox.csv',
    index_col=0,
)
df_single_meta = pd.read_csv(
    'df_single_meta_envtox.csv',
    index_col=0,
)

### Target
Classes

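Multiple measurements for the same compound (and species) are aggregated:

- Classification: majority vote over the per-measurement classes

- Regression: geometric mean of the effect values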

In [None]:
def majority_vote_class(df, multiple_species):
  """Pick the most frequent toxicity class within each group.

  Args:
    df: DataFrame with 'cs', 'cid', 'Classification' and, when
      ``multiple_species`` is true, 'Latin name' columns.
    multiple_species: If true, group by species as well as by compound.

  Returns:
    A DataFrame indexed by the grouping keys with a single
    'Classification' column.
  """
  # Ordered from most to least toxic: when several classes are equally
  # frequent, the more toxic one wins.
  toxicity_rank = ['Very highly toxic', 'Highly toxic', 'Moderately toxic', 'Slightly toxic','Nontoxic' ]

  def most_toxic_mode(labels):
    modes = pd.Series.mode(labels)
    return sorted(modes, key=toxicity_rank.index)[0]

  group_keys = ['cs','cid','Latin name'] if multiple_species else ['cs','cid']
  voted = df.groupby(group_keys)['Classification'].agg(most_toxic_mode)
  return voted.to_frame()


In [None]:
def geometric_mean_target(df, multiple_species):
  """Return the geometric mean effect value of each group.

  Args:
    df: DataFrame with 'cs', 'cid', 'Effect_value_(mgL-1)' and, when
      ``multiple_species`` is true, 'Latin name' columns.
    multiple_species: If true, group by species as well as by compound.

  Returns:
    A Series of geometric means indexed by the grouping keys.
  """
  group_keys = ['cs','cid']
  if multiple_species:
    group_keys.append('Latin name')
  # Original author's note: all values share the same unit (mg/L).
  effect_values = df.groupby(group_keys)['Effect_value_(mgL-1)']
  return effect_values.apply(gmean)

In [None]:
def set_class_targets(df, multiple_species=False):
    """Derive classification and regression targets from the effect values.

    Each measurement gets one of five toxicity classes from the thresholds
    0.1, 1, 10 and 100 applied to 'Effect_value_(mgL-1)', following
    https://www.epa.gov/pesticide-science-and-assessing-pesticide-risks/technical-overview-ecological-risk-assessment-0
    The measurements are then aggregated per group: the class by majority
    vote and the effect value by geometric mean.

    Side effect: a 'Classification' column is added to (or overwritten on)
    the DataFrame passed in.

    Args:
        df: DataFrame with 'cs', 'cid', 'Effect_value_(mgL-1)' and, when
            ``multiple_species`` is true, 'Latin name' columns.
        multiple_species: If true, aggregate per compound and species
            instead of per compound only.

    Returns:
        A tuple ``(df_class, df_reg)``. ``df_class`` is indexed by the
        grouping keys and holds 'Classification', 'classification_binary'
        (0 for 'Nontoxic' / 'Slightly toxic', otherwise 1) and
        'classification_ternary' (0.0, 1.0 or 2.0). ``df_reg`` is the result
        of ``geometric_mean_target``.
    """
    effect = df['Effect_value_(mgL-1)']
    conditions = [
        effect < 0.1,
        (effect >= 0.1) & (effect <= 1),
        (effect > 1) & (effect <= 10),
        (effect > 10) & (effect <= 100),
        effect > 100,
    ]
    values = ['Very highly toxic', 'Highly toxic', 'Moderately toxic', 'Slightly toxic','Nontoxic' ]

    df_reg = geometric_mean_target(df, multiple_species)

    # default=None keeps rows that match no condition (missing effect value)
    # unclassified. The implicit integer default 0 cannot be combined with
    # string choices on recent NumPy releases.
    df['Classification'] = np.select(conditions, values, default=None)

    # Unclassified rows take no part in the vote.
    df_class = majority_vote_class(df.dropna(subset=['Classification']), multiple_species)

    #binary classification
    low_toxicity = df_class['Classification'].isin(['Nontoxic', 'Slightly toxic'])
    df_class['classification_binary'] = np.where(low_toxicity, 0, 1)

    #ternary classification
    # A single map replaces the chained ``df[col].loc[mask] = value``
    # assignments, which do not modify the frame under pandas Copy-on-Write.
    # The float dtype of the original column is preserved.
    ternary_codes = {
        'Nontoxic': 0,
        'Slightly toxic': 0,
        'Moderately toxic': 1,
        'Highly toxic': 2,
        'Very highly toxic': 2,
    }
    df_class['classification_ternary'] = df_class['Classification'].map(ternary_codes).astype(float)

    return df_class, df_reg



In [None]:
# Targets per compound and species, joined on the shared group keys
df_combined_class, df_combined_reg = set_class_targets(df_combined_meta, multiple_species=True)
df_combined_target = df_combined_class.merge(df_combined_reg, on=['cs', 'cid', 'Latin name'])

In [None]:
# Targets per compound for the single-species table, joined on the group keys
df_single_class, df_single_reg = set_class_targets(df_single_meta, multiple_species=False)
df_single_target = df_single_class.merge(df_single_reg, on=['cs', 'cid'])

### Combine input and target

In [None]:
# Descriptor table written by the padeldescriptor run on the combined SDF files
df_combined_input = pd.read_csv(filepath_or_buffer='envirotox_combined_input.csv')

In [None]:
# Descriptor table written by the padeldescriptor run on the single-species SDF files
df_single_input = pd.read_csv(filepath_or_buffer='envirotox_single_input.csv')

In [None]:
# Left join: every target row is kept, with descriptors matched via cid == Name
df_combined_all = df_combined_target.reset_index().merge(
    df_combined_input,
    how='left',
    left_on='cid',
    right_on='Name',
)

In [None]:
# Inner join (pandas default): only targets with a matching descriptor row are kept
df_single_all = df_single_target.reset_index().merge(
    df_single_input,
    left_on='cid',
    right_on='Name',
)

In [None]:
# Write the single-species inputs and targets (row index included)
df_single_all.to_csv('df_single_envtox_all.csv', index=True)

### Add identifier of target species

In [None]:
# Flag rows of the target species (Oncorhynchus mykiss) with 1, all others with 0
df_combined_all['target_species'] = (
    df_combined_all['Latin name'] == 'Oncorhynchus mykiss'
).astype('int64')

In [None]:
# Write the all-species inputs and targets (row index included)
df_combined_all.to_csv('df_combined_envtox_all.csv', index=True)

### Train test split

20-80 split (20% test, 80% train) to allow comparison with Li and Jimeng Wu, etc.
The test set of the combined data consists of the target-species rows of the same compounds that are in the single-species test set.

In [None]:
# Reproducible split of the single-species data: 20% of the rows go to the test set
df_single_train, df_single_test = train_test_split(
    df_single_all,
    random_state=42,
    test_size=0.2,
)

In [None]:
# SMILES ('cs') of the compounds held out in the single-species test set
test_compounds_single = df_single_test['cs'].values

In [None]:
# Training rows: everything that is not a target-species row of a held-out compound
df_combined_train = df_combined_all[
    (df_combined_all['target_species'] != 1)
    | ~df_combined_all['cs'].isin(test_compounds_single)
]

In [None]:
# Test rows: target-species rows of the held-out compounds
df_combined_test = df_combined_all.loc[
    lambda frame: frame['target_species'].eq(1)
    & frame['cs'].isin(test_compounds_single)
]

In [None]:
# Write all four splits to disk
for split_frame, split_path in (
    (df_combined_train, 'envtox_combined_train.csv'),
    (df_combined_test, 'envtox_combined_test.csv'),
    (df_single_train, 'envtox_single_train.csv'),
    (df_single_test, 'envtox_single_test.csv'),
):
    split_frame.to_csv(split_path)