In [None]:
# Environment setup: install Miniconda (Python 3.7 build) into /usr/local and
# then RDKit from conda-forge. The steps below must run in this order.

# Fetch the Miniconda installer script (wget's -c resumes a partial download).
!wget -c https://repo.continuum.io/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh
# Mark the installer as executable.
!chmod +x Miniconda3-py37_4.8.3-Linux-x86_64.sh
# Run the installer with prefix /usr/local (-p); -b and -f are passed so it runs
# without prompting even though the prefix already exists. `time` reports the duration.
!time bash ./Miniconda3-py37_4.8.3-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit from the conda-forge channel, quietly (-q) and without confirmation (-y).
!time conda install -q -y -c conda-forge rdkit

import sys
# Add the Python 3.7 site-packages directory under /usr/local to the import path
# so this kernel can import packages installed there (presumably rdkit from the
# step above -- assumes the kernel itself runs Python 3.7; TODO confirm).
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from sklearn.ensemble import RandomForestRegressor

# Homework: Random Forest Classification

## Tox21: Toxicity in the 21st Century Challenge

Wu, Z., Ramsundar, B., Feinberg, E.N., Gomes, J., Geniesse, C., Pappu, A.S., Leswing, K. and Pande, V., <a href='https://pubs.rsc.org/--/content/articlehtml/2018/sc/c7sc02664a'>MoleculeNet: a benchmark for molecular machine learning</a>. Chemical Science, 9(2), 513-530, 2018.

### qHTS assay to identify small molecule agonists of the estrogen receptor alpha (ER-alpha) signaling pathway using the BG1 cell line

Estrogen receptor (ER), a nuclear hormone receptor, plays an important role in development, metabolic homeostasis and reproduction. Endocrine disrupting chemicals (EDCs) and their interactions with steroid hormone receptors like ER cause disruption of normal endocrine function. Therefore, it is important to understand the effect of environmental chemicals on the ER signaling pathway. To identify ER agonists, the BG1-Luc-4E2 cell line (provided by Dr. Michael Denison from the University of California) has been used to screen the Tox21 10K compound library. The BG1-Luc-4E2 cell line endogenously expresses full-length ER-alpha and is stably transfected with a plasmid containing four estrogen responsive elements (ERE) upstream of a luciferase reporter gene. (<a href='https://pubchem.ncbi.nlm.nih.gov/bioassay/743079'>Source</a>)

In [None]:
# Download the Tox21 NR-ER SMILES file.
# The URL must be quoted: unquoted, the shell treats `&` as "run in background",
# so wget would be detached (os.system returns before the download finishes and
# always reports 0) and `sec=` would be parsed as a separate shell assignment.
# -O pins the output file to the name the following rename cell expects and makes
# re-running this cell overwrite the file instead of creating numbered copies.
# The cell output is wget's exit status (0 on success).
os.system("wget -O 'download?id=nr-ersmiles' 'https://tripod.nih.gov/tox21/challenge/download?id=nr-ersmiles&sec='")

In [None]:
# Rename the downloaded file to er.smiles.
# The source name is quoted so the shell treats `?` literally instead of as a
# single-character glob (which could match, or fail on, other files).
# The cell output is mv's exit status (0 on success).
os.system("mv 'download?id=nr-ersmiles' er.smiles")

In [None]:
# er.smiles is read as tab-separated text with no header row; the three columns
# are named explicitly here.
column_names = ['smiles', 'toxid', 'y']
df = pd.read_csv('er.smiles', sep='\t', names=column_names)
df

In [None]:
# Featurise every molecule as a Morgan fingerprint bit vector (radius 5) and keep
# the labels aligned with the molecules that could actually be parsed.
smiles = df['smiles'].values
y = df['y'].values
fps = []
ys = []
n_skipped = 0
for idx, s in enumerate(smiles):
  # Chem.MolFromSmiles reports an unparsable SMILES by returning None, so check
  # for that explicitly instead of hiding every possible error behind a bare
  # `except:`. Non-string entries (e.g. NaN from an empty field) are skipped too.
  m = Chem.MolFromSmiles(s) if isinstance(s, str) else None
  if m is None:
    n_skipped += 1
    continue
  fp = AllChem.GetMorganFingerprintAsBitVect(m, 5)
  fps.append(fp)
  ys.append(y[idx])

# One row per successfully parsed molecule, one column per fingerprint bit.
X = np.stack(fps, axis=0)
y = np.array(ys)
print(f"skipped {n_skipped} of {len(smiles)} entries that could not be parsed")
print(X.shape, y.shape)

## Question 1: Data Visualization

Visualize the distribution of labels. Is there a class imbalance present in the dataset? Next, visualize some of the molecules using the provided helper function, `smiles_to_img()`.

In [None]:
def smiles_to_img(smiles):
  """ Draws the molecule described by a smiles string in the notebook

  Parameters:
  -----------
  smiles: string
    smiles string representing the molecule to draw.

  Returns:
  --------
  None
    The image is shown with `display()`; nothing is returned.
  """
  # Parse -> render -> show, in a single expression.
  display(Draw.MolToImage(Chem.MolFromSmiles(smiles)))

## Question 2: Random Forest Classification and Hyperparameter Search

Split the dataset into 80% training and 20% testing. Construct a hyperparameter search for the hyperparameter `n_estimators` or number of decision trees. Use 5-fold cross-validation to find the optimal value for this parameter based on validation R2 score. Comment on how you decided on the maximum and minimum values for your hyperparameter search and how, if at all, the validation score depends on this hyperparameter. What is the performance on the test set?

Hint: Review `15_Gaussian_Process_and_Random_Forest_Regression.ipynb`



## Question 3: Feature Importance and Visualization

Determine the most important features of the dataset using the `RandomForestRegressor.feature_importances_` attribute. Each feature corresponds to a molecular functional group. Visualize the important functional groups for a few molecules using the helper function, `view_feature_importances()`.

In [None]:
def view_feature_importances(smiles, feature_importances):
  """ Converts smiles string to image and highlights important features

  Displays the whole molecule first, then one image per selected Morgan
  fingerprint bit (radius 5) that is set in this molecule, chosen by
  descending feature importance.

  Parameters:
  -----------
  smiles: string
    smiles string representing a molecule.
  feature_importances: list
    list of feature importances, indexed by fingerprint bit; presumably the
    `feature_importances_` of a model trained on fingerprints built with the
    same radius -- verify against the caller.

  Returns:
  --------
  None
  """
  bitinfo = {}
  mol = Chem.MolFromSmiles(smiles)
  # bitInfo is filled in by RDKit with the atom environments behind each set bit.
  fp = AllChem.GetMorganFingerprintAsBitVect(mol, 5, bitInfo=bitinfo)
  display(Draw.MolToImage(mol))

  mfp = np.array(fp)
  # Bit indices ordered from most to least important, restricted to the bits
  # that are actually set in this molecule.
  ranked = np.argsort(feature_importances)[::-1]
  sorted_bits = [i for i in ranked if mfp[i] == 1]
  # NOTE(review): this shows ranks 6-10 and skips the five most important set
  # bits; kept as in the original -- confirm whether [:5] was intended.
  for bit in sorted_bits[5:10]:
    # Draw on `mol`, the molecule parsed above. The original referenced an
    # undefined local `m`, which silently picked up a leftover notebook global
    # (a different molecule) or raised NameError on a fresh kernel.
    img = Draw.DrawMorganBit(mol, int(bit), bitinfo)
    display(img)
  return None