# Random Forest Classifiers
Today I am going to try to implement a random forest classifier on a dataset of activities vs a specific compound.
I will split the data into "Active/Inactive" classes and then use Morgan Fingerprints as the input for a random forest classifier.

## Aims
1. Get a classifier with reasonable accuracy

## Pitfalls
1. Does it even work with boolean vectors?

## Todo
1. Once the model is built, use the Fibres of Failure Approach to examine it.
2. Use a random forest regressor to predict the actual activity values?



In [2]:
import pickle
import sys

import scipy

import rdkit
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import DataStructs
import numpy as np
import pandas as pd

from collections import Counter

import sklearn.ensemble


Here are the hyper-parameters selecting activity cutoffs and which target we wish to look at.

In [3]:
# Reproducibility and featurisation settings.
RANDOM_STATE = 2019  # seed handed to the shuffle used for the random split
FP_SIZE = 2048  # number of bits requested for each Morgan fingerprint

# BIOACT_PCHEMBL_VALUE is compared against this threshold to label activity.
ACTIVITY_CUTOFF = 5.0

# Targets used to train the classifier, and targets used for the mapper analysis.
DESIRED_TARGETS = ["CHEMBL240"]
MAPPER_TARGETS = ["CHEMBL240", "CHEMBL264"]

# Validation strategy: split on publication year, or hold out a random fraction.
# Only the constant needed by the selected strategy is defined.
VALIDATE_BY_YEAR = False
if not VALIDATE_BY_YEAR:
    VALIDATE_FRACTION = 0.05
else:
    YEAR_CUTOFF = 2013

These calculate how good the classifier is

In [4]:
# Load the curated bioactivity table (a pickled object that is used as a
# pandas DataFrame by the rest of the notebook).
with open("../data/processed/curated_set_with_publication_year.pd.pkl", "rb") as infile:
    df = pickle.load(infile)

# Number of rows recorded for every target id and every compound id.
possible_targets = Counter(df["TGT_CHEMBL_ID"].tolist())
possible_drugs = Counter(df["CMP_CHEMBL_ID"].tolist())

In [5]:
# Build a target-by-drug activity table and a per-compound fingerprint cache.
#
# vector_df has one row per target and one column per drug. Cells start at 0
# and are overwritten with +1 / -1 for every (drug, target) pair that appears
# in df, so 0 means "no row for this pair".
vector_df = pd.DataFrame(0, columns=possible_drugs.keys(), index=possible_targets.keys(), dtype=np.int8)
counted = 0
fingerprint_dict = {}
for _, row in df.iterrows():
    drug = row["CMP_CHEMBL_ID"]
    target = row["TGT_CHEMBL_ID"]
    # Fingerprints are only needed for compounds measured against a mapper
    # target; compute each one once and reuse it afterwards.
    if target in MAPPER_TARGETS and drug not in fingerprint_dict:
        fingerprint_dict[drug] = AllChem.GetMorganFingerprintAsBitVect(
            Chem.MolFromSmiles(row["SMILES"]),
            radius=3,
            nBits=FP_SIZE,
        )
    if counted % 10000 == 0:
        print("Counted up to", counted)
    # Write through a single .at[row, column] access. The previous chained form
    # (vector_df[drug][target] = ...) assigns into an intermediate column object
    # and is not guaranteed to update vector_df itself.
    # NOTE(review): a value exactly equal to ACTIVITY_CUTOFF is labelled inactive
    # here but active in convert_to_sparse - confirm which one is intended.
    if row["BIOACT_PCHEMBL_VALUE"] > ACTIVITY_CUTOFF:
        vector_df.at[target, drug] = 1
    else:
        vector_df.at[target, drug] = -1
    counted += 1

Counted up to 0
Counted up to 10000
Counted up to 20000
Counted up to 30000
Counted up to 40000
Counted up to 50000
Counted up to 60000
Counted up to 70000
Counted up to 80000
Counted up to 90000
Counted up to 100000
Counted up to 110000
Counted up to 120000
Counted up to 130000
Counted up to 140000
Counted up to 150000
Counted up to 160000
Counted up to 170000
Counted up to 180000
Counted up to 190000
Counted up to 200000
Counted up to 210000
Counted up to 220000
Counted up to 230000
Counted up to 240000
Counted up to 250000
Counted up to 260000
Counted up to 270000
Counted up to 280000
Counted up to 290000
Counted up to 300000
Counted up to 310000


In [8]:
# Keep only the rows measured against one of the classifier targets.
target_mask = np.logical_or.reduce([df["TGT_CHEMBL_ID"] == tgt for tgt in DESIRED_TARGETS])
sub_df = df[target_mask]

if not VALIDATE_BY_YEAR:
    # Random hold-out: shuffle with a fixed seed, then take the leading
    # VALIDATE_FRACTION of the rows as the validation set.
    sub_df = sklearn.utils.shuffle(sub_df, random_state=RANDOM_STATE)
    split_point = int(sub_df.shape[0] * VALIDATE_FRACTION)
    validation_df = sub_df.iloc[:split_point, :]
    training_df = sub_df.iloc[split_point:, :]
else:
    # Temporal hold-out: documents before the cutoff year train the model,
    # documents from the cutoff year onwards validate it.
    publication_year = sub_df["DOC_YEAR"]
    training_df = sub_df[publication_year < YEAR_CUTOFF]
    validation_df = sub_df[publication_year >= YEAR_CUTOFF]

for split in (training_df, validation_df):
    print(split.shape)

(4468, 33)
(235, 33)


In [9]:
def convert_to_sparse(input_df, use_classes=True):
    """
    Turn a table of compounds into a fingerprint matrix plus a label vector.

    For every row of ``input_df`` the "SMILES" column is parsed and converted
    into a radius-3 Morgan fingerprint with FP_SIZE bits. The number of rows
    is printed as a side effect.

    Parameters:
    - input_df: table with "SMILES" and "BIOACT_PCHEMBL_VALUE" columns.
    - use_classes: when True the labels are booleans (False where the pChEMBL
      value is below ACTIVITY_CUTOFF, True otherwise); when False the labels
      are the raw pChEMBL values stored as float64.

    Returns a tuple ``(observations, is_active)`` where ``observations`` is a
    scipy CSC matrix built from an [n_samples, FP_SIZE] boolean array and
    ``is_active`` is the label vector of length n_samples.
    """
    n_samples = input_df.shape[0]
    print(n_samples)

    label_dtype = bool if use_classes else np.float64
    is_active = np.empty([n_samples], dtype=label_dtype)
    bits = np.empty([n_samples, FP_SIZE], dtype=bool)

    for position, (_, record) in enumerate(input_df.iterrows()):
        molecule = Chem.MolFromSmiles(record["SMILES"])
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=3, nBits=FP_SIZE)
        # Fills row `position` of the dense matrix in place.
        DataStructs.ConvertToNumpyArray(bit_vector, bits[position, :])

        activity = record["BIOACT_PCHEMBL_VALUE"]
        is_active[position] = (not (activity < ACTIVITY_CUTOFF)) if use_classes else activity

    return scipy.sparse.csc_matrix(bits), is_active

In [10]:
# Featurise both splits. use_classes is left at its default (True), so the
# label vectors are boolean active/inactive flags.
training_observations, training_is_active = convert_to_sparse(training_df)
validation_observations, validation_is_active = convert_to_sparse(validation_df)

4468
235


How much does the n_estimators parameter actually matter?
Answer: 1024 seems to be just fine.

In [11]:
# Fit the active/inactive classifier on the training fingerprints.
# random_state is fixed so that the score below can be reproduced between runs;
# without it every fit draws a different random feature subset per split.
model = sklearn.ensemble.RandomForestClassifier(
    n_estimators=1024,
    criterion="gini",
    n_jobs=4,
    bootstrap=False,
    max_features="log2",
    random_state=RANDOM_STATE,
)
model.fit(training_observations, training_is_active)
# Mean accuracy on the held-out validation split (shown as the cell output).
model.score(validation_observations, validation_is_active)

0.8170212765957446

In [12]:
# Predicted active/inactive label for every compound in the validation split.
predictions = model.predict(validation_observations)
print(predictions)

[ True  True  True False  True  True  True  True  True False  True False
 False False  True  True  True  True False  True  True  True  True  True
 False  True False  True  True  True False  True False  True  True  True
  True  True  True  True  True  True  True  True False  True  True  True
  True  True  True False  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True False
 False  True  True  True  True False  True  True  True  True False False
 False  True False  True  True  True  True  True  True  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True False  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True  True  True False False  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True Fa

In [84]:
# Render a 250x250 SVG depiction for every row measured against a mapper
# target and store it as ./Figures/<compound id>.svg.
mapper_rows = df[np.logical_or.reduce([df["TGT_CHEMBL_ID"] == tgt for tgt in MAPPER_TARGETS])]
for _, record in mapper_rows.iterrows():
    molecule = Chem.MolFromSmiles(record["SMILES"])
    compound_id = record["CMP_CHEMBL_ID"]
    # 2D coordinates have to exist before the molecule can be drawn.
    rdDepictor.Compute2DCoords(molecule)
    canvas = rdMolDraw2D.MolDraw2DSVG(250, 250)
    canvas.DrawMolecule(molecule)
    canvas.FinishDrawing()
    with open(f"./Figures/{compound_id}.svg", "w") as svgfile:
        svgfile.write(canvas.GetDrawingText())

In [13]:
# One boolean per drug: True when the drug has a non-zero entry for at least
# one of the mapper targets.
has_mapper_measurement = np.logical_or.reduce(
    [vector_df.loc[tgt].values != 0 for tgt in MAPPER_TARGETS]
)
# Transpose so that rows are drugs, then keep only the selected drugs.
sub_vec_df = vector_df.T[has_mapper_measurement]

In [14]:
def dissimilarity(vec1, vec2, metric="euclidean"):
    """
    Compute how far apart two activity vectors are.

    The vectors are expected to hold the values +1, -1 (and 0 for "not
    measured"). They are converted to float64 before any arithmetic: the
    activity table is stored as int8, and integer dot products silently wrap
    around once they exceed the int8 range.

    Parameters:
    - vec1, vec2: one-dimensional array-likes of equal length.
    - metric: case-insensitive name of the metric, one of
      - "euclidean": sqrt((vec1 - vec2) dot (vec1 - vec2))
      - "cosine": 1 - (vec1 dot vec2) / (|vec1| * |vec2|)
      - "tanimoto": 1 - (vec1 dot vec2) / (|vec1|^2 + |vec2|^2 - vec1 dot vec2)

    Returns the dissimilarity as a float. The cosine and Tanimoto forms divide
    by zero (yielding nan with a NumPy warning) when a vector is all zeros.

    Raises ValueError if the metric name is not recognised.
    """
    vec1 = np.asarray(vec1, dtype=np.float64)
    vec2 = np.asarray(vec2, dtype=np.float64)
    metric = metric.lower()

    if metric == "euclidean":
        difference = vec1 - vec2
        return np.sqrt(np.dot(difference, difference))

    if metric == "cosine":
        return 1.0 - np.dot(vec1, vec2) / (np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2)))

    if metric == "tanimoto":
        overlap = np.dot(vec1, vec2)
        return 1.0 - overlap / (np.dot(vec1, vec1) + np.dot(vec2, vec2) - overlap)

    raise ValueError(f"Unknown metric {metric!r}; expected 'euclidean', 'cosine' or 'tanimoto'")

In [87]:
# Pairwise Tanimoto dissimilarity between the activity vectors of all drugs in
# sub_vec_df. The matrix is symmetric with a zero diagonal, so only the lower
# triangle is computed and each value is mirrored.
count = 0  # not used in this cell; kept so the notebook namespace is unchanged
# Pull the activity vectors out of pandas once instead of once per pair.
activity_vectors = sub_vec_df.values
n_drugs = len(sub_vec_df)
distance_matrix = np.zeros([n_drugs, n_drugs])
for drug_index in range(n_drugs):
    if not drug_index % 100:
        print(drug_index)
    drug = activity_vectors[drug_index]
    for other_index in range(drug_index):
        other_drug = activity_vectors[other_index]
        distance = dissimilarity(drug, other_drug, "tanimoto")
        distance_matrix[drug_index, other_index] = distance
        distance_matrix[other_index, drug_index] = distance
# Cache the result on disk; the context manager closes (and flushes) the file.
with open("2019-04-15-distance-matrix.pkl", "wb") as outfile:
    pickle.dump(distance_matrix, outfile)

0




100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800


In [15]:
# Reload the cached activity distance matrix; the context manager closes the
# file handle, which the bare open() call left dangling.
with open("2019-04-15-distance-matrix.pkl", "rb") as infile:
    distance_matrix = pickle.load(infile)

In [17]:
# Pairwise chemical dissimilarity (1 - Tanimoto similarity of the cached Morgan
# fingerprints) for the drugs in sub_vec_df, in the same row order as
# distance_matrix. Only the lower triangle is computed and then mirrored.
chemical_distance = np.zeros_like(distance_matrix)
# Resolve every fingerprint once up front rather than once per pair.
fingerprints = [fingerprint_dict[drug_id] for drug_id in sub_vec_df.index]
for index, fingerprint in enumerate(fingerprints):
    if not index % 100:
        print(index)
    for other_index in range(index):
        other_fingerprint = fingerprints[other_index]
        chem_dissimiliarity = 1.0 - rdkit.DataStructs.TanimotoSimilarity(fingerprint, other_fingerprint)
        chemical_distance[index, other_index] = chem_dissimiliarity
        chemical_distance[other_index, index] = chem_dissimiliarity
# Cache the result on disk; the context manager closes (and flushes) the file.
with open("2019-04-15-chemical-distance.pkl", "wb") as outfile:
    pickle.dump(chemical_distance, outfile)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800


In [16]:
# Reload the cached chemical distance matrix; the context manager closes the
# file handle, which the bare open() call left dangling.
with open("2019-04-15-chemical-distance.pkl", "rb") as infile:
    chemical_distance = pickle.load(infile)
print(chemical_distance)

[[0.         0.87931034 0.94230769 ... 0.93548387 0.91869919 0.91358025]
 [0.87931034 0.         0.86813187 ... 0.88648649 0.9125     0.85789474]
 [0.94230769 0.86813187 0.         ... 0.88607595 0.91729323 0.88023952]
 ...
 [0.93548387 0.88648649 0.88607595 ... 0.         0.92537313 0.9127907 ]
 [0.91869919 0.9125     0.91729323 ... 0.92537313 0.         0.90070922]
 [0.91358025 0.85789474 0.88023952 ... 0.9127907  0.90070922 0.        ]]


In [17]:
from sklearn.manifold import MDS

# Two-dimensional non-metric MDS embedding of the precomputed chemical
# distances; used as the lens for the mapper graph. The seed is fixed so the
# embedding (and everything built on it) can be reproduced between runs.
lens = MDS(
    n_components=2,
    dissimilarity="precomputed",
    metric=False,
    random_state=RANDOM_STATE,
).fit_transform(chemical_distance)

In [18]:
# Rows of df measured against any of the mapper targets.
is_mapper_row = np.logical_or.reduce([df["TGT_CHEMBL_ID"] == tgt for tgt in MAPPER_TARGETS])
mapper_df = df[is_mapper_row]

# Dense fingerprint matrix filled from the cached fingerprints, one row per
# row of mapper_df.
arr = np.empty([len(mapper_df), FP_SIZE])
for index, drug in enumerate(mapper_df["CMP_CHEMBL_ID"]):
    fp = fingerprint_dict[drug]
    DataStructs.ConvertToNumpyArray(fp, arr[index, :])

# Sparse fingerprint matrix for the same rows; the regression-style labels
# returned alongside it are discarded.
observations, _ = convert_to_sparse(mapper_df, use_classes=False)

7251


In [20]:
# Bare expression: the notebook renders mapper_df as the cell output.
mapper_df

Unnamed: 0,BIOACT_PCHEMBL_VALUE,CMP_ACD_LOGD,CMP_ACD_LOGP,CMP_ALOGP,CMP_AROMATIC_RINGS,CMP_CHEMBL_ID,CMP_FULL_MWT,CMP_HBA,CMP_HBD,CMP_HEAVY_ATOMS,...,CMP_STANDARD_INCHI_KEY,CMP_STRUCTURE_TYPE,CMP_TYPE_PROTEIN,CMP_TYPE_SMALL_MOLECULE,DOC_YEAR,SMILES,TC_key,TGT_CHEMBL_ID,TGT_ORGANISM,TGT_TID
CHEMBL264 - CHEMBL106158,5.100,-0.49,-0.45,-0.26,1,CHEMBL106158,248.28,6,3,18,...,CCOQWVUQXNRKKP-WCBMZHEXSA-N,MOL,False,True,2010,C/N=C(\NC#N)NC[C@@H]1CC[C@H](c2c[nH]cn2)O1,CHEMBL264 - CHEMBL106158,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL178191,9.280,2.84,5.40,4.94,3,CHEMBL178191,376.49,3,0,28,...,ZQCABOOXQFMEIL-QGZVFWFLSA-N,MOL,False,True,2005,C[C@@H]1CCCN1CCc1cc2cc(C(=O)c3ccc(N(C)C)cc3)cc...,CHEMBL264 - CHEMBL178191,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL199187,9.370,2.72,5.32,4.05,3,CHEMBL199187,380.44,5,1,28,...,JGEZUVQRDLSMME-OAHLLOKOSA-N,MOL,False,True,2005,C[C@@H]1CCCN1CCc1cc2cc(CNc3ncccc3[N+](=O)[O-])...,CHEMBL264 - CHEMBL199187,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL351231,8.620,2.80,5.37,4.77,3,CHEMBL351231,330.42,2,0,25,...,KFHYZKCRXNRKRC-MRXNPFEDSA-N,MOL,False,True,2011,C[C@@H]1CCCN1CCc1cc2cc(-c3ccc(C#N)cc3)ccc2o1,CHEMBL264 - CHEMBL351231,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL180368,9.160,2.80,5.37,4.77,3,CHEMBL180368,330.42,2,0,25,...,KFHYZKCRXNRKRC-INIZCTEOSA-N,MOL,False,True,2005,C[C@H]1CCCN1CCc1cc2cc(-c3ccc(C#N)cc3)ccc2o1,CHEMBL264 - CHEMBL180368,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL1688971,7.150,2.38,2.84,2.68,2,CHEMBL1688971,259.35,3,1,19,...,DFXVOYQWLICYQI-UHFFFAOYSA-N,MOL,False,True,2011,c1nc(CCCn2cc(C3CCCCC3)nn2)c[nH]1,CHEMBL264 - CHEMBL1688971,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL1688981,6.100,2.32,2.90,2.83,1,CHEMBL1688981,305.46,4,0,22,...,ICSIZZWCCMPDKQ-UHFFFAOYSA-N,MOL,False,True,2011,CN1CCN(CCCn2cc(CC3CCCCC3)nn2)CC1,CHEMBL264 - CHEMBL1688981,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL240412,8.440,0.91,3.59,5.46,2,CHEMBL240412,436.65,4,0,31,...,ZJAKASJNDYDJES-UHFFFAOYSA-N,MOL,False,True,2007,CSc1ccc(C2CN3CCCC3c3cc(OCCCN4CCCCC4)ccc32)cc1,CHEMBL264 - CHEMBL240412,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL369502,9.680,3.11,5.69,5.39,3,CHEMBL369502,373.49,2,0,28,...,BQTMIBGMCRVQPU-QGZVFWFLSA-N,MOL,False,True,2005,C[C@@H]1CCCN1CCc1cc2cc(-c3cccc(C(=O)C4CC4)c3)c...,CHEMBL264 - CHEMBL369502,CHEMBL264,Homo sapiens,10280
CHEMBL264 - CHEMBL11919,7.600,-2.53,-0.35,-0.34,1,CHEMBL11919,125.17,2,2,9,...,XNQIOISZPFVUFG-YFKPBYRVSA-N,MOL,False,True,2010,C[C@H](N)Cc1c[nH]cn1,CHEMBL264 - CHEMBL11919,CHEMBL264,Homo sapiens,10280


In [27]:
# Show the activity table with one row per drug (i.e. one row per column of
# vector_df) and one column per target.
drug_major_matrix = vector_df.values.T
print(drug_major_matrix)

[[ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0 -1]
 [ 0  0  0 ...  0  0 -1]
 [ 0  0  0 ...  0  0 -1]]


In [18]:
# Predicted active/inactive label for every row of the mapper set.
predictions = model.predict(observations)
# NumPy abbreviates arrays whose size exceeds the threshold, so derive the
# threshold from the array itself instead of hard-coding the current row count.
with np.printoptions(threshold=predictions.size + 1):
    print(predictions)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T

In [23]:
# Build and visualise a KeplerMapper graph: the MDS embedding is the lens, the
# activity distance matrix is the (precomputed) data, HDBSCAN clusters each
# cover element, and nodes are coloured by the model predictions.
import hdbscan
from IPython.display import SVG, IFrame
import kmapper as km
# threshold=7000 is what this print uses; the captured output below the cell is
# abbreviated with "...".
with np.printoptions(threshold=7000):
    print(predictions)

mapper = km.KeplerMapper(verbose=1)
# NOTE(review): lens and distance_matrix have one row per drug in sub_vec_df,
# whereas predictions has one entry per row of mapper_df (7251 according to the
# printed count). The progress output of the distance loops stops at 6800, so
# these lengths look different - confirm color_function is aligned with lens.
graph = mapper.map(lens,
                   X=distance_matrix,
                   precomputed=True,
                   cover=km.Cover(n_cubes=[6, 6], perc_overlap=0.08),
                   clusterer=hdbscan.HDBSCAN(metric='precomputed', min_cluster_size=10, min_samples=3))
mapper.visualize(graph, path_html="2019-04-15-mb-random-forests-2-map.html",
                 title="Vectors in Drug Testing Space With MDS", color_function=predictions)
# Embed the generated HTML page in the notebook output.
IFrame("2019-04-15-mb-random-forests-2-map.html", 800, 600)

[ True  True  True ...  True  True  True]


KeyError: 'CHEMBL240'