# Random Forest Classifiers
Today I am going to try to implement a random forest classifier on a dataset of activities vs a specific compound.
I will split the data into "Active/Inactive" classes and then use Morgan Fingerprints as the input for a random forest classifier.

## Aims
1. Get a classifier with reasonable accuracy

## Pitfalls
1. Does it even work with boolean vectors?

## Todo
1. Once the model is built, use the Fibres of Failure Approach to examine it.
2. Use a radnom forest regressor to get actual activity data?



In [1]:
import pickle
import sys

import scipy

import rdkit
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
from rdkit.Chem import DataStructs
import numpy as np
import pandas as pd

from collections import Counter

import sklearn.ensemble


Here are the hyper-parameters selecting activity cutoffs and which target we wish to look at.

In [105]:
ACTIVITY_CUTOFF = 5.0
DESIRED_TARGETS = ["CHEMBL240"]
MAPPER_TARGETS = ["CHEMBL240", "CHEMBL264"]
FP_SIZE = 2048
VALIDATE_BY_YEAR = False
if VALIDATE_BY_YEAR:
    YEAR_CUTOFF = 2013
else:
    VALIDATE_FRACTION = 0.15

These calculate how good the classifier is

In [68]:
def accuracy(predictions, ground_truth):
    """
    Calculates the accuracy of a given prediction, using the formula
    $Acc = \frac{\sum \text{True Positives} + \sum \text{True Negatives}}{\sum \text{Positives} + \sum \text{Negatives}}$
    Takes in two numpy array-likes
    """
    true_positives = np.sum(np.logical_and(predictions, ground_truth))
    true_negatives = np.sum(np.logical_and(np.logical_not(predictions), np.logical_not(ground_truth)))
    size = predictions.shape[0]
    return (true_positives + true_negatives) / size

In [69]:
def sensitivity(predictions, ground_truth):
    """
    Calculates the sensitivity of a given prediction, using the formula
    $Acc = \frac{\sum \text{True Positives}}{\sum \text{True Positives} + \sum \text{False Positives}}$
    Takes in two numpy array-likes
    """
    true_positives = np.sum(np.logical_and(predictions, ground_truth))
    false_positives = np.sum(np.logical_and(np.logical_not(predictions), ground_truth))
    return true_positives / (true_positives + false_negatives)

In [70]:
def specificity(predictions, ground_truth):
    """
    Calculates the specificity of a given prediction, using the formula
    $Acc = \frac{\sum \text{True Negatives}}{\sum \text{False Positives} + \sum \text{True Negatives}}$
    Takes in two numpy array-likes
    """
    true_positives = np.sum(np.logical_and(predictions, ground_truth))
    true_negatives = np.sum(np.logical_and(np.logical_not(predictions), np.logical_not(ground_truth)))
    false_positives = np.sum(np.logical_and(predictions, np.logical_not(ground_truth)))
    return (true_negatives) / (false_positives + true_negatives)

In [109]:
def dissimilarity(vec1, vec2, metric="euclidean"):
    """
    Takes in two vectors with values +1 and -1. Computes how far away they are
    in according to the metric.
    Current metrics:
    - Euclidean, computes sqrt(vec1 dot vec2)
    - Cosine distance
    - Tanimoto
    """
    metric = metric.lower()
    if metric == "euclidean":
        distance = np.abs(vec1 - vec2)
        return np.sqrt(np.dot(distance, distance))
    
    if metric == "cosine":
        return 1.0 - np.dot(vec1, vec2)/(np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2)))
                                   
    if metric == "tanimoto":
        return 1.0 - np.dot(vec1, vec2) / (np.dot(vec1, vec1) + np.dot(vec2, vec2) - np.dot(vec1, vec2))

In [71]:
with open("../data/processed/curated_set_with_publication_year.pd.pkl", "rb") as infile:
    df = pickle.load(infile)

possible_targets = Counter([item for item in df["TGT_CHEMBL_ID"]])
possible_drugs = Counter([item for item in df["CMP_CHEMBL_ID"]])

In [72]:
df = df[np.logical_or.reduce([df["TGT_CHEMBL_ID"] == tgt for tgt in DESIRED_TARGETS])]
if VALIDATE_BY_YEAR:
    training_df = df[df["DOC_YEAR"] < YEAR_CUTOFF]
    validation_df = df[df["DOC_YEAR"] >= YEAR_CUTOFF]
else:
    df = sklearn.utils.shuffle(df)
    split_point = int(df.shape[0] * VALIDATE_FRACTION)
    training_df = df.iloc[split_point:, :]
    validation_df = df.iloc[:split_point, :]

print(training_df.shape)
print(validation_df.shape)

(3998, 33)
(705, 33)


In [73]:
def convert_to_sparse(input_df, use_classes=True):
    n_samples = input_df.shape[0]
    print(n_samples)
    arr = np.empty([n_samples, FP_SIZE], dtype=bool)
    if use_classes:
        is_active = np.empty([n_samples], dtype=bool)
    else:
        is_active = np.empty([n_samples], dtype=np.float64)
    for index, (item, row) in enumerate(input_df.iterrows()):
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row["SMILES"]),
                                                                  radius=3,
                                                                  nBits=FP_SIZE)
        DataStructs.ConvertToNumpyArray(fingerprint, arr[index, :])
        if use_classes:
            if row["BIOACT_PCHEMBL_VALUE"] < ACTIVITY_CUTOFF:
                is_active[index] = False
            else:
                is_active[index] = True
        else:
            is_active[index] = row["BIOACT_PCHEMBL_VALUE"]

    observations = scipy.sparse.csc_matrix(arr)
    return observations, is_active

In [74]:
training_observations, training_is_active = convert_to_sparse(training_df)
validation_observations, validation_is_active = convert_to_sparse(validation_df)

3998
705


How much does the n_estimators parameter actually matter?
Answer: 1024 seems to be just fine.

In [79]:
model = sklearn.ensemble.RandomForestClassifier(n_estimators=512, criterion="gini", n_jobs=4, bootstrap=False, max_features="log2")
model.fit(training_observations, training_is_active)
model.score(validation_observations, validation_is_active)

0.7971631205673759

In [80]:
model = sklearn.ensemble.RandomForestClassifier(n_estimators=1024, criterion="gini", n_jobs=4, bootstrap=False, max_features="log2")
model.fit(training_observations, training_is_active)
model.score(validation_observations, validation_is_active)

0.8

In [81]:
model = sklearn.ensemble.RandomForestClassifier(n_estimators=2048, criterion="gini", n_jobs=4, bootstrap=False, max_features="log2")
model.fit(training_observations, training_is_active)
model.score(validation_observations, validation_is_active)

0.7957446808510639

In [82]:
model = sklearn.ensemble.RandomForestClassifier(n_estimators=4096, criterion="gini", n_jobs=4, bootstrap=False, max_features="log2")
model.fit(training_observations, training_is_active)
model.score(validation_observations, validation_is_active)

0.8

How well do other classifiers fit the data? ExtraTrees is good in the case of few important features and lots of noisy features, but equivalent otherwise.

Answer: it scores pretty similarly, within the margin of error.

In [78]:
model = sklearn.ensemble.ExtraTreesClassifier(n_estimators=2048, criterion="gini", n_jobs=4, bootstrap=False, max_features="log2")
model.fit(training_observations, training_is_active)
model.score(validation_observations, validation_is_active)

0.8

In [55]:
predictions = model.predict(validation_observations)

In [56]:
print("Accuracy =", accuracy(predictions, validation_is_active))
print("Sensitivity =", sensitivity(predictions, validation_is_active))
print("Specificity =", specificity(predictions, validation_is_active))

Accuracy = 0.7680851063829788
Sensitivity = 0.7848101265822784
Specificity = 0.375


In [86]:
regressor = sklearn.ensemble.RandomForestRegressor(n_estimators=1024, criterion="mse", verbose=1, n_jobs=4, bootstrap=True)

In [84]:
training_observations, training_is_active = convert_to_sparse(training_df, use_classes=False)
validation_observations, validation_is_active = convert_to_sparse(validation_df, use_classes=False)


3998
705


Can we get useful regression information? (Warning: Takes 6 minutes, and the answer is "no")

In [87]:
regressor.fit(training_observations, training_is_active)
regressor.score(validation_observations, validation_is_active)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 1024 out of 1024 | elapsed:  6.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 1024 out of 1024 | elapsed:    0.2s finished


0.5116423089924126

In [88]:
with open("vector_df.pkl", "rb") as fi:
    vector_df = pickle.load(fi)

In [92]:
max_drugs_shared = 0
for index in range(len(vector_df.index)):
    if vector_df.iloc[index].name == "CHEMBL240":
        print("Looking at Chembl 240")
        print(index)
    drugs = np.abs(vector_df.iloc[index].values)
    if (index % 50 == 0):
        print(index, max_drugs_shared)
    for other_index in range(index):
        other_drugs = np.abs(vector_df.iloc[other_index].values)
        drugs_shared = np.sum(np.logical_and(drugs.astype(bool), other_drugs.astype(bool)))
        if drugs_shared > max_drugs_shared:
            max_drugs_shared = drugs_shared
            max_coords = (index, other_index)

0 0
50 631
100 1908
150 1908
Looking at Chembl 240
165
200 1989
250 1989
300 1989
350 1989
400 1989
450 1989
500 1989
550 1989
600 1989
650 1989
700 1989
750 1989


KeyboardInterrupt: 

In [101]:
max_drugs_shared = 0
chembl_240_drugs = np.abs(vector_df.iloc[165].values)
for index in range(len(vector_df.index)):
    if index == 165:
        continue
    other_drugs = np.abs(vector_df.iloc[index].values)
    drugs_shared = np.sum(np.logical_and(chembl_240_drugs.astype(bool), other_drugs.astype(bool)))
    if drugs_shared > max_drugs_shared:
        max_drugs_shared = drugs_shared
        max_coords = (165, index)

In [102]:
print(max_coords, max_drugs_shared)

(165, 142) 352


In [103]:
print(vector_df.iloc[142].name)

CHEMBL264


In [106]:
for index, series in df[np.logical_or.reduce([df["TGT_CHEMBL_ID"] == tgt for tgt in MAPPER_TARGETS])].iterrows():
    molec = Chem.MolFromSmiles(series["SMILES"])
    chembl_id = series["CMP_CHEMBL_ID"]
    rdDepictor.Compute2DCoords(molec)
    drawer = rdMolDraw2D.MolDraw2DSVG(250, 250)
    drawer.DrawMolecule(molec)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    with open(f"./Figures/{chembl_id}.svg", "w") as svgfile:
        svgfile.write(svg)

NameError: name 'rdDepictor' is not defined

In [107]:
sub_df = vector_df.T[np.logical_or.reduce([vector_df.loc[tgt].values != 0 for tgt in MAPPER_TARGETS])]

In [None]:
count = 0
distance_matrix = np.zeros([len(sub_df), len(sub_df)])
for drug_index in range(len(sub_df)):
    if not drug_index % 100:
        print(drug_index)
    drug = sub_df.iloc[drug_index].values
    for other_index in range(drug_index):
        other_drug = sub_df.iloc[other_index].values
        distance = dissimilarity(drug, other_drug, "tanimoto")
        distance_matrix[drug_index, other_index] = distance
        distance_matrix[other_index, drug_index] = distance

0




100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
