# Tying It All Together

Today I will be trying to tie all of the disparate pieces of this project together.
## Aims
1. Get a simple TDA graph of all of the drugs tested against a certain target. Use MDS (and maybe activity) as the lens
2. Get a TDA graph of all the drugs tested against target A and target B. Show the links between them.
3. Use the 'vectors in drug testing space' against two targets to see the links, and tie this together with a predictor to get some useful results.
4. Make the FIFA presentable.



In [6]:
import pickle
import sys

import scipy

import rdkit
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import DataStructs

from IPython.display import SVG, IFrame

import numpy as np
import pandas as pd

from collections import Counter

import hdbscan

import kmapper as km
import igraph

import sklearn.ensemble
from sklearn.manifold import MDS

Here are the hyper-parameters selecting activity cutoffs and which target we wish to look at. Note that validating by year is less accurate, presumably because chemistry changes and different styles of molecule are made.

In [7]:
ACTIVITY_CUTOFF = 5.0
DESIRED_TARGETS = ["CHEMBL240"]
MAPPER_TARGETS = ["CHEMBL240", "CHEMBL264"]
FP_SIZE = 2048
RANDOM_STATE = 2019
VALIDATE_BY_YEAR = False
if VALIDATE_BY_YEAR:
    YEAR_CUTOFF = 2013
else:
    TRAIN_RF_FRACTION = 0.60
    TRAIN_FIFA_FRACTION = 0.20
    VALIDATE_FRACTION = 1.0 - TRAIN_FIFA_FRACTION - TRAIN_RF_FRACTION

# Community detection hyperparameters.
# Discard any with too small a set of nodes,
# or too small a prediction error.
COMMUNITY_SIZE_CUTOFF = 3
COMMUNITY_ERROR_CUTOFF = 0.20
CORRECTION_STD_WARN = 0.10

In [8]:
with open("../data/processed/curated_set_with_publication_year.pd.pkl", "rb") as infile:
    df = pickle.load(infile)

possible_targets = Counter([item for item in df["TGT_CHEMBL_ID"]])
possible_drugs = Counter([item for item in df["CMP_CHEMBL_ID"]])

In [9]:
counted = 0
fingerprint_dict = {}
for index, row in df.iterrows():
    drug = row["CMP_CHEMBL_ID"]
    target = row["TGT_CHEMBL_ID"]
    if target in MAPPER_TARGETS or target in DESIRED_TARGETS:
        try:
            if not fingerprint_dict[drug]:
                fingerprint_dict[drug] = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row["SMILES"]),
                                                                               radius=3,
                                                                               nBits=FP_SIZE)
        except KeyError:
            fingerprint_dict[drug] = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row["SMILES"]),
                                                                               radius=3,
                                                                               nBits=FP_SIZE)
    counted += 1

In [5]:
sub_df = df[np.logical_or.reduce([df["TGT_CHEMBL_ID"] == tgt for tgt in DESIRED_TARGETS])]

if VALIDATE_BY_YEAR:
    training_df = sub_df[sub_df["DOC_YEAR"] < YEAR_CUTOFF]
    validation_df = sub_df[sub_df["DOC_YEAR"] >= YEAR_CUTOFF]
else:
    sub_df = sklearn.utils.shuffle(sub_df, random_state=RANDOM_STATE)
    rf_split_point = int(sub_df.shape[0] * TRAIN_RF_FRACTION)
    fifa_split_point = int(sub_df.shape[0] * (TRAIN_RF_FRACTION + TRAIN_FIFA_FRACTION))
    validation_split_point = int(sub_df.shape[0] * (1.0 - VALIDATE_FRACTION))
    rf_training_df = sub_df.iloc[:rf_split_point, :]
    fifa_training_df = sub_df.iloc[:fifa_split_point, :]
    validation_df = sub_df.iloc[validation_split_point:, :]

print(rf_training_df.shape)
print(fifa_training_df.shape)
print(validation_df.shape)


(2821, 33)
(3762, 33)
(941, 33)


In [21]:
def convert_to_sparse(input_df, use_classes=True):
    n_samples = input_df.shape[0]
    print(n_samples)
    arr = np.empty([n_samples, FP_SIZE], dtype=bool)
    if use_classes:
        is_active = np.empty([n_samples], dtype=bool)
    else:
        is_active = np.empty([n_samples], dtype=np.float64)
    for index, (item, row) in enumerate(input_df.iterrows()):
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(row["SMILES"]),
                                                                  radius=3,
                                                                  nBits=FP_SIZE)
        DataStructs.ConvertToNumpyArray(fingerprint, arr[index, :])
        if use_classes:
            if row["BIOACT_PCHEMBL_VALUE"] < ACTIVITY_CUTOFF:
                is_active[index] = False
            else:
                is_active[index] = True
        else:
            is_active[index] = row["BIOACT_PCHEMBL_VALUE"]

    observations = scipy.sparse.csc_matrix(arr)
    return observations, is_active

In [22]:
compound_names = [row["CMP_CHEMBL_ID"] for _, row in sub_df.iterrows()]

In [34]:
rf_training_observations, rf_training_is_active = convert_to_sparse(rf_training_df)
fifa_training_observations, fifa_training_is_active = convert_to_sparse(fifa_training_df)
validation_observations, validation_is_active = convert_to_sparse(validation_df)

1881
3762
941


In [10]:
import shutil
for fig in [f"./Figures/{chembl_id}.svg" for chembl_id in fifa_training_df["CMP_CHEMBL_ID"]]:
    shutil.copy2(fig, "./NewFigures/")


IsADirectoryError: [Errno 21] Is a directory: './NewFigures/'

How much does the n_estimators parameter actually matter?
Answer: 1024 seems to be just fine.

In [36]:
model = sklearn.ensemble.RandomForestClassifier(n_estimators=512, criterion="gini", n_jobs=4, random_state=RANDOM_STATE)
model.fit(rf_training_observations, rf_training_is_active)
model.score(validation_observations, validation_is_active)

0.7651434643995749

In [37]:
predictions = model.predict(fifa_training_observations)
probabilities = model.predict_proba(fifa_training_observations)
print(probabilities[100])

[0.05078125 0.94921875]


In [38]:
total_is_active = np.concatenate([rf_training_is_active, fifa_training_is_active, validation_is_active])
probabilities_ground_truth = np.empty(probabilities.shape[0])
probabilities_predicted = np.empty(probabilities.shape[0])

for i in range(len(probabilities)):
    is_active = int(total_is_active[i])

    probabilities_ground_truth[i] = probabilities[i][is_active]
    probabilities_predicted[i] = max(probabilities[i])
    
print(probabilities_ground_truth, probabilities_predicted)

[0.72851562 0.92578125 0.92578125 ... 0.81054688 0.62890625 0.03125   ] [0.72851562 0.92578125 0.92578125 ... 0.81054688 0.62890625 0.96875   ]


Now we have partially constructed the lens, we need to get the distances in
chemical space that we will map over.

In [94]:
chemical_distance = np.zeros([len(fifa_training_df), len(fifa_training_df)])
for index in range(len(fifa_training_df)):
    drug = fifa_training_df.iloc[index]["CMP_CHEMBL_ID"]
    fingerprint = fingerprint_dict[drug]
    if not index % 100:
        print(index)
    for other_index in range(index):
        other_fingerprint = fingerprint_dict[fifa_training_df.iloc[other_index]["CMP_CHEMBL_ID"]]
        chem_dissimiliarity = 1.0 - rdkit.DataStructs.TanimotoSimilarity(fingerprint, other_fingerprint)
        chemical_distance[index, other_index] = chem_dissimiliarity
        chemical_distance[other_index, index] = chem_dissimiliarity
pickle.dump(chemical_distance, open("2019-04-18-fifa-chemical-distance.pkl", "wb"))

0
100


KeyboardInterrupt: 

In [27]:
chemical_distance= pickle.load(open("2019-04-18-fifa-chemical-distance.pkl", "rb"))
print(chemical_distance)

[[0.         0.90972222 0.89261745 ... 0.89142857 0.89440994 0.88888889]
 [0.90972222 0.         0.90769231 ... 0.875      0.93103448 0.86923077]
 [0.89261745 0.90769231 0.         ... 0.90184049 0.89864865 0.88489209]
 ...
 [0.89142857 0.875      0.90184049 ... 0.         0.92134831 0.88484848]
 [0.89440994 0.93103448 0.89864865 ... 0.92134831 0.         0.91612903]
 [0.88888889 0.86923077 0.88489209 ... 0.88484848 0.91612903 0.        ]]


In [28]:
mds_component = MDS(n_components=1, dissimilarity="precomputed", metric=False).fit_transform(chemical_distance)

In [39]:
lens = np.empty([probabilities.shape[0], 4])
lens[:, 0] = fifa_training_is_active
lens[:, 1] = probabilities_ground_truth
lens[:, 2] = probabilities_predicted
lens[:, 3] = mds_component[:, 0]

got_it_right = np.logical_not(np.logical_xor(fifa_training_is_active, predictions))

In [40]:
custom_tooltips=np.array([f"<img src='./Figures/{chembl_id}.svg'>" for chembl_id in fifa_training_df["CMP_CHEMBL_ID"]])
mapper = km.KeplerMapper(verbose=1)
graph = mapper.map(lens,
                   X=chemical_distance,
                   precomputed=True,
                   cover=km.Cover(n_cubes=[2, 10, 10, 12], perc_overlap=[0.0, 0.05, 0.05, 0.45]),
                   clusterer=hdbscan.HDBSCAN(metric='precomputed', min_cluster_size=3, min_samples=1))
mapper.visualize(graph, path_html="2019-04-18-mb-fibres-of-failure.html",
                 title="Testing out Fibres of Failure", color_function=probabilities_ground_truth, custom_tooltips=custom_tooltips)
IFrame("2019-04-18-mb-fibres-of-failure.html", 800, 600)

KeplerMapper(verbose=1)
Mapping on data shaped (3762, 3762) using lens shaped (3762, 4)

Creating 2400 hypercubes.

Created 929 edges and 805 nodes in 0:00:01.141797.
Wrote visualization to: 2019-04-18-mb-fibres-of-failure.html


Now we will convert the `KeplerMapper` graph into an `igraph` format, so we can analyse it for communities.

With the `igraph` communities, we then have to translate it back into `KeplerMapper` nodes for colouring.

In [41]:
g = igraph.Graph()
vertices = list(graph["nodes"].keys())
g.add_vertices(vertices)

edges = []
for link in graph["links"]:
    edges.extend([(link, otherlink) for otherlink in graph["links"][link]])
g.add_edges(edges)

In [42]:
communities = g.community_leading_eigenvector()#.as_clustering()

interesting_communities = []
for community in communities:
    if len(community) > 5:
        interesting_communities.append([g.vs[item]["name"] for item in community])


In [43]:
interesting_nodes = []
flattened_nodes = []
for community in interesting_communities:
    temp_i_n = [graph["nodes"][node] for node in community]
    interesting_nodes.append(temp_i_n)
    flattened_nodes.append(list(set([item for sublist in temp_i_n for item in sublist])))

In [44]:
colours = np.zeros(probabilities_ground_truth.shape[0])
for community in flattened_nodes:
    for node in community:
        colours[node] = 1.0

For each interesting community, calculate a prediction error. When we later classify a new compound,
add this prediction error factor to it.

The idea is that this will improve our predictions.

In [45]:
error_communities = []
error_corrections = [np.array([0.0, 0.0])]
for community in flattened_nodes:
    errors = []
    corrections = []
    for node in community:
        prediction_error = 1.0 - probabilities_ground_truth[node]
        errors.append(prediction_error)
        #print(total_is_active[node], predictions[node], probabilities_ground_truth[node], probabilities_predicted[node], prediction_error)
        if total_is_active[node]:
            corrections.append([-prediction_error, prediction_error])
        else:
            corrections.append([prediction_error, -prediction_error])
    corrections = np.array(corrections)
    correction_mean = np.mean(corrections, axis = 0)
    correction_std = np.std(corrections, axis = 0, ddof=1)[0]

    if abs(correction_mean[0]) > COMMUNITY_ERROR_CUTOFF:
        if correction_std > CORRECTION_STD_WARN:
            print("Warning: This community has a large standard deviation of prediction error. Consider tweaking the clustering algorithm")
        error_communities.append(community)
        error_corrections.append(correction_mean)
print(error_corrections)

[array([0., 0.]), array([ 0.97519531, -0.97519531]), array([ 0.82449777, -0.82449777]), array([ 0.69270126, -0.69270126]), array([-0.29639529,  0.29639529]), array([-0.27938088,  0.27938088]), array([-0.25991324,  0.25991324])]


In [46]:
colours = np.zeros(probabilities_ground_truth.shape[0])
for community in error_communities:
    for node in community:
        colours[node] = 1.0
mapper.visualize(graph, path_html="2019-04-18-mb-communities.html",
                 title="Visualising Community Detection", color_function=colours, custom_tooltips=custom_tooltips)
IFrame("2019-04-18-mb-communities.html", 800, 600)

Wrote visualization to: 2019-04-18-mb-communities.html


The final step is to build a classifier to see if a given compound falls into one of the failure modes.
We can use whatever classifier we want here and it should work. A class label is an int, 0 indicates
no error, and $ 1, 2, 3, \dotsc , n  $ indicate belonging to the $n$^th error class.

In [47]:
error_classes = np.zeros([probabilities_ground_truth.shape[0]], dtype=int)
for i in range(0, len(error_communities)):
    for node in error_communities[i]:
        error_classes[node] = i + 1

In [48]:
rf_error_classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=512, criterion="gini", n_jobs=4, random_state=RANDOM_STATE)
rf_error_classifier.fit(fifa_training_observations, error_classes)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1024, n_jobs=4,
            oob_score=False, random_state=2019, verbose=0,
            warm_start=False)

In [291]:
et_error_classifier = sklearn.ensemble.ExtraTreesClassifier(n_estimators=2048, criterion="gini", n_jobs=4, random_state=RANDOM_STATE)
et_error_classifier.fit(fifa_training_observations, error_classes)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2048, n_jobs=4,
           oob_score=False, random_state=2019, verbose=0, warm_start=False)

In [424]:
def tanimoto_kernel(X, Y):
    print(np.shape(X), np.shape(Y))
    X = X.toarray()
    Y = Y.toarray()
    shared_11 = np.dot(X, Y.T)
    x_11 = np.dot(X, X.T)
    y_11 = np.dot(Y, Y.T)
    print(shared_11.shape, x_11.shape, y_11.shape)
    print(shared_11, (x_11 + y_11 - shared_11))
    print(shared_11 / (x_11 + y_11 - shared_11))
    return shared_11 / (x_11 + y_11 - shared_11)

In [58]:
svc_error_classifier = sklearn.svm.SVC(C=1.0, gamma="scale", kernel="precomputed", random_state=RANDOM_STATE)
svc_error_classifier.fit(chemical_distance, error_classes)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale',
  kernel='precomputed', max_iter=-1, probability=False, random_state=2019,
  shrinking=True, tol=0.001, verbose=False)

In [272]:
logreg_classifier = sklearn.linear_model.LogisticRegression(C=1000, solver='lbfgs', multi_class='multinomial', max_iter=10000)
logreg_classifier.fit(fifa_training_observations, error_classes)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [376]:
knn_error_classifier = sklearn.neighbors.KNeighborsClassifier(metric="precomputed", n_neighbors=15, weights="uniform")
knn_error_classifier.fit(chemical_distance, error_classes)       

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='precomputed',
           metric_params=None, n_jobs=None, n_neighbors=15, p=2,
           weights='uniform')

In [60]:
total_distances = []
for i, observation in enumerate(validation_observations):
    fp = (observation.todense()).T
    bitstring = "".join([str(int(item[0][0])) for item in fp])
    arr = np.empty(FP_SIZE, dtype=bool)
    rdkit_fp = DataStructs.cDataStructs.CreateFromBitString(bitstring)

    this_distances = []
    if (i % 10 == 0):
        print("Counted", i, "out of", validation_observations.shape[0])
    for index in range(len(fifa_training_df)):
        drug = fifa_training_df.iloc[index]["CMP_CHEMBL_ID"]
        other_fp = fingerprint_dict[drug]
        distance = rdkit.DataStructs.TanimotoSimilarity(rdkit_fp, other_fp)
        this_distances.append(distance)
    total_distances.append(this_distances)

total_distances = np.array(total_distances)

Counted 0 out of 941
Counted 10 out of 941
Counted 20 out of 941
Counted 30 out of 941
Counted 40 out of 941
Counted 50 out of 941
Counted 60 out of 941
Counted 70 out of 941
Counted 80 out of 941
Counted 90 out of 941
Counted 100 out of 941
Counted 110 out of 941
Counted 120 out of 941
Counted 130 out of 941
Counted 140 out of 941
Counted 150 out of 941
Counted 160 out of 941
Counted 170 out of 941
Counted 180 out of 941
Counted 190 out of 941
Counted 200 out of 941
Counted 210 out of 941
Counted 220 out of 941
Counted 230 out of 941
Counted 240 out of 941
Counted 250 out of 941
Counted 260 out of 941
Counted 270 out of 941
Counted 280 out of 941
Counted 290 out of 941
Counted 300 out of 941
Counted 310 out of 941
Counted 320 out of 941
Counted 330 out of 941
Counted 340 out of 941
Counted 350 out of 941
Counted 360 out of 941
Counted 370 out of 941
Counted 380 out of 941
Counted 390 out of 941
Counted 400 out of 941
Counted 410 out of 941
Counted 420 out of 941
Counted 430 out of 941

In [62]:
print(total_distances.shape)
pickle.dump(total_distances, open("2019-04-23-fifa-validation-distance.pkl", "wb"))

(941, 3762)


In [63]:
predicted_error_classes = svc_error_classifier.predict(total_distances)
new_corrections = []
for error_class in predicted_error_classes:
    new_corrections.append(error_corrections[error_class])
new_corrections = np.array(new_corrections)

In [64]:
new_probabilities = model.predict_proba(validation_observations)
new_corrected_probabilities = new_probabilities + new_corrections
new_corrected_probabilities = np.clip(new_corrected_probabilities, 0.0, 1.0)

In [65]:
predicted_classes = np.empty([new_probabilities.shape[0]], dtype=bool)
corrected_predicted_classes = np.empty([new_probabilities.shape[0]], dtype=bool)
for i in range(len(new_probabilities)):
    pred_class = bool(np.argmax(new_probabilities[i, :]))
    predicted_classes[i] = pred_class    

    corrected_pred_class = bool(np.argmax(new_corrected_probabilities[i, :]))
    corrected_predicted_classes[i] = corrected_pred_class
    if new_corrections[i, 0]:
        print(new_corrections[i])
        print("GT:", validation_is_active[i], ", Uncorrected:", pred_class, ", Corrected:", corrected_pred_class)
    
got_it_right = np.logical_not(np.logical_xor(predicted_classes, validation_is_active))
got_it_right_corrected = np.logical_not(np.logical_xor(corrected_predicted_classes, validation_is_active))
print("Without corrections: ", np.sum(got_it_right), got_it_right.shape[0])
print("With corrections: ", np.sum(got_it_right_corrected), got_it_right.shape[0])

Without corrections:  720 941
With corrections:  720 941
