# Refining vectors in testing space

## Aims
1. Is there a better lens to look at the data with? It doesn't necessarily have to relate to the metric.
    1. We could apply dimensionality reduction (e.g. PCA or MDS) to the fingerprints and use the resulting 2D coordinates as the lens.
    1. Discretised lenses are problematic because their values naturally fall into bins, which complicates designing a cover (that's why I had to use lots of overlap yesterday).
1. What does the distance matrix mean? 
1. What is a meaningful thing to colour by? We could use "is active/inactive against target B"

## Pitfalls
1. Lots of drugs have been tested against only one target and therefore show up as identical under the metric. Is there a better way to tell such drugs apart?


In [75]:
import rdkit
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem
import kmapper as km
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from collections import Counter
import sys

In [76]:
# Threshold applied to BIOACT_PCHEMBL_VALUE: rows strictly above it are encoded
# as +1 in the drug/target matrix, all other rows as -1.
ACTIVITY_CUTOFF = 5.0
# ChEMBL target IDs of interest; drugs with a recorded result against either
# one are the ones kept for the distance matrices and the Mapper graph.
DESIRED_TARGET = "CHEMBL218"
DESIRED_TARGET2 = "CHEMBL4015"

In [77]:
# Load the curated bioactivity table (presumably a pickled pandas DataFrame).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("../data/processed/curated_set_with_publication_year.pd.pkl", "rb") as infile:
    df = pickle.load(infile)

# Count how many rows mention each target and each compound.
possible_targets = Counter(df["TGT_CHEMBL_ID"].tolist())
print(possible_targets.most_common(50))
possible_drugs = Counter(df["CMP_CHEMBL_ID"].tolist())

# Target x drug matrix, initialised to 0 everywhere (0 = no recorded result).
vector_df = pd.DataFrame(
    0,
    index=list(possible_targets),
    columns=list(possible_drugs),
    dtype=np.int8,
)

[('CHEMBL240', 4703), ('CHEMBL253', 3472), ('CHEMBL218', 2997), ('CHEMBL251', 2976), ('CHEMBL228', 2853), ('CHEMBL264', 2548), ('CHEMBL226', 2544), ('CHEMBL217', 2473), ('CHEMBL344', 2358), ('CHEMBL243', 2315), ('CHEMBL256', 2304), ('CHEMBL205', 2257), ('CHEMBL279', 2142), ('CHEMBL261', 2089), ('CHEMBL4235', 2020), ('CHEMBL244', 2010), ('CHEMBL222', 2003), ('CHEMBL233', 1998), ('CHEMBL4078', 1994), ('CHEMBL284', 1950), ('CHEMBL237', 1908), ('CHEMBL259', 1828), ('CHEMBL4822', 1799), ('CHEMBL3371', 1773), ('CHEMBL214', 1703), ('CHEMBL313', 1690), ('CHEMBL3594', 1678), ('CHEMBL203', 1659), ('CHEMBL224', 1643), ('CHEMBL4296', 1594), ('CHEMBL260', 1589), ('CHEMBL235', 1575), ('CHEMBL234', 1569), ('CHEMBL225', 1565), ('CHEMBL236', 1550), ('CHEMBL220', 1542), ('CHEMBL238', 1518), ('CHEMBL247', 1474), ('CHEMBL255', 1445), ('CHEMBL3952', 1424), ('CHEMBL2039', 1403), ('CHEMBL340', 1386), ('CHEMBL3242', 1380), ('CHEMBL204', 1347), ('CHEMBL5071', 1332), ('CHEMBL239', 1324), ('CHEMBL325', 1298), ('

In [78]:
# Encode every measurement in the target x drug matrix: +1 when the pChEMBL
# value is strictly above ACTIVITY_CUTOFF, -1 otherwise (including NaN values).
# If a drug/target pair occurs in several rows, the last row wins.
counted = 0
for drug, target, pchembl in zip(
    df["CMP_CHEMBL_ID"], df["TGT_CHEMBL_ID"], df["BIOACT_PCHEMBL_VALUE"]
):
    if not counted % 100001:
        print("Counted up to", counted)
    # Write with .at[row, column]. The chained form `vector_df[drug][target] = x`
    # assigns through a temporary Series and is not guaranteed to reach
    # vector_df itself (SettingWithCopy / Copy-on-Write semantics).
    vector_df.at[target, drug] = 1 if pchembl > ACTIVITY_CUTOFF else -1
    counted += 1

Counted up to 0
Counted up to 100001
Counted up to 200002
Counted up to 300003


In [79]:
def dissimilarity(vec1, vec2, metric="euclidean"):
    """
    Compute how far apart two equally long numeric vectors are.

    The notebook passes rows of the drug/target matrix, whose entries are
    +1, -1 or 0 stored as int8.

    Parameters
    ----------
    vec1, vec2 : array-like
        One-dimensional vectors of the same length.
    metric : str
        Case-insensitive name of the measure:
        - "euclidean": sqrt((vec1 - vec2) . (vec1 - vec2))
        - "cosine":    1 - vec1.vec2 / (|vec1| * |vec2|)
        - "tanimoto":  1 - vec1.vec2 / (vec1.vec1 + vec2.vec2 - vec1.vec2)

    Returns
    -------
    float
        The dissimilarity. For "cosine" and "tanimoto" the result is nan when
        the denominator is zero (e.g. an all-zero vector).

    Raises
    ------
    ValueError
        If `metric` is not one of the names listed above.
    """
    # Work in float64: with int8 inputs np.dot accumulates in int8 and silently
    # wraps around as soon as a dot product exceeds 127.
    vec1 = np.asarray(vec1, dtype=np.float64)
    vec2 = np.asarray(vec2, dtype=np.float64)

    metric = metric.lower()
    if metric == "euclidean":
        difference = vec1 - vec2
        return np.sqrt(np.dot(difference, difference))

    if metric == "cosine":
        norm_product = np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2))
        return 1.0 - np.dot(vec1, vec2) / norm_product

    if metric == "tanimoto":
        overlap = np.dot(vec1, vec2)
        return 1.0 - overlap / (np.dot(vec1, vec1) + np.dot(vec2, vec2) - overlap)

    # Previously an unknown name fell through and returned None.
    raise ValueError("Unknown metric: " + repr(metric))

In [89]:
# Keep only the drugs that have a recorded result (non-zero entry) against at
# least one of the two targets. After the transpose, rows of sub_df are drugs
# and columns are targets.
mask = vector_df.loc[DESIRED_TARGET].to_numpy() != 0
mask2 = vector_df.loc[DESIRED_TARGET2].to_numpy() != 0
either_target = np.logical_or(mask, mask2)
sub_df = vector_df.T.loc[either_target]
sub_df.shape

(4231, 1227)

In [90]:
# Pairwise Tanimoto dissimilarity between the drug vectors (rows of sub_df).
# The matrix is symmetric with a zero diagonal, so only the lower triangle is
# computed and then mirrored.
#
# Changes from the earlier version of this cell:
# - "distance_matrix.csv" is no longer opened for writing; nothing was ever
#   written to it, so the open only truncated any existing file.
# - the rows are extracted once, as float64, instead of calling .iloc twice per
#   pair; float64 also keeps the dot products from wrapping around in int8.
drug_vectors = sub_df.values.astype(np.float64)
n_drugs = len(drug_vectors)
distance_matrix = np.zeros([n_drugs, n_drugs])
for drug_index in range(n_drugs):
    if not drug_index % 100:
        print(drug_index)
    drug = drug_vectors[drug_index]
    for other_index in range(drug_index):
        distance = dissimilarity(drug, drug_vectors[other_index], "tanimoto")
        distance_matrix[drug_index, other_index] = distance
        distance_matrix[other_index, drug_index] = distance

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200


In [93]:
# Rows of df that were measured against either target of interest.
first_target = (df["TGT_CHEMBL_ID"] == DESIRED_TARGET) | (df["TGT_CHEMBL_ID"] == DESIRED_TARGET2)

# The fingerprint-based lens is later paired row-by-row with distance_matrix,
# whose rows follow sub_df.index (one row per drug). Taking the rows in df
# order does not guarantee that, so keep one row per drug and reorder them to
# match sub_df.index exactly.
rows_per_drug = df[first_target].drop_duplicates(subset="CMP_CHEMBL_ID")
row_positions = pd.Index(rows_per_drug["CMP_CHEMBL_ID"]).get_indexer(sub_df.index)
if (row_positions < 0).any():
    # get_indexer marks missing drugs with -1; iloc would silently read the last row.
    raise KeyError("Some drugs in sub_df have no row in df for the desired targets")
# .copy() makes fingerprint_df an independent frame, so adding columns to it
# later does not act on a slice of df.
fingerprint_df = rows_per_drug.iloc[row_positions].copy()

In [94]:


# Morgan fingerprints (radius 3) as bit vectors, one per row of fingerprint_df,
# computed once and reused for both the column and the plain list.
fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 3)
    for smiles in fingerprint_df["SMILES"]
]

if "FINGERPRINT" in fingerprint_df.columns:
    # Cell was re-run: overwrite the existing column. The previous fallback,
    # `fingerprint_df.loc["FINGERPRINT"] = ...`, addressed a *row* label
    # rather than the column.
    fingerprint_df["FINGERPRINT"] = fingerprints
else:
    fingerprint_df.insert(0, "FINGERPRINT", fingerprints)

# Same fingerprints in row order, as a plain list for the pairwise loop below.
fingerprint_data = list(fingerprints)

In [95]:
# Square matrix of Tanimoto dissimilarities (1 - similarity) between all
# fingerprints. It is symmetric with a zero diagonal, so each row is compared
# with the rows before it and the result is mirrored.
n_fingerprints = len(fingerprint_data)
sq_distance_matrix = np.zeros([n_fingerprints, n_fingerprints])
for row in range(1, n_fingerprints):
    # One bulk call per row replaces `row` separate TanimotoSimilarity calls
    # and returns the same similarities, in the order of the list passed in.
    similarities = rdkit.DataStructs.BulkTanimotoSimilarity(
        fingerprint_data[row], fingerprint_data[:row]
    )
    dissimilarities = 1.0 - np.asarray(similarities)
    sq_distance_matrix[row, :row] = dissimilarities
    sq_distance_matrix[:row, row] = dissimilarities

In [96]:
from sklearn.manifold import MDS
print(sq_distance_matrix.shape)
# Non-metric MDS on the precomputed fingerprint dissimilarities gives the 2D lens.
# MDS starts from a random configuration, so the seed is fixed to make the
# lens (and the Mapper graph built on it) reproducible between runs.
lens = MDS(
    n_components=2,
    dissimilarity="precomputed",
    metric=False,
    random_state=0,
).fit_transform(sq_distance_matrix)

(4231, 4231)


In [102]:
# Result of every kept drug against DESIRED_TARGET (+1, -1, or 0 = no result).
# Reading the column directly avoids re-transposing sub_df once per drug.
print(sub_df[DESIRED_TARGET].tolist())
# Colour value per drug: the number of targets it has a result for (count of
# non-zero entries in its row), scaled so the most-tested drug gets 1.0.
color_func = sub_df.abs().sum(axis=1).values
color_func = color_func / np.max(color_func)
color_func

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 

array([1.    , 0.125 , 0.125 , ..., 0.0625, 0.0625, 0.0625])

In [99]:
import hdbscan
from IPython.display import SVG, IFrame

# Build the Mapper graph: the 2D lens is covered by an 8 x 8 grid of
# overlapping cubes, and the points in each cube are clustered with HDBSCAN on
# the precomputed distance matrix.
mapper = km.KeplerMapper(verbose=1)
mapper_cover = km.Cover(n_cubes=[8, 8], perc_overlap=0.25)
mapper_clusterer = hdbscan.HDBSCAN(metric='precomputed', min_cluster_size=5, min_samples=3)
graph = mapper.map(
    lens,
    X=distance_matrix,
    precomputed=True,
    cover=mapper_cover,
    clusterer=mapper_clusterer,
)

# Write the interactive visualisation to disk, then embed it in the notebook.
html_path = "2019-04-05-mb-vectors-in-drug-testing-space-mds.html"
mapper.visualize(
    graph,
    path_html=html_path,
    title="Vectors in Drug Testing Space With MDS",
    color_function=color_func,
)
IFrame(html_path, 800, 600)

KeplerMapper(verbose=1)
Mapping on data shaped (4231, 4231) using lens shaped (4231, 2)

Creating 64 hypercubes.

Created 775 edges and 263 nodes in 0:00:00.686550.
Wrote visualization to: 2019-04-05-mb-vectors-in-drug-testing-space-mds.html
